# one/jdpc.py

import scrapy
from jd.items import JdItem

class JdpcSpider(scrapy.Spider):
    """Spider that scrapes book listings from JD.com search-result pages.

    Starting from ``start_urls``, each response is parsed into ``JdItem``
    objects (price, name, shop, running count) and a request for the next
    page is scheduled until ``page_num1`` exceeds ``max_page``.
    """

    # Spider name.
    name = 'jdpc'
    # Entry point: the first search-result page.
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
    # Template for follow-up pages: {0} fills ``page``, {1} fills ``s``.
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
    # ``page`` value for the next follow-up request; advanced by 2 per request.
    page_num1 = 3
    # ``s`` value for the next follow-up request; advanced by 60 per request.
    page_num2 = 56
    # Largest ``page`` value that is still requested.
    max_page = 199
    # 1-based sequence number assigned to the next yielded item.
    count = 1

    def parse(self, response):
        """Extract items from one result page, then schedule the next page.

        Args:
            response: The downloaded search-result page.

        Yields:
            JdItem: One item per listing whose shop text is longer than two
                characters and that has both a price and a name.
            scrapy.Request: A request for the next result page, handled by
                this same method, while ``page_num1 <= max_page``.
        """
        # Every <li> under the goods list is one listing.
        for li in response.xpath("//div[@id='J_goodsList']/ul/li"):
            # The shop name may be split across several text nodes; merge them.
            shop = ''.join(li.xpath("./div/div[6]/a/text()").extract())
            # Advertisement entries come back with an empty shop; drop them.
            if len(shop) <= 2:
                continue

            # extract_first() returns None instead of raising IndexError when
            # the node is missing, so one malformed listing cannot abort the
            # whole page (and with it the pagination request below).
            price = li.xpath("./div/div[2]/strong/i/text()").extract_first()
            title = li.xpath("./div/div[3]/a/em/text()").extract_first()
            if price is None or title is None:
                self.logger.warning(
                    "Skipping listing without price or name on %s", response.url
                )
                continue

            self.logger.info("%s %s %s %s", self.count, price, title, shop)

            # Build the item that is handed to the item pipeline.
            item = JdItem()
            item['price'] = price
            item['name'] = title
            item['shop'] = shop
            # Store the number before advancing it so the item carries the
            # same sequence number that was just logged.
            item['count'] = self.count
            self.count += 1
            yield item

        # Pagination: request the next page until max_page has been used.
        if self.page_num1 <= self.max_page:
            new_url = self.url.format(self.page_num1, self.page_num2)
            # Advance the paging state for the following request. The item
            # counter is deliberately left alone so numbering stays gap-free.
            self.page_num1 += 2
            self.page_num2 += 60
            # Parse the next page with this same callback.
            yield scrapy.Request(new_url, callback=self.parse)