You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
79 lines
3.2 KiB
import scrapy
|
|
from jd.items import JdItem
|
|
|
|
class JdpcSpider(scrapy.Spider):
    """Crawl JD.com search results for books (keyword: 书籍).

    Starts from the first results page in ``start_urls`` and follows
    pagination by formatting ``url`` with JD's page/offset parameters.
    Each scraped name/price pair is emitted as a ``JdItem`` and handed
    to the configured item pipeline.
    """

    name = 'jdpc'

    # First search-results page (keyword=书籍, URL-encoded).
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']

    # Pagination template: {0} -> page number, {1} -> result offset (s=).
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'

    # JD paginates with odd page numbers (1, 3, 5, ...) and an offset
    # that grows by 60 per page; start at the second page (page=3, s=56).
    page_num1 = 3
    page_num2 = 56

    def parse(self, response):
        """Extract name/price from each product <li>, then follow pagination.

        Args:
            response: the downloaded search-results page.

        Yields:
            JdItem: one item per product that has both a name and a price.
            scrapy.Request: the next results page while ``page_num1 <= 51``.
        """
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")

        for li in lis:
            # extract_first() instead of extract()[0]: some <li> entries
            # (ads / placeholders) lack these nodes, and extract()[0]
            # would raise IndexError and abort the whole page.
            price = li.xpath("./div/div[2]/strong/i/text()").extract_first()
            name = li.xpath("./div/div[3]/a/em/text()").extract_first()
            if price is None or name is None:
                continue
            print(price, name)

            item = JdItem()
            item['price'] = price
            item['name'] = name
            yield item  # hand the item to the pipeline

        # Requests are chained: each response schedules exactly one
        # follow-up, so mutating the class-level counters is safe here.
        # Format first (current page), then advance for the next response.
        if self.page_num1 <= 51:
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 = self.page_num1 + 2
            self.page_num2 = self.page_num2 + 60
            yield scrapy.Request(new_url, callback=self.parse)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|