import scrapy

from jd.items import JdItem


class JdpcSpider(scrapy.Spider):
    name = 'jdpc'
    # allowed_domains = ['www.baidu.com']
    # start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&wq=%E4%B9%A6%E7%B1%8D&pvid=7e0642e9f0f44d4daebb57808162dc47&page=1&s=1&click=0']
    # url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=5ec1c4896438490c829018b723b2f994&page={0}&s={1}&click=0'
    # page_num1 = 3
    # page_num2 = 56

    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
    page_num1 = 3   # 'page' parameter of the next request (advances by 2 per results page)
    page_num2 = 56  # 's' parameter of the next request (advances by 60 per results page)

    # Data parsing: response is the response object for a successfully fetched request.
    def parse(self, response):
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")
        for li in lis:  # XPath positional indices start at 1
            price = li.xpath("./div/div[2]/strong/i/text()").extract()[0]
            name = li.xpath("./div/div[3]/a/em/text()").extract()[0]
            # pj = li.xpath("./div/div[5]/strong/a/text()").extract()[0]
            print(price, name)

            item = JdItem()
            item['price'] = price
            item['name'] = name
            # item['pj'] = pj
            yield item  # hand the item over to the pipeline

        # Request the next results page until the 'page' parameter exceeds 51.
        if self.page_num1 <= 51:
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 += 2
            self.page_num2 += 60
            yield scrapy.Request(new_url, callback=self.parse)

    # Alternative parse that also extracts shop, author, and press:
    # def parse(self, response):
    #     div_list = response.xpath('//ul[@class="gl-warp clearfix"]/li')
    #     # data = []
    #     for div in div_list:
    #         item = JdItem()  # must instantiate the class, not assign it
    #         name = div.xpath('./div[@class="gl-i-wrap"]/div[3]/a/em/text()').extract()
    #         price = div.xpath('./div[@class="gl-i-wrap"]/div[2]/strong/i/text()').extract()
    #         shop = div.xpath('./div[@class="gl-i-wrap"]/div[7]/span/a/text()').extract()
    #         author = div.xpath('./div[@class="gl-i-wrap"]/div[4]/span[1]/a/text()').extract()
    #         press = div.xpath('./div[@class="gl-i-wrap"]/div[4]/span[2]/a/text()').extract()
    #         name = ''.join(name)
    #         price = ''.join(price)
    #         # shop = ''.join(shop)
    #         author = ''.join(author)
    #         press = ''.join(press)
    #         print(name, price)
    #         item["name"] = name
    #         item["price"] = price
    #         yield item

    # Alternative parse for command-line persistence:
    # def parse(self, response):
    #     lis = response.xpath("//div[@id='J_goodsList']/ul/li")
    #     all_data = []
    #     for li in lis:  # XPath positional indices start at 1
    #         price = li.xpath("./div/div[2]/strong/i/text()").extract()[0]
    #         name = li.xpath("./div/div[3]/a/em/text()").extract()[0]
    #         print(price, name)
    #         dic = {
    #             'price': price,
    #             'name': name
    #         }
    #         all_data.append(dic)
    #     # command-line persistence: scrapy crawl jdpc -o ./jdpc.csv
    #     return all_data
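
# For reference, a minimal jd/items.py consistent with the fields assigned
# above. This is an assumed sketch, not the project's actual file; it declares
# only the two fields the spider uses, plus the commented-out 'pj' field one
# would add to store the review count.
#
# import scrapy
#
# class JdItem(scrapy.Item):
#     name = scrapy.Field()
#     price = scrapy.Field()
#     # pj = scrapy.Field()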
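
# Likewise, a hypothetical jd/pipelines.py showing where the yielded items end
# up. The class name, output path, and format are assumptions; to activate the
# pipeline, register it in settings.py, e.g.
# ITEM_PIPELINES = {'jd.pipelines.JdPipeline': 300}.
#
# class JdPipeline:
#     fp = None
#
#     def open_spider(self, spider):
#         # called once when the spider starts: open the output file
#         self.fp = open('./jdpc.txt', 'w', encoding='utf-8')
#
#     def process_item(self, item, spider):
#         # called for every item the spider yields
#         self.fp.write(item['price'] + '\t' + item['name'] + '\n')
#         return item  # pass the item on to any later pipeline
#
#     def close_spider(self, spider):
#         # called once when the spider finishes: close the file
#         self.fp.close()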