import scrapy

from jd.items import JdItem


class JdpcSpider(scrapy.Spider):
    """Crawl JD.com book-search result pages and yield price/name items.

    Starts from the first results page for the keyword "书籍" (books,
    URL-encoded in ``start_urls``) and follows pagination until the page
    counter exceeds 51. Each product listing is emitted as a ``JdItem``
    and handed to the configured item pipelines.
    """

    name = 'jdpc'
    # First search-results page; the keyword "书籍" is percent-encoded.
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
    # Template for follow-up pages; {0} = "page" query value, {1} = "s" (item offset).
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
    # JD paginates with odd page numbers (step +2) and a 60-item offset step (+60)
    # — presumably because each visible "page" covers two internal pages; TODO confirm.
    page_num1 = 3   # next "page" query value to request
    page_num2 = 56  # next "s" (offset) query value to request

    def parse(self, response):
        """Extract price and name from each product <li>, then paginate.

        Args:
            response: the Scrapy HTTP response for one search-results page.

        Yields:
            JdItem: one item per product listing (fields ``price``, ``name``).
            scrapy.Request: at most one follow-up request for the next page.
        """
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")

        for li in lis:
            # extract_first() returns None instead of raising IndexError when
            # a node is absent (JD result lists contain ad/placeholder <li>
            # entries without the usual price/name structure).
            price = li.xpath("./div/div[2]/strong/i/text()").extract_first()
            name = li.xpath("./div/div[3]/a/em/text()").extract_first()
            if price is None or name is None:
                continue  # skip malformed listings instead of crashing the spider
            print(price, name)
            item = JdItem()
            item['price'] = price
            item['name'] = name
            yield item  # hand the item to the pipelines

        # Pagination sits OUTSIDE the item loop: schedule exactly one
        # follow-up request per response, not one per product.
        if self.page_num1 <= 51:
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 = self.page_num1 + 2
            self.page_num2 = self.page_num2 + 60
            yield scrapy.Request(new_url, callback=self.parse)