Spider file

branch master · commit 90db876a99 · parent 622d41e1ef
p6mtf24ic committed 4 years ago

@@ -0,0 +1,78 @@
import scrapy
from jd.items import JdItem


class JdpcSpider(scrapy.Spider):
    name = 'jdpc'
    # allowed_domains = ['www.baidu.com']
    # start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&wq=%E4%B9%A6%E7%B1%8D&pvid=7e0642e9f0f44d4daebb57808162dc47&page=1&s=1&click=0']
    # url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=5ec1c4896438490c829018b723b2f994&page={0}&s={1}&click=0'
    # page_num1 = 3
    # page_num2 = 56
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
    # URL template for follow-up pages: JD search advances `page` by 2 and `s` by 60 per page.
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
    page_num1 = 3   # next value for the `page` parameter
    page_num2 = 56  # next value for the `s` (result offset) parameter

    def parse(self, response):
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")
        for li in lis:  # XPath child indices start at 1
            price = li.xpath("./div/div[2]/strong/i/text()").extract()[0]
            name = li.xpath("./div/div[3]/a/em/text()").extract()[0]
            # pj = li.xpath("./div/div[5]/strong/a/text()").extract()[0]
            print(price, name)
            item = JdItem()
            item['price'] = price
            item['name'] = name
            # item['pj'] = pj
            yield item  # hand the item off to the item pipeline
        if self.page_num1 <= 51:  # stop after page 51
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 = self.page_num1 + 2
            self.page_num2 = self.page_num2 + 60
            yield scrapy.Request(new_url, callback=self.parse)
    # Earlier version: per-field extraction including shop, author, and press.
    # def parse(self, response):
    #     div_list = response.xpath('//ul[@class="gl-warp clearfix"]/li')
    #     # data = []
    #     for div in div_list:
    #         item = JdItem()
    #         name = div.xpath('./div[@class="gl-i-wrap"]/div[3]/a/em/text()').extract()
    #         price = div.xpath('./div[@class="gl-i-wrap"]/div[2]/strong/i/text()').extract()
    #         shop = div.xpath('./div[@class="gl-i-wrap"]/div[7]/span/a/text()').extract()
    #         author = div.xpath('./div[@class="gl-i-wrap"]/div[4]/span[1]/a/text()').extract()
    #         press = div.xpath('./div[@class="gl-i-wrap"]/div[4]/span[2]/a/text()').extract()
    #         name = ''.join(name)
    #         price = ''.join(price)
    #         # shop = ''.join(shop)
    #         author = ''.join(author)
    #         press = ''.join(press)
    #         print(name, price)
    #         item["name"] = name
    #         item["price"] = price
    #         yield item
    # Data parsing: `response` is the response object returned for a successful request.
    # Earlier version returning plain dicts for command-line persistence.
    # def parse(self, response):
    #     lis = response.xpath("//div[@id='J_goodsList']/ul/li")
    #     all_data = []
    #     for li in lis:  # XPath child indices start at 1
    #         price = li.xpath("./div/div[2]/strong/i/text()").extract()[0]
    #         name = li.xpath("./div/div[3]/a/em/text()").extract()[0]
    #         print(price, name)
    #         dic = {
    #             'price': price,
    #             'name': name
    #         }
    #         all_data.append(dic)
    #     # command-line persistence: scrapy crawl jdpc -o ./jdpc.csv
    #     return all_data
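
The spider imports JdItem from jd.items, but that module is not part of this diff. A minimal sketch of what jd/items.py presumably looks like, assuming only the fields the spider actually assigns (the commented-out pj field is included to match the commented-out extraction):

import scrapy


class JdItem(scrapy.Item):
    # Fields populated by JdpcSpider.parse().
    name = scrapy.Field()   # product title text
    price = scrapy.Field()  # listed price string
    # pj = scrapy.Field()   # rating/review text, used only by the commented-out code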
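
The `yield item` comment says items are handed to the pipeline, but no pipeline appears in this commit either. A hypothetical jd/pipelines.py that appends each item to a CSV file; the class name and output path are assumptions for illustration, not part of this repo:

import csv


class JdPipeline:
    # Hypothetical pipeline: write each scraped item to a CSV file.

    def open_spider(self, spider):
        # Open the output file once when the spider starts.
        self.file = open('jdpc_pipeline.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file)
        self.writer.writerow(['price', 'name'])  # header row

    def process_item(self, item, spider):
        self.writer.writerow([item['price'], item['name']])
        return item  # pass the item along to any later pipelines

    def close_spider(self, spider):
        self.file.close()

For this to run it must be enabled in jd/settings.py, e.g. ITEM_PIPELINES = {'jd.pipelines.JdPipeline': 300}. Alternatively, the feed export shown in the last commented block (scrapy crawl jdpc -o ./jdpc.csv) persists yielded items without any custom pipeline.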