ADD file via upload

master
p6mtf24ic 4 years ago
parent 7aa1afb112
commit a8550d2682

@@ -0,0 +1,60 @@
import scrapy
from jd.items import JdItem


class JdpcSpider(scrapy.Spider):
    # Spider name
    name = 'jdpc'
    # Starting URL: a JD search for "书籍" ("books")
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
    # URL template for follow-up pages; JD serves each visual result page as
    # two requests, so `page` advances by 2 and the result offset `s` by 60
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
    page_num1 = 3
    page_num2 = 56
    # Sequential item counter; starts at 1
    count = 1
    # Parse one search results page
    def parse(self, response):
        # Grab every product <li> in the results list
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")
        for li in lis:
            # Product price
            price = li.xpath("./div/div[2]/strong/i/text()").extract()[0]
            # Product name
            name = li.xpath("./div/div[3]/a/em/text()").extract()[0]
            # Shop name; join the extracted fragments into one string
            shop = ''.join(li.xpath("./div/div[6]/a/text()").extract())
            # Every page carries ads, and ads come back with an empty shop,
            # so drop entries whose shop name is too short
            if len(shop) > 2:
                print(self.count, price, name, shop)
                # Populate an item and hand it to the item pipeline
                item = JdItem()
                item['price'] = price
                item['name'] = name
                item['shop'] = shop
                item['count'] = self.count
                self.count += 1
                yield item
        # Pagination: stop once the `page` parameter would exceed 199
        if self.page_num1 <= 199:
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 += 2
            self.page_num2 += 60
            # Request the next page and re-enter self.parse on the response
            yield scrapy.Request(new_url, callback=self.parse)
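
The spider imports JdItem from jd.items, but this upload adds only the spider file. A minimal sketch of what that items.py would need to declare, inferred from the four fields the spider populates:

# jd/items.py -- minimal sketch; field set inferred from the spider above
import scrapy

class JdItem(scrapy.Item):
    price = scrapy.Field()  # product price
    name = scrapy.Field()   # product name
    shop = scrapy.Field()   # shop name
    count = scrapy.Field()  # sequential item number assigned by the spider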
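
Each yielded item goes to the item pipeline, which is also not part of this commit. A hypothetical pipeline that persists the items as JSON lines might look like the following; the class name JdPipeline and the output file books.jsonl are assumptions, not taken from the repository:

# jd/pipelines.py -- hypothetical sketch; writes each item as one JSON line
import json

class JdPipeline:
    def open_spider(self, spider):
        # Open the output file once, when the spider starts
        self.file = open('books.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese product names readable
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()

For items to reach a pipeline it must be enabled in settings.py, e.g. ITEM_PIPELINES = {'jd.pipelines.JdPipeline': 300}; the spider itself is then run with scrapy crawl jdpc.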