diff --git a/jdpc.py b/jdpc.py
new file mode 100644
index 0000000..d645235
--- /dev/null
+++ b/jdpc.py
@@ -0,0 +1,45 @@
+import scrapy
+from jd.items import JdItem
+
+class JdpcSpider(scrapy.Spider):
+    # spider name
+    name = 'jdpc'
+    # starting URL: JD search results for "书籍" ("books")
+    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
+    # URL template for subsequent pages
+    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
+    page_num1 = 3   # JD's page parameter advances by 2 per result page
+    page_num2 = 56  # JD's s (offset) parameter advances by 60 per result page
+    count = 1
+
+    # parse one search-result page
+    def parse(self, response):
+        # grab every product <li> in the result list
+        lis = response.xpath("//div[@id='J_goodsList']/ul/li")
+        for li in lis:
+            # product price
+            price = li.xpath("./div/div[2]/strong/i/text()").extract_first()
+            # product name
+            name = li.xpath("./div/div[3]/a/em/text()").extract_first()
+            # shop name (may span several text nodes, so join them into one string)
+            shop = ''.join(li.xpath("./div/div[6]/a/text()").extract())
+            # every page carries ads, and ads scrape with an empty shop field;
+            # skip those entries
+            if shop:
+                print(self.count, price, name, shop)
+                # build an item and hand it to the pipeline
+                item = JdItem()
+                item['price'] = price
+                item['name'] = name
+                item['shop'] = shop
+                item['count'] = self.count
+                self.count += 1
+                yield item
+        # pagination
+        if self.page_num1 <= 199:
+            # build the next page's URL
+            new_url = self.url.format(self.page_num1, self.page_num2)
+            self.page_num1 += 2
+            self.page_num2 += 60
+            # re-enter self.parse to handle the next page
+            yield scrapy.Request(new_url, callback=self.parse)
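The spider imports JdItem from jd.items, which is not part of this diff. A minimal sketch of what that module presumably declares, assuming it contains exactly the four fields the spider populates (price, name, shop, count) and nothing else:

# jd/items.py -- hypothetical sketch; only the field names are taken from the spider above
import scrapy

class JdItem(scrapy.Item):
    price = scrapy.Field()  # product price text
    name = scrapy.Field()   # product name
    shop = scrapy.Field()   # shop name
    count = scrapy.Field()  # running item counter assigned by the spider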
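Likewise, "yield item" hands each item to whatever pipeline is enabled in settings.py, and that code is also outside this diff. A minimal sketch of a pipeline that would persist these items as JSON Lines; the module path jd/pipelines.py and the output file name books.jl are assumptions, not part of this change:

# jd/pipelines.py -- hypothetical sketch, not part of this diff
import json

class JdPipeline:
    def open_spider(self, spider):
        # books.jl is an assumed output name (JSON Lines, one item per line)
        self.file = open('books.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()

With such a pipeline registered under ITEM_PIPELINES in settings.py, the spider runs as usual with: scrapy crawl jdpc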