You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

61 lines
2.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import scrapy
from jd.items import JdItem
class JdpcSpider(scrapy.Spider):
    """Spider that walks JD.com search-result pages and emits one item per product.

    The first page comes from ``start_urls``; every later page is built from
    the ``url`` template using the paging counters stored on the spider.
    """

    # Spider name.
    name = 'jdpc'
    # Starting URL (first search-result page).
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
    # Template for follow-up pages: {0} fills the ``page`` query parameter,
    # {1} fills the ``s`` query parameter.
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
    # Value of ``page`` for the next request; advanced by 2 after each request.
    page_num1 = 3
    # Value of ``s`` for the next request; advanced by 60 after each request.
    page_num2 = 56
    # Serial number given to the next product that is emitted (starts at 1).
    count = 1

    def parse(self, response):
        """Extract products from one result page, then request the next page.

        Yields:
            ``JdItem`` objects with ``price``, ``name``, ``shop`` and ``count``
            set, followed by at most one ``scrapy.Request`` for the next page
            (only while ``page_num1`` is still <= 199).
        """
        # Every <li> under the goods list is one search result.
        for li in response.xpath("//div[@id='J_goodsList']/ul/li"):
            # The shop name may be split across several text nodes; join them.
            shop = ''.join(li.xpath("./div/div[6]/a/text()").extract())
            # Per the original author's note, advertisement entries come back
            # with an empty shop name, so they are filtered out here. This
            # check runs first so an ad with missing fields cannot break the
            # rest of the page.
            if len(shop) <= 2:
                continue

            # extract_first() returns None instead of raising IndexError when
            # the node is absent, so one malformed entry no longer aborts the
            # whole callback (and with it the pagination below).
            price = li.xpath("./div/div[2]/strong/i/text()").extract_first()
            title = li.xpath("./div/div[3]/a/em/text()").extract_first()
            if price is None or title is None:
                continue

            print(self.count, price, title, shop)

            # Build the item that is handed to the item pipeline.
            item = JdItem()
            item['price'] = price
            item['name'] = title
            item['shop'] = shop
            # Store the same serial number that was just printed, then advance
            # the counter for the next product.
            item['count'] = self.count
            self.count += 1
            yield item

        # Pagination: schedule the next page once this page has been processed.
        if self.page_num1 <= 199:
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 += 2
            self.page_num2 += 60
            # Parse the next page with this same callback.
            yield scrapy.Request(new_url, callback=self.parse)