|
|
|
|
@ -1,60 +0,0 @@
|
|
|
|
|
import scrapy
|
|
|
|
|
from jd.items import JdItem
|
|
|
|
|
|
|
|
|
|
class JdpcSpider(scrapy.Spider):
    """Spider that scrapes book listings (price, name, shop) from JD search pages.

    Starts from a fixed search-results URL for the keyword "书籍" (books),
    yields one ``JdItem`` per real (non-advertisement) product, and follows
    pagination until JD's ``page`` parameter exceeds 199.
    """

    # Spider name used by `scrapy crawl jdpc`.
    name = 'jdpc'

    # First search-results page (keyword is the URL-encoded form of "书籍").
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']

    # Template for subsequent pages; {0} fills the `page` param, {1} the `s` offset.
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'

    page_num1 = 3   # JD's `page` parameter advances in steps of 2 (odd pages)
    page_num2 = 56  # JD's `s` result-offset parameter advances in steps of 60
    count = 1       # running counter of scraped (non-ad) items

    def parse(self, response):
        """Parse one search-results page.

        Yields a ``JdItem`` for every non-advertisement product on the page,
        then yields a ``scrapy.Request`` for the next page while pagination
        remains within bounds.
        """
        # All product <li> entries on the page.
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")

        for li in lis:
            # extract_first() returns None instead of raising IndexError
            # when the node is missing (e.g. on malformed/ad entries).
            price = li.xpath("./div/div[2]/strong/i/text()").extract_first()
            name = li.xpath("./div/div[3]/a/em/text()").extract_first()

            # The shop name may be split across several text nodes; join them.
            shop = ''.join(li.xpath("./div/div[6]/a/text()").extract())

            # Every page carries ads whose shop field comes back empty —
            # skip those (and any entry missing a price or name).
            if len(shop) > 2 and price is not None and name is not None:
                print(self.count, price, name, shop)

                item = JdItem()
                item['price'] = price
                item['name'] = name
                item['shop'] = shop
                # Record the counter BEFORE incrementing so the stored value
                # matches the number just printed for this item.
                item['count'] = self.count
                self.count += 1

                yield item  # hand the item to the pipeline

        # Pagination: JD uses odd `page` values (1, 3, 5, ...) and an `s`
        # offset growing by 60 per logical page.
        if self.page_num1 <= 199:
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 += 2
            self.page_num2 += 60
            # NOTE(review): the item counter must NOT be bumped here — doing
            # so once per page (as before) skews the per-item numbering.
            # Re-enter parse() on the next page.
            yield scrapy.Request(new_url, callback=self.parse)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|