ADD file via upload

master
p6mtf24ic 4 years ago
parent 7aa1afb112
commit a8550d2682

@@ -0,0 +1,60 @@
import scrapy
from jd.items import JdItem


class JdpcSpider(scrapy.Spider):
    # Spider name
    name = 'jdpc'
    # Starting URL: a JD search for "书籍" ("books")
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
    # URL template for follow-up pages; JD serves each visual result page as
    # two requests, so `page` advances by 2 and the result offset `s` by 60
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
    page_num1 = 3
    page_num2 = 56
    # Sequential item counter; starts at 1
    count = 1
    # Parse one search results page
    def parse(self, response):
        # Grab every product <li> in the results list
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")
        for li in lis:
            # Product price
            price = li.xpath("./div/div[2]/strong/i/text()").extract()[0]
            # Product name
            name = li.xpath("./div/div[3]/a/em/text()").extract()[0]
            # Shop name; join the extracted fragments into one string
            shop = ''.join(li.xpath("./div/div[6]/a/text()").extract())
            # Every page carries ads, and ads come back with an empty shop,
            # so drop entries whose shop name is too short
            if len(shop) > 2:
                print(self.count, price, name, shop)
                # Populate an item and hand it to the item pipeline
                item = JdItem()
                item['price'] = price
                item['name'] = name
                item['shop'] = shop
                item['count'] = self.count
                self.count += 1
                yield item
        # Pagination: stop once the `page` parameter would exceed 199
        if self.page_num1 <= 199:
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 += 2
            self.page_num2 += 60
            # Request the next page and re-enter self.parse on the response
            yield scrapy.Request(new_url, callback=self.parse)
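
The spider imports JdItem from jd.items, but this upload adds only the spider file. A minimal sketch of what that items.py would need to declare, inferred from the four fields the spider populates:

# jd/items.py -- minimal sketch; field set inferred from the spider above
import scrapy

class JdItem(scrapy.Item):
    price = scrapy.Field()  # product price
    name = scrapy.Field()   # product name
    shop = scrapy.Field()   # shop name
    count = scrapy.Field()  # sequential item number assigned by the spider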
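
Each yielded item goes to the item pipeline, which is also not part of this commit. A hypothetical pipeline that persists the items as JSON lines might look like the following; the class name JdPipeline and the output file books.jsonl are assumptions, not taken from the repository:

# jd/pipelines.py -- hypothetical sketch; writes each item as one JSON line
import json

class JdPipeline:
    def open_spider(self, spider):
        # Open the output file once, when the spider starts
        self.file = open('books.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese product names readable
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()

For items to reach a pipeline it must be enabled in settings.py, e.g. ITEM_PIPELINES = {'jd.pipelines.JdPipeline': 300}; the spider itself is then run with scrapy crawl jdpc.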