Spider file

branch master · commit 90db876a99 · parent 622d41e1ef
p6mtf24ic committed 4 years ago

@@ -0,0 +1,78 @@
import scrapy
from jd.items import JdItem


class JdpcSpider(scrapy.Spider):
    name = 'jdpc'
    # allowed_domains = ['www.baidu.com']
    # start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&wq=%E4%B9%A6%E7%B1%8D&pvid=7e0642e9f0f44d4daebb57808162dc47&page=1&s=1&click=0']
    # url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=5ec1c4896438490c829018b723b2f994&page={0}&s={1}&click=0'
    # page_num1 = 3
    # page_num2 = 56
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
    # URL template for follow-up pages: JD search advances `page` by 2 and `s` by 60 per page.
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
    page_num1 = 3   # next value for the `page` parameter
    page_num2 = 56  # next value for the `s` (result offset) parameter

    def parse(self, response):
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")
        for li in lis:  # XPath child indices start at 1
            price = li.xpath("./div/div[2]/strong/i/text()").extract()[0]
            name = li.xpath("./div/div[3]/a/em/text()").extract()[0]
            # pj = li.xpath("./div/div[5]/strong/a/text()").extract()[0]
            print(price, name)
            item = JdItem()
            item['price'] = price
            item['name'] = name
            # item['pj'] = pj
            yield item  # hand the item off to the item pipeline
        if self.page_num1 <= 51:  # stop after page 51
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 = self.page_num1 + 2
            self.page_num2 = self.page_num2 + 60
            yield scrapy.Request(new_url, callback=self.parse)
    # Earlier version: per-field extraction including shop, author, and press.
    # def parse(self, response):
    #     div_list = response.xpath('//ul[@class="gl-warp clearfix"]/li')
    #     # data = []
    #     for div in div_list:
    #         item = JdItem()
    #         name = div.xpath('./div[@class="gl-i-wrap"]/div[3]/a/em/text()').extract()
    #         price = div.xpath('./div[@class="gl-i-wrap"]/div[2]/strong/i/text()').extract()
    #         shop = div.xpath('./div[@class="gl-i-wrap"]/div[7]/span/a/text()').extract()
    #         author = div.xpath('./div[@class="gl-i-wrap"]/div[4]/span[1]/a/text()').extract()
    #         press = div.xpath('./div[@class="gl-i-wrap"]/div[4]/span[2]/a/text()').extract()
    #         name = ''.join(name)
    #         price = ''.join(price)
    #         # shop = ''.join(shop)
    #         author = ''.join(author)
    #         press = ''.join(press)
    #         print(name, price)
    #         item["name"] = name
    #         item["price"] = price
    #         yield item
    # Data parsing: `response` is the response object returned for a successful request.
    # Earlier version returning plain dicts for command-line persistence.
    # def parse(self, response):
    #     lis = response.xpath("//div[@id='J_goodsList']/ul/li")
    #     all_data = []
    #     for li in lis:  # XPath child indices start at 1
    #         price = li.xpath("./div/div[2]/strong/i/text()").extract()[0]
    #         name = li.xpath("./div/div[3]/a/em/text()").extract()[0]
    #         print(price, name)
    #         dic = {
    #             'price': price,
    #             'name': name
    #         }
    #         all_data.append(dic)
    #     # command-line persistence: scrapy crawl jdpc -o ./jdpc.csv
    #     return all_data
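
The spider imports JdItem from jd.items, but that module is not part of this diff. A minimal sketch of what jd/items.py presumably looks like, assuming only the fields the spider actually assigns (the commented-out pj field is included to match the commented-out extraction):

import scrapy


class JdItem(scrapy.Item):
    # Fields populated by JdpcSpider.parse().
    name = scrapy.Field()   # product title text
    price = scrapy.Field()  # listed price string
    # pj = scrapy.Field()   # rating/review text, used only by the commented-out code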
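
The `yield item` comment says items are handed to the pipeline, but no pipeline appears in this commit either. A hypothetical jd/pipelines.py that appends each item to a CSV file; the class name and output path are assumptions for illustration, not part of this repo:

import csv


class JdPipeline:
    # Hypothetical pipeline: write each scraped item to a CSV file.

    def open_spider(self, spider):
        # Open the output file once when the spider starts.
        self.file = open('jdpc_pipeline.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file)
        self.writer.writerow(['price', 'name'])  # header row

    def process_item(self, item, spider):
        self.writer.writerow([item['price'], item['name']])
        return item  # pass the item along to any later pipelines

    def close_spider(self, spider):
        self.file.close()

For this to run it must be enabled in jd/settings.py, e.g. ITEM_PIPELINES = {'jd.pipelines.JdPipeline': 300}. Alternatively, the feed export shown in the last commented block (scrapy crawl jdpc -o ./jdpc.csv) persists yielded items without any custom pipeline.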