You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
79 lines
3.2 KiB
import scrapy
|
|
from jd.items import JdItem
|
|
|
|
class JdpcSpider(scrapy.Spider):
    """Crawl JD.com search results for books (keyword: 书籍).

    Starts from the first results page in ``start_urls`` and follows
    pagination by formatting ``url`` with JD's page/offset parameters.
    Each scraped name/price pair is emitted as a ``JdItem`` and handed
    to the configured item pipeline.
    """

    name = 'jdpc'

    # First search-results page (keyword=书籍, URL-encoded).
    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']

    # Pagination template: {0} -> page number, {1} -> result offset (s=).
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'

    # JD paginates with odd page numbers (1, 3, 5, ...) and an offset
    # that grows by 60 per page; start at the second page (page=3, s=56).
    page_num1 = 3
    page_num2 = 56

    def parse(self, response):
        """Extract name/price from each product <li>, then follow pagination.

        Args:
            response: the downloaded search-results page.

        Yields:
            JdItem: one item per product that has both a name and a price.
            scrapy.Request: the next results page while ``page_num1 <= 51``.
        """
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")

        for li in lis:
            # extract_first() instead of extract()[0]: some <li> entries
            # (ads / placeholders) lack these nodes, and extract()[0]
            # would raise IndexError and abort the whole page.
            price = li.xpath("./div/div[2]/strong/i/text()").extract_first()
            name = li.xpath("./div/div[3]/a/em/text()").extract_first()
            if price is None or name is None:
                continue
            print(price, name)

            item = JdItem()
            item['price'] = price
            item['name'] = name
            yield item  # hand the item to the pipeline

        # Requests are chained: each response schedules exactly one
        # follow-up, so mutating the class-level counters is safe here.
        # Format first (current page), then advance for the next response.
        if self.page_num1 <= 51:
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 = self.page_num1 + 2
            self.page_num2 = self.page_num2 + 60
            yield scrapy.Request(new_url, callback=self.parse)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|