parent
e42c7c7f96
commit
409d790c6b
@ -1,78 +0,0 @@
|
||||
import scrapy
|
||||
from jd.items import JdItem
|
||||
|
||||
class JdpcSpider(scrapy.Spider):
    """Spider that crawls JD.com search-result pages for the keyword 书籍 (books).

    Starts from the first results page, extracts the price and title of every
    listed product, yields them as ``JdItem``s (handed to the item pipeline),
    and follows pagination up to page 51.  JD encodes pagination as an odd
    ``page`` number stepping by 2 and an item offset ``s`` growing by 60.
    """

    name = 'jdpc'
    # allowed_domains intentionally unset so paginated requests are not filtered.

    start_urls = ['https://search.jd.com/Search?keyword=%E4%B9%A6%E7%B1%8D&enc=utf-8&wq=%E4%B9%A6%E7%B1%8D&pvid=15ed5ab337ca4f00a23ea5584cb872bd']
    # URL template for follow-up pages: {0} = page number, {1} = item offset.
    url = 'https://search.jd.com/Search?keyword=书籍&wq=书籍&pvid=15ed5ab337ca4f00a23ea5584cb872bd&page={0}&s={1}&click=0'
    page_num1 = 3   # next page number to request (odd series: 3, 5, 7, ...)
    page_num2 = 56  # next `s` offset (grows by 60 per page)

    def parse(self, response):
        """Parse one search-results page.

        Yields one ``JdItem`` per product found in ``#J_goodsList`` and,
        while ``page_num1 <= 51``, a follow-up ``Request`` for the next page
        that is parsed by this same callback.

        NOTE(review): mutating the class-level counters on ``self`` relies on
        Scrapy creating a single spider instance per crawl — confirm that
        assumption holds if this spider is ever run differently.
        """
        lis = response.xpath("//div[@id='J_goodsList']/ul/li")
        for li in lis:  # XPath child indices start at 1
            # extract_first() returns None instead of raising IndexError when
            # a tile (e.g. an ad/promo slot) has no price or title node.
            price = li.xpath("./div/div[2]/strong/i/text()").extract_first()
            name = li.xpath("./div/div[3]/a/em/text()").extract_first()
            if price is None or name is None:
                continue
            # pj = li.xpath("./div/div[5]/strong/a/text()").extract_first()
            print(price, name)
            item = JdItem()
            item['price'] = price
            item['name'] = name
            # item['pj'] = pj
            yield item  # hand the item to the configured pipeline

        if self.page_num1 <= 51:
            new_url = self.url.format(self.page_num1, self.page_num2)
            self.page_num1 += 2
            self.page_num2 += 60
            yield scrapy.Request(new_url, callback=self.parse)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in new issue