|
|
import scrapy
|
|
|
import json
|
|
|
import re
|
|
|
from spider.spiders.Redis_Con import Redis_Con
|
|
|
|
|
|
|
|
|
class DoubanSpider(scrapy.Spider):
    """Spider for Douban movie-chart JSON endpoints (movie.douban.com).

    Start URLs are fetched from a Redis queue via ``Redis_Con``.  Each
    response body is a JSON array of movie records; every record is parsed
    into a dict item, and pagination continues (``start`` advanced by 1000)
    until the endpoint answers with an empty array (``'[]'``).
    """

    name = 'douban'
    allowed_domains = ['movie.douban.com']

    def start_requests(self):
        """Pull one chart URL from the Redis queue and request it.

        ``dont_filter=True`` because the URL comes from an external queue
        and must not be dropped by Scrapy's duplicate filter.
        """
        f = Redis_Con()
        url = f.geturl()
        print('当前请求的URL----->'+url)
        yield scrapy.http.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Parse one chart page: yield one item per movie record, then
        schedule the next page if this one contained data.
        """
        result = response.xpath('/html/body/p/text()').extract()
        print(result)

        # Guard: nothing was extracted, or the endpoint returned an empty
        # JSON array -> no items on this page and no further pagination.
        # (The original indexed result[0] unconditionally and could raise
        # IndexError on an empty extraction.)
        if not result or result[0] == '[]':
            return

        # Undo the extra backslash escaping introduced by str()-ing the
        # extracted list, then pull out each top-level JSON object.
        # NOTE(review): r'{.*?}' assumes records contain no nested objects;
        # true for this endpoint today, but fragile — confirm if the API
        # response shape changes.
        fin = re.findall(r'{.*?}', str(result).replace('\\\\', '\\')[1:-1])

        # Both fields depend only on the request URL, so compute them once
        # instead of once per record.
        chart_type = re.findall('type=.*?&', response.url)[0][5:-1]
        # Popularity interval (e.g. '100:90'), used downstream to
        # distinguish charts.
        interval = re.findall('interval_id=.*?&', response.url)[0][12:-1]

        for x in fin:
            print(x)
            d_json = json.loads(x)
            need = {
                '评分': d_json['score'],
                '电影名称': d_json['title'],
                '排名': d_json['rank'],
                '类型': d_json['types'],
                '演员': d_json['actors'],
                '国家': d_json['regions'],
                '上映时间': d_json['release_date'],
                'type': chart_type,
                '喜爱区间': interval,
            }
            print('###############yield need is OK ###############')
            yield need

        # This page had data: build the next page's URL (start += 1000)
        # and request it, re-entering parse() on completion.
        nexturl = self.replace_page(response.url)
        yield scrapy.Request(nexturl, callback=self.parse)

    def replace_page(self, s):
        """Return *s* with the value of its ``start=`` query parameter
        advanced by 1000.

        Unlike the previous ``findall('start=.*?&')`` approach, this also
        works when ``start`` is the last query parameter (no trailing
        ``&``).  If *s* carries no numeric ``start=`` parameter it is
        returned unchanged instead of raising IndexError.
        """
        return re.sub(
            r'start=(\d+)',
            lambda m: 'start=' + str(int(m.group(1)) + 1000),
            s,
            count=1,
        )
|
|
|
|
|
|
|