import json
import re

import scrapy

from spider.spiders.Redis_Con import Redis_Con


class DoubanSpider(scrapy.Spider):
    """Crawl Douban's movie top_list JSON API page by page.

    Start URLs are handed out by a shared Redis queue (``Redis_Con``), so
    several spider processes can split the chart-type / favour-interval
    combinations between them.  Each parsed page yields one dict per movie
    and, while the page is non-empty, schedules the next page
    (``start`` advanced by 1000).
    """

    name = 'douban'
    allowed_domains = ['movie.douban.com']

    def start_requests(self):
        """Fetch the next URL to crawl from the shared Redis queue."""
        feeder = Redis_Con()
        url = feeder.geturl()
        print('当前请求的URL----->'+url)
        # dont_filter: the same URL pattern may legitimately be re-queued
        # by the Redis feeder, so bypass Scrapy's duplicate filter.
        yield scrapy.http.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Parse one JSON page of the top_list API.

        The JSON array ends up as the text of ``<body><p>``; it is pulled
        out via XPath, un-escaped, and split into individual ``{...}``
        movie records with a regex.

        Yields:
            dict: one record per movie, plus a follow-up Request for the
            next page while the current page has data.
        """
        result = response.xpath('/html/body/p/text()').extract()
        print(result)
        # Guard: nothing extracted at all, or the API returned an empty
        # page ('[]').  The original code indexed result[0] unguarded,
        # which raised IndexError on an empty extract().
        if not result or result[0] == '[]':
            return

        # Un-escape the stringified JSON and split it into one '{...}'
        # chunk per movie.  NOTE(review): assumes the records contain no
        # nested '{...}' objects — true for this API's flat records.
        fin = re.findall(r'{.*?}', str(result).replace('\\\\', '\\')[1:-1])
        for chunk in fin:
            print(chunk)
            d_json = json.loads(chunk)
            need = {
                '评分': d_json['score'],
                '电影名称': d_json['title'],
                '排名': d_json['rank'],
                '类型': d_json['types'],
                '演员': d_json['actors'],
                '国家': d_json['regions'],
                '上映时间': d_json['release_date'],
                # Chart type, sliced out of the 'type=...&' query parameter.
                'type': re.findall('type=.*?&', response.url)[0][5:-1],
                # Favour interval (e.g. '100:90'), sliced out of
                # 'interval_id=...&', to distinguish result sets later.
                '喜爱区间': re.findall('interval_id=.*?&', response.url)[0][12:-1],
            }
            print('###############yield need is OK ###############')
            yield need

        # This page had data, so build and request the next page.
        nexturl = self.replace_page(response.url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            'Referer': 'https://accounts.douban.com/'
        }
        # BUG FIX: headers were previously built but never attached to
        # the follow-up request.
        yield scrapy.Request(nexturl, callback=self.parse, headers=headers)

    def replace_page(self, s):
        """Return *s* with its 'start=N' query parameter advanced by 1000.

        Assumes 'start=' is followed by another '&'-separated parameter,
        as in the chart URLs this spider crawls.
        """
        old = re.findall('start=.*?&', s)[0][0:-1]  # e.g. 'start=0'
        # Renamed from 'sum', which shadowed the builtin.
        next_start = str(int(old[6:]) + 1000)
        return s.replace(old, 'start=' + next_start)