import json
import re

import scrapy

from spider.spiders.Redis_Con import Redis_Con


class DoubanSpider(scrapy.Spider):
    """Crawl a Douban movie chart JSON listing page by page.

    The seed URL is obtained from ``Redis_Con().geturl()``. Each response is
    expected to be a JSON array of movie objects; every object is turned into
    one item, and the next page is requested by increasing the ``start``
    query parameter until an empty array is returned.
    """

    name = 'douban'
    allowed_domains = ['movie.douban.com']

    # Matches the ``start=<digits>`` query parameter only when it is a real
    # parameter (preceded by ``?``/``&`` or at the beginning of the string),
    # so look-alikes such as ``restart=0`` are left untouched.
    _START_PARAM = re.compile(r'((?:^|[?&])start=)(\d+)(?=&|#|$)')

    def start_requests(self):
        """Yield the single seed request taken from ``Redis_Con``.

        Yields:
            scrapy.Request: request for the seed URL, handled by ``parse``.
            Nothing is yielded when no URL is available.
        """
        # NOTE(review): what geturl() returns is not visible here; Redis
        # clients commonly hand back bytes, so decode defensively.
        seed_url = Redis_Con().geturl()
        if isinstance(seed_url, bytes):
            seed_url = seed_url.decode('utf-8')
        if not seed_url:
            self.logger.warning('Redis_Con.geturl() returned no URL; nothing to crawl')
            return

        self.logger.info('当前请求的URL----->%s', seed_url)
        # dont_filter=True: the seed must be fetched even if the duplicate
        # filter has already seen this URL.
        yield scrapy.Request(url=seed_url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Turn one page of the JSON listing into items and follow the next page.

        Args:
            response: response whose body is expected to be a JSON array of
                movie objects.

        Yields:
            dict: one item per movie, carrying the movie fields plus the
            ``type`` and ``interval_id`` query parameters of the request URL
            (raw, still percent-encoded) so items from different listings can
            be told apart later.
            scrapy.Request: request for the next page, when the current page
            contained at least one movie.

        Raises:
            KeyError: if a movie object lacks one of the expected fields.
        """
        # Decode the body directly instead of pushing JSON through the HTML
        # parser and re-splitting it with a regex, which breaks on any "}"
        # inside a value.
        try:
            movies = json.loads(response.text)
        except ValueError:
            self.logger.error('Response from %s is not valid JSON', response.url)
            return

        if not isinstance(movies, list):
            self.logger.error('Unexpected JSON payload from %s', response.url)
            return
        if not movies:
            # An empty array means we have paged past the last result.
            return

        # Both values are constant for the whole page, so read them once.
        chart_type = self._query_value(response.url, 'type')
        interval = self._query_value(response.url, 'interval_id')

        self.logger.debug('%d movies on %s', len(movies), response.url)
        for movie in movies:
            yield {
                '评分': movie['score'],
                '电影名称': movie['title'],
                '排名': movie['rank'],
                '类型': movie['types'],
                '演员': movie['actors'],
                '国家': movie['regions'],
                '上映时间': movie['release_date'],
                'type': chart_type,
                # Rating-interval of the listing, kept to distinguish items later.
                '喜爱区间': interval,
            }

        # The page had data, so ask for the next one; parse handles it again.
        yield scrapy.Request(self.replace_page(response.url), callback=self.parse)

    @staticmethod
    def _query_value(url, name):
        """Return the raw value of query parameter ``name`` in ``url``, or None."""
        found = re.search(r'[?&]' + re.escape(name) + r'=([^&#]*)', url)
        return found.group(1) if found else None

    def replace_page(self, s, step=1000):
        """Return ``s`` with its ``start`` query parameter increased by ``step``.

        Only the first real ``start`` parameter is rewritten; the rest of the
        URL is returned byte-for-byte unchanged.

        Args:
            s: URL containing a numeric ``start`` query parameter.
            step: amount added to the current ``start`` value.

        Returns:
            str: the URL pointing at the next page.

        Raises:
            ValueError: if ``s`` has no numeric ``start`` query parameter.
        """
        def advance(match):
            return '{}{}'.format(match.group(1), int(match.group(2)) + step)

        next_url, replaced = self._START_PARAM.subn(advance, s, count=1)
        if not replaced:
            raise ValueError(
                "URL has no numeric 'start' query parameter: {!r}".format(s)
            )
        return next_url