You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

75 lines
2.7 KiB

import scrapy
import json
import re
from spider.spiders.Redis_Con import Redis_Con
class DoubanSpider(scrapy.Spider):
    """Spider for Douban's movie chart JSON endpoint.

    The seed URL comes from ``Redis_Con().geturl()`` and is expected to look
    like
    ``https://movie.douban.com/j/chart/top_list?type=0&interval_id=100%3A90&action=&start=0&limit=100``.
    Every response body is decoded as a JSON array of movie objects; one item
    dict is yielded per movie, and the next page is requested until an empty
    array comes back.
    """

    name = 'douban'
    allowed_domains = ['movie.douban.com']

    # Numeric ``start`` query parameter; group 1 is the ``start=`` prefix
    # (with its leading separator, if any), group 2 the current offset.
    _START_PARAM = re.compile(r'((?:^|[?&])start=)(\d+)')

    def start_requests(self):
        """Yield the first request, for the URL supplied by ``Redis_Con``."""
        url = Redis_Con().geturl()
        # NOTE(review): presumably geturl() returns a falsy value when no URL
        # is queued -- confirm against Redis_Con.
        if not url:
            self.logger.warning('Redis_Con returned no URL; nothing to crawl')
            return
        self.logger.info('当前请求的URL----->' + url)
        # dont_filter=True so the seed URL is fetched even if the duplicate
        # filter has already seen it.
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Yield one item per movie in the response, then request the next page.

        The body is decoded directly as JSON instead of being round-tripped
        through an HTML XPath and ``str(list)``, which corrupted entries
        containing quotes, backslashes or markup characters.

        Pagination stops when the endpoint returns an empty array or a body
        that is not a JSON array.
        """
        try:
            movies = json.loads(response.text)
        except ValueError:
            self.logger.warning('Non-JSON response from %s', response.url)
            return
        if not isinstance(movies, list):
            self.logger.warning('Unexpected JSON payload from %s', response.url)
            return
        if not movies:
            # An empty array means we have paged past the last result.
            return

        # Both values are the same for every movie on the page, so extract
        # them once. They are kept in their raw (still URL-encoded) form.
        chart_type = self._raw_query_value(response.url, 'type')
        interval = self._raw_query_value(response.url, 'interval_id')

        for movie in movies:
            item = {
                '评分': movie['score'],
                '电影名称': movie['title'],
                '排名': movie['rank'],
                '类型': movie['types'],
                '演员': movie['actors'],
                '国家': movie['regions'],
                '上映时间': movie['release_date'],
                'type': chart_type,
                # Rating interval taken from the URL so items from different
                # intervals can be told apart later.
                '喜爱区间': interval,
            }
            self.logger.debug('###############yield need is OK ###############')
            yield item

        # The page had data, so there may be more: request the next offset.
        yield scrapy.Request(self.replace_page(response.url), callback=self.parse)

    def replace_page(self, s: str, step: int = 1000) -> str:
        """Return URL ``s`` with its ``start`` query parameter advanced by ``step``.

        Args:
            s: URL containing a numeric ``start=<n>`` query parameter, in any
                position (including last).
            step: Amount added to the current offset. Defaults to 1000.
                NOTE(review): the sample URL uses ``limit=100``; confirm that
                the step is not meant to equal ``limit``.

        Returns:
            The URL with only the ``start`` value replaced.

        Raises:
            ValueError: if ``s`` has no numeric ``start`` parameter.
        """
        found = self._START_PARAM.search(s)
        if found is None:
            raise ValueError('no numeric start parameter in URL: %r' % (s,))
        next_offset = int(found.group(2)) + step
        # Splice by position so no other part of the URL can be rewritten.
        return s[:found.start(2)] + str(next_offset) + s[found.end(2):]

    @staticmethod
    def _raw_query_value(url: str, key: str):
        """Return the raw (undecoded) value of query parameter ``key``, or None."""
        found = re.search(r'[?&]' + re.escape(key) + r'=([^&#]*)', url)
        return found.group(1) if found else None