You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

75 lines
2.7 KiB

import scrapy
import json
import re
from spider.spiders.Redis_Con import Redis_Con
class DoubanSpider(scrapy.Spider):
    """Crawl a paginated JSON movie listing and yield one item per movie.

    The first URL comes from ``Redis_Con().geturl()``. Each response is
    expected to carry a JSON array of movie objects as page text; the spider
    emits one dict per object and then requests the next page by advancing
    the ``start`` query parameter. Crawling stops at the first empty page.
    """

    name = 'douban'
    # NOTE(review): the only entry is an empty string in this view of the
    # file. Follow-up requests are sent without ``dont_filter`` and may be
    # dropped as off-site if this is not a real domain -- confirm the value.
    allowed_domains = ['']

    # Matches the ``start`` query parameter only when it is a whole parameter
    # (preceded by '?', '&' or the start of the string) with a numeric value.
    _START_PARAM = re.compile(r'((?:^|[?&])start=)(\d+)(?=[&#]|$)')

    def start_requests(self):
        """Yield the initial request for the URL handed out by ``Redis_Con``."""
        url = Redis_Con().geturl()
        if not url:
            # presumably geturl() always returns a URL; guard anyway so a
            # missing value is reported instead of failing inside Request.
            self.logger.error('Redis_Con.geturl() returned no start URL')
            return
        # dont_filter=True so the seed URL is fetched even if seen before.
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Yield one item per movie in *response*, then the next-page request.

        Args:
            response: Response whose ``/html/body/p`` text holds a JSON array
                of movie objects with the keys ``score``, ``title``, ``rank``,
                ``types``, ``actors``, ``regions`` and ``release_date``.

        Yields:
            dict: one item per movie, including the ``type`` and
                ``interval_id`` values taken from the request URL.
            scrapy.Request: request for the following page, unless the
                current page is empty.
        """
        text_nodes = response.xpath('/html/body/p/text()').extract()
        payload = ''.join(text_nodes).strip()

        # A missing body or an empty JSON array marks the end of the listing.
        if not payload or payload == '[]':
            return

        # Decode the array in one go instead of regex-matching '{...}' on the
        # repr of the extracted list, which broke on nested braces and quotes.
        try:
            entries = json.loads(payload)
        except json.JSONDecodeError:
            self.logger.error('Could not decode JSON from %s', response.url)
            return
        if not isinstance(entries, list) or not entries:
            return

        # Both values depend only on the URL, so compute them once per page.
        # They are kept exactly as they appear in the URL (no unquoting).
        movie_type = self._query_value(response.url, 'type')
        interval = self._query_value(response.url, 'interval_id')

        for entry in entries:
            self.logger.debug('%s', entry)
            need = {
                '评分': entry['score'],
                '电影名称': entry['title'],
                '排名': entry['rank'],
                '类型': entry['types'],
                '演员': entry['actors'],
                '国家': entry['regions'],
                '上映时间': entry['release_date'],
                'type': movie_type,
                '喜爱区间': interval,
            }
            self.logger.debug('###############yield need is OK ###############')
            yield need

        # NOTE(review): these headers are built but not attached to the
        # follow-up request in the code visible here, and the Referer value is
        # empty. Confirm whether they should be passed via ``headers=`` or
        # removed.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            'Referer': '',
        }

        # The page had data: build the next page's URL and parse it with this
        # same callback.
        nexturl = self.replace_page(response.url)
        yield scrapy.Request(nexturl, callback=self.parse)

    @staticmethod
    def _query_value(url, key):
        """Return the raw value of query parameter *key* in *url*, or None.

        Unlike a ``key=.*?&`` pattern this also works when the parameter is
        the last one in the URL, and it does not match *key* as the suffix of
        a longer parameter name.
        """
        match = re.search(r'(?:^|[?&])' + re.escape(key) + r'=([^&#]*)', url)
        return match.group(1) if match else None

    def replace_page(self, s, step=1000):
        """Return *s* with its ``start`` query parameter increased by *step*.

        Args:
            s: URL (or query string) containing a numeric ``start`` parameter.
            step: Amount to add to ``start``; defaults to 1000.

        Returns:
            str: *s* with only the ``start`` value changed.

        Raises:
            ValueError: if *s* has no numeric ``start`` parameter.
        """
        def advance(match):
            return match.group(1) + str(int(match.group(2)) + step)

        # count=1 so only the actual parameter is rewritten, not every
        # substring that happens to look like it.
        new_s, replaced = self._START_PARAM.subn(advance, s, count=1)
        if not replaced:
            raise ValueError('no numeric start= parameter in %r' % (s,))
        return new_s