You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
80 lines
3.0 KiB
80 lines
3.0 KiB
3 years ago
|
# Define your item pipelines here
|
||
|
#
|
||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||
|
|
||
|
|
||
|
# useful for handling different item types with a single interface
|
||
|
from urllib import parse
|
||
|
|
||
|
import openpyxl
|
||
|
from itemadapter import ItemAdapter
|
||
|
import os
|
||
|
import json
|
||
|
from openpyxl import Workbook
|
||
|
from spider.spiders.Redis_Con import Redis_Con
|
||
|
|
||
|
|
||
|
class IPProxyPipeline:
    """Item pipeline that pushes each scraped proxy IP onto the 'freeip' list."""

    def process_item(self, item, spider):
        """Push ``item['ip']`` via ``lpush`` and hand the item on unchanged.

        Args:
            item: Scraped item; must provide an ``'ip'`` key.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # A fresh Redis_Con is built for every item; its ``r`` attribute is
        # presumably a Redis client -- confirm against spider.spiders.Redis_Con.
        redis_client = Redis_Con().r
        redis_client.lpush('freeip', item['ip'])
        print('免费代理已写入IP池')
        return item
|
||
|
|
||
|
|
||
|
class RedisSavePipeline(object):
    """Item pipeline that stores every item as JSON in a per-genre list."""

    # Numeric type id -> genre name. The genre name is used as the list key.
    # Built once at class level instead of on every typetostr() call.
    _TYPE_NAMES = {
        24: '喜剧', 11: '剧情', 5: '动作', 13: '爱情', 17: '科幻', 25: '动画',
        10: '悬疑', 19: '惊悚', 20: '恐怖', 1: '纪录片', 23: '短片', 6: '色情',
        26: '同性', 14: '音乐', 7: '歌舞', 28: '家庭', 2: '传记', 8: '儿童',
        4: '历史', 22: '战争', 3: '犯罪', 27: '西部', 16: '奇幻', 15: '冒险',
        12: '灾难', 29: '武侠', 30: '古装', 18: '运动', 31: '黑色电影',
    }

    def process_item(self, item, spider):
        """Serialize ``item`` to JSON and ``lpush`` it under its genre name.

        Args:
            item: Mapping-like scraped item; must provide a ``'type'`` key
                whose value converts to an int known to ``typetostr``.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            KeyError: If ``'type'`` is missing or is not a known type id.
            ValueError: If ``item['type']`` cannot be converted to ``int``.
        """
        # A fresh Redis_Con is built for every item; its ``r`` attribute is
        # presumably a Redis client -- confirm against spider.spiders.Redis_Con.
        r = Redis_Con().r

        # Serialize once and reuse the result for both the log line and the
        # push. dict(item) keeps plain dicts working and also accepts other
        # mapping-style items, which json.dumps cannot encode directly.
        payload = json.dumps(dict(item))
        print('json输出---->' + payload)

        list_key = self.typetostr(int(item['type']))
        r.lpush('%s' % list_key, payload)

        return item

    def typetostr(self, id):
        """Return the genre name for the numeric type ``id``.

        The parameter keeps its original name (``id``) so keyword callers
        keep working, even though it shadows the builtin.

        Raises:
            KeyError: If ``id`` is not a known type id.
        """
        return self._TYPE_NAMES[id]
|
||
|
|
||
|
|
||
|
class ExcelPipeline(object):
    """Item pipeline that appends each item to an .xlsx file, one sheet per genre."""

    # Numeric type id -> genre name. The genre name is used as the sheet title.
    # Built once at class level instead of on every typetostr() call.
    _TYPE_NAMES = {
        24: '喜剧', 11: '剧情', 5: '动作', 13: '爱情', 17: '科幻', 25: '动画',
        10: '悬疑', 19: '惊悚', 20: '恐怖', 1: '纪录片', 23: '短片', 6: '色情',
        26: '同性', 14: '音乐', 7: '歌舞', 28: '家庭', 2: '传记', 8: '儿童',
        4: '历史', 22: '战争', 3: '犯罪', 27: '西部', 16: '奇幻', 15: '冒险',
        12: '灾难', 29: '武侠', 30: '古装', 18: '运动', 31: '黑色电影',
    }

    # Column titles written as the first row of every newly created sheet.
    _HEADER = ['电影名称', '评分', '排名', '类型', '国家', '上映时间', '演员', '喜爱区间']

    def typetostr(self, id):
        """Return the genre name for the numeric type ``id``.

        The parameter keeps its original name (``id``) so keyword callers
        keep working, even though it shadows the builtin.

        Raises:
            KeyError: If ``id`` is not a known type id.
        """
        return self._TYPE_NAMES[id]

    def process_item(self, item, spider):
        """Append ``item`` as one row to its genre sheet and save the workbook.

        The workbook file is created on first use. Each genre sheet gets the
        header row exactly once, when the sheet is created.

        Args:
            item: Mapping-like scraped item providing ``'type'`` plus the
                movie fields read below.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            KeyError: If a required field is missing or ``'type'`` is not a
                known type id.
        """
        now_t = self.typetostr(int(item['type']))
        name = "豆瓣爬虫结果.xlsx"
        line = [
            item['电影名称'],
            item['评分'],
            item['排名'],
            ''.join(item['类型']),
            item['国家'],
            item['上映时间'],
            ''.join(item['演员']),
            parse.unquote(item['喜爱区间']),
        ]

        if os.path.exists(name):
            wb = openpyxl.load_workbook(name)
        else:
            # Workbook() takes no filename: its first parameter is the
            # write_only flag, so passing the name silently enabled
            # write-only mode. The path is only needed by save().
            wb = openpyxl.Workbook()
            # Drop the default empty sheet so the file holds genre sheets only.
            wb.remove(wb.active)

        if now_t in wb.sheetnames:
            ws = wb[now_t]
        else:
            ws = wb.create_sheet(now_t)
            # Write the header only for a new sheet, not for every item.
            ws.append(self._HEADER)

        ws.append(line)
        # save() is a Workbook method; worksheets do not have one.
        wb.save(name)
        return item
|