You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

80 lines
3.0 KiB

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from urllib import parse
import openpyxl
from itemadapter import ItemAdapter
import os
import json
from openpyxl import Workbook
from spider.spiders.Redis_Con import Redis_Con
class IPProxyPipeline:
    """Item pipeline that records each scraped proxy address in Redis."""

    def process_item(self, item, spider):
        """Push ``item['ip']`` onto the ``freeip`` list and pass the item on.

        Args:
            item: Scraped item; must support ``item['ip']``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        # ``Redis_Con().r`` is presumably a Redis client -- confirm in
        # spider.spiders.Redis_Con.
        connection = Redis_Con().r
        connection.lpush('freeip', item['ip'])
        # Message: "free proxy has been written to the IP pool".
        print('免费代理已写入IP池')
        return item
class RedisSavePipeline(object):
    """Item pipeline that stores each item as JSON in a per-genre Redis list."""

    # Numeric ``type`` code carried by an item -> genre name, which is also
    # used as the Redis list key. Built once instead of on every lookup.
    _TYPE_NAMES = {
        24: '喜剧', 11: '剧情', 5: '动作', 13: '爱情', 17: '科幻',
        25: '动画', 10: '悬疑', 19: '惊悚', 20: '恐怖', 1: '纪录片',
        23: '短片', 6: '色情', 26: '同性', 14: '音乐', 7: '歌舞',
        28: '家庭', 2: '传记', 8: '儿童', 4: '历史', 22: '战争',
        3: '犯罪', 27: '西部', 16: '奇幻', 15: '冒险', 12: '灾难',
        29: '武侠', 30: '古装', 18: '运动', 31: '黑色电影',
    }

    def process_item(self, item, spider):
        """Serialize ``item`` to JSON and push it onto the list for its genre.

        Args:
            item: Scraped item; must be mapping-like and contain a ``type``
                entry convertible with ``int()``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.

        Raises:
            KeyError: If ``item['type']`` is missing or is not a known code.
        """
        # ``Redis_Con().r`` is presumably a Redis client -- confirm in
        # spider.spiders.Redis_Con.
        r = Redis_Con().r
        # Serialize once and reuse. ``dict(item)`` lets mapping-style items
        # that are not plain dicts be encoded; ``json.dumps`` only accepts
        # real ``dict`` objects.
        payload = json.dumps(dict(item))
        print('json输出---->' + payload)
        list_key = self.typetostr(int(item['type']))
        r.lpush(list_key, payload)
        return item

    def typetostr(self, id):
        """Return the genre name for numeric type code ``id``.

        Raises:
            KeyError: If ``id`` is not a known type code.
        """
        return self._TYPE_NAMES[id]
class ExcelPipeline(object):
    """Item pipeline that appends each movie item to a per-genre Excel sheet."""

    # Numeric ``type`` code carried by an item -> genre name, which is also
    # used as the worksheet title. Built once instead of on every lookup.
    _TYPE_NAMES = {
        24: '喜剧', 11: '剧情', 5: '动作', 13: '爱情', 17: '科幻',
        25: '动画', 10: '悬疑', 19: '惊悚', 20: '恐怖', 1: '纪录片',
        23: '短片', 6: '色情', 26: '同性', 14: '音乐', 7: '歌舞',
        28: '家庭', 2: '传记', 8: '儿童', 4: '历史', 22: '战争',
        3: '犯罪', 27: '西部', 16: '奇幻', 15: '冒险', 12: '灾难',
        29: '武侠', 30: '古装', 18: '运动', 31: '黑色电影',
    }

    # Column titles written as the first row of every newly created sheet.
    _HEADER = ['电影名称', '评分', '排名', '类型', '国家', '上映时间', '演员', '喜爱区间']

    def typetostr(self, id):
        """Return the genre name for numeric type code ``id``.

        Raises:
            KeyError: If ``id`` is not a known type code.
        """
        return self._TYPE_NAMES[id]

    def process_item(self, item, spider):
        """Append ``item`` as one row to the sheet named after its genre.

        The workbook file is loaded (or created) and saved on every call.

        Args:
            item: Scraped item; must provide the keys read below
                (``type`` plus the eight column fields).
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.

        Raises:
            KeyError: If a required key is missing or ``type`` is unknown.
        """
        now_t = self.typetostr(int(item['type']))
        name = "豆瓣爬虫结果.xlsx"
        line = [
            item['电影名称'], item['评分'], item['排名'],
            ''.join(item['类型']), item['国家'], item['上映时间'],
            ''.join(item['演员']), parse.unquote(item['喜爱区间']),
        ]

        if os.path.exists(name):
            # Workbook already on disk: reuse it, adding the sheet if needed.
            wb = openpyxl.load_workbook(name)
            is_new_sheet = now_t not in wb.sheetnames
            ws = wb.create_sheet(now_t) if is_new_sheet else wb[now_t]
        else:
            # No file yet. Workbook() takes no filename (its first parameter
            # is the write_only flag). Reuse the default sheet so the file
            # does not keep an empty extra sheet.
            wb = openpyxl.Workbook()
            ws = wb.active
            ws.title = now_t
            is_new_sheet = True

        # Write the header only once per sheet, not before every data row.
        if is_new_sheet:
            ws.append(self._HEADER)
        ws.append(line)
        # Saving is a Workbook operation; worksheets have no save().
        wb.save(name)
        return item