# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import json
import os
from urllib import parse

import openpyxl
from itemadapter import ItemAdapter
from openpyxl import Workbook

from spider.spiders.Redis_Con import Redis_Con

# Numeric genre id (the item's 'type' field) -> genre name. The name is used
# both as the Redis list key and as the Excel sheet title.
_GENRE_NAMES = {
    24: '喜剧', 11: '剧情', 5: '动作', 13: '爱情', 17: '科幻', 25: '动画',
    10: '悬疑', 19: '惊悚', 20: '恐怖', 1: '纪录片', 23: '短片', 6: '色情',
    26: '同性', 14: '音乐', 7: '歌舞', 28: '家庭', 2: '传记', 8: '儿童',
    4: '历史', 22: '战争', 3: '犯罪', 27: '西部', 16: '奇幻', 15: '冒险',
    12: '灾难', 29: '武侠', 30: '古装', 18: '运动', 31: '黑色电影',
}

# Workbook written by ExcelPipeline (relative to the working directory).
_EXCEL_FILE = "豆瓣爬虫结果.xlsx"

# Column titles written once at the top of every genre sheet.
_EXCEL_HEADER = ['电影名称', '评分', '排名', '类型', '国家', '上映时间', '演员', '喜爱区间']


class IPProxyPipeline:
    """Push the ``ip`` field of each item onto the ``freeip`` list."""

    def process_item(self, item, spider):
        """Store ``item['ip']`` via ``Redis_Con().r.lpush`` and return the item.

        Raises:
            KeyError: if the item has no ``ip`` field.
        """
        # NOTE(review): a new Redis_Con is built for every item; if that is
        # expensive, consider creating it once in open_spider().
        r = Redis_Con().r
        r.lpush('freeip', item['ip'])
        print('免费代理已写入IP池')
        return item


class RedisSavePipeline(object):
    """Store each item as JSON in a list keyed by its genre name."""

    def process_item(self, item, spider):
        """Serialize the item to JSON and ``lpush`` it under its genre name.

        Raises:
            KeyError: if ``item['type']`` is missing or is not a known genre id.
            ValueError: if ``item['type']`` cannot be converted to ``int``.
        """
        # json.dumps() cannot serialize non-dict item types directly, so go
        # through ItemAdapter to obtain a plain dict first.
        payload = json.dumps(ItemAdapter(item).asdict())
        print('json输出---->' + payload)

        # Resolve the key before connecting so a bad genre id fails early.
        list_key = self.typetostr(int(item['type']))

        r = Redis_Con().r
        r.lpush(list_key, payload)

        return item

    def typetostr(self, id):
        """Return the genre name for numeric genre ``id``.

        Raises:
            KeyError: if ``id`` is not a known genre id.
        """
        return _GENRE_NAMES[id]


class ExcelPipeline(object):
    """Append each item as a row to a per-genre sheet of one Excel workbook."""

    def typetostr(self, id):
        """Return the genre name for numeric genre ``id``.

        Raises:
            KeyError: if ``id`` is not a known genre id.
        """
        return _GENRE_NAMES[id]

    def process_item(self, item, spider):
        """Write the item into the sheet named after its genre and return it.

        The workbook is created on first use. Each sheet gets the header row
        exactly once, when the sheet is created.

        Raises:
            KeyError: if a required field is missing or the genre id is
                unknown.
        """
        sheet_name = self.typetostr(int(item['type']))
        row = [
            item['电影名称'],
            item['评分'],
            item['排名'],
            ''.join(item['类型']),
            item['国家'],
            item['上映时间'],
            ''.join(item['演员']),
            parse.unquote(item['喜爱区间']),
        ]

        if os.path.exists(_EXCEL_FILE):
            # Workbook already on disk: reuse it and add the sheet if needed.
            wb = openpyxl.load_workbook(_EXCEL_FILE)
            is_new_sheet = sheet_name not in wb.sheetnames
            if is_new_sheet:
                ws = wb.create_sheet(sheet_name)
            else:
                ws = wb[sheet_name]
        else:
            # Workbook() takes no filename (its first parameter is
            # write_only); a fresh workbook starts with one default sheet,
            # which is renamed to the genre instead of being left empty.
            wb = Workbook()
            ws = wb.active
            ws.title = sheet_name
            is_new_sheet = True

        if is_new_sheet:
            ws.append(list(_EXCEL_HEADER))
        ws.append(row)

        # save() lives on the workbook, not on the worksheet.
        wb.save(_EXCEL_FILE)
        return item