diff --git a/Bilispider _redis/README.md b/Bilispider _redis/README.md
new file mode 100644
index 0000000..6376748
--- /dev/null
+++ b/Bilispider _redis/README.md
@@ -0,0 +1,2 @@
+# python
+
diff --git a/Bilispider _redis/STHeiti-Medium.ttc b/Bilispider _redis/STHeiti-Medium.ttc
new file mode 100644
index 0000000..fb7116f
Binary files /dev/null and b/Bilispider _redis/STHeiti-Medium.ttc differ
diff --git a/Bilispider _redis/class.py b/Bilispider _redis/class.py
new file mode 100644
index 0000000..cfeff68
--- /dev/null
+++ b/Bilispider _redis/class.py
@@ -0,0 +1,208 @@
+
+import matplotlib.pyplot as plt
+from matplotlib import font_manager
+import pymysql
+
+class show(object):
+    info = {}
+    TScore = []  # composite score
+    name = []  # anime title
+    bfl = []  # play count
+    pls = []  # comment count
+    scs = []  # favourite count
+
+    def save(self):  # read the rows out of the database and store them on the object
+        db = pymysql.connect(host='47.106.183.36', port=3306,
+                             user='fuchuang', password='fuchuang',
+                             database='fuchuang', charset='utf8mb4')  # database connection
+        use = 'use fuchuang;'
+        show = 'show tables;'
+        str = 'select * from bangumi;'
+        cursor = db.cursor()
+        try:
+            cursor.execute(show)
+            cursor.execute(str)  # query the bangumi table
+            desc = cursor.description
+            data = cursor.fetchall()  # fetch every row
+            list = []
+            for data in data:  # keep only rows whose score is above 9.5
+                if (data[3] != '暂无评分'):
+                    if (float(data[3]) > 9.5):
+                        # print(data)
+                        self.name.append(data[1])  # anime title
+
+                        list = [data[2]]
+                        if '万' in list[0]:
+                            list[0] = float(list[0].replace('万', ''))
+                        elif '亿' in list[0]:
+                            list[0] = float(list[0].replace('亿', '')) * 10000
+                        self.bfl.append(list[0])  # play count
+
+                        list = [data[4]]
+                        if '万' in list[0]:
+                            list[0] = float(list[0].replace('万', ''))
+                        else:
+                            list[0] = float(list[0])
+                        self.TScore.append(float(data[3]) * list[0])  # composite score
+                        self.scs.append(list[0])  # favourite count
+
+                        list = [data[5]]
+                        if '万' in list[0]:
+                            list[0] = float(list[0].replace('万', ''))
+                        else:
+                            list[0] = float(list[0])
+                        self.pls.append(list[0])  # comment count
+
+        except Exception as e:
+            print(e)
+            db.rollback()
+        finally:
+            cursor.close()
+            db.close()
+        print(self.name)
+        print(self.TScore)
+        print(self.bfl)
+        print(self.pls)
+        print(self.scs)
+        # info = {'动漫名': self.name, '播放量(万)': self.bfl, '评论数(万)': self.pls, '收藏数(万)': self.scs, '综合评分': self.TScore}
+        # dm_file = pandas.DataFrame(info)
+        # dm_file.to_excel('Dongman.xlsx', sheet_name="动漫数据分析")
+        # return all of the lists
+        return self.name, self.bfl, self.pls, self.scs, self.TScore
+    def view(self):  # data visualisation
+        # allow Chinese characters on the axes
+        plt.rcParams['font.sans-serif'] = ['SimHei']
+        plt.rcParams['axes.unicode_minus'] = False
+
+        my_font = font_manager.FontProperties(fname='STHeiti-Medium.ttc')
+        dm_name = self.name  # bangumi title
+        dm_play = self.bfl  # bangumi play count
+        dm_review = self.pls  # bangumi comment count
+        dm_favorite = self.scs  # bangumi favourite count
+        dm_com_score = self.TScore  # bangumi score
+        y_score = [9.6, 9.7, 9.8, 9.9, 10.0]
+
+        # dm_all = self.TScore * self.scs
+
+
+        fig, ax1 = plt.subplots()
+        plt.bar(dm_name, dm_com_score, color='red')
+        plt.title('综合评分和播放量数据分析', fontproperties=my_font)
+        ax1.tick_params(labelsize=6)
+        plt.xlabel('番剧名')
+        plt.ylabel('综合评分')
+        plt.xticks(rotation=90, color='green')
+
+        ax2 = ax1.twinx()
+        ax2.plot(dm_play, color='cyan')
+        plt.ylabel('播放量')
+
+        plt.plot(1, label='评分', color='red', linewidth=5.0)
+        plt.plot(1, label='播放量', color="cyan", linewidth=1.0, linestyle="-")
+        plt.legend()
+
+        plt.savefig(r'E:1.png', dpi=1000, bbox_inches='tight')
+
+        # ********************************************************************** comment count vs. favourite count
+        # ******** comment-count bar chart
+        fig, ax3 = plt.subplots()
+        plt.bar(dm_name, dm_review, color='green')
+        plt.title('番剧收藏数与评论数分析')
+        plt.ylabel('评论数(万)')
+        ax3.tick_params(labelsize=6)
+        plt.xticks(rotation=90, color='green')
+
+        # ******* favourite-count line chart
+        ax4 = ax3.twinx()  # required when combining two chart types
+        ax4.plot(dm_favorite, color='yellow')  # line width / marker style
+        plt.ylabel('收藏数(万)')
+
+        plt.plot(1, label='评论数', color="green", linewidth=5.0)
+        plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
+        plt.legend()
+        plt.savefig(r'E:2.png', dpi=1000, bbox_inches='tight')
+
+        # ********************************************************************** composite score vs. favourite count
+        # ******* composite-score bar chart
+        fig, ax5 = plt.subplots()
+        plt.bar(dm_name, dm_com_score, color='red')
+        plt.title('综合评分和收藏数量数据分析')
+        plt.ylabel('综合评分')
+        ax5.tick_params(labelsize=6)
+        plt.xticks(rotation=90, color='green')
+
+        # ******* favourite-count line chart
+        ax6 = ax5.twinx()  # required when combining two chart types
+        ax6.plot(dm_favorite, color='yellow')  # line width / marker style
+        plt.ylabel('收藏数(万)')
+        plt.plot(1, label='综合评分', color="red", linewidth=5.0)
+        plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
+        plt.legend()
+
+        plt.savefig(r'E:3.png', dpi=1000, bbox_inches='tight')
+
+        # ********************************************************************** play count vs. favourite count
+        # ******* play-count bar chart
+        fig, ax7 = plt.subplots()
+        plt.bar(dm_name, dm_play, color='cyan')
+        plt.title('播放量和收藏数 数据分析')
+        plt.ylabel('播放量(万)')
+        ax7.tick_params(labelsize=6)
+        plt.xticks(rotation=90, color='green')
+
+        # ******* favourite-count line chart
+        ax8 = ax7.twinx()  # required when combining two chart types
+        ax8.plot(dm_favorite, color='yellow')  # line width / marker style
+        plt.ylabel('收藏数(万)')
+
+        plt.plot(1, label='播放量', color="cyan", linewidth=5.0)
+        plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
+        plt.legend()
+        plt.savefig(r'E:4.png', dpi=1000, bbox_inches='tight')
+
+        # ******* comment-count line chart
+        # ax8 = ax7.twinx()  # required when combining two chart types
+        # ax8.plot(dm_review, color='green')  # line width / marker style
+        # plt.ylabel('评论数(万)')
+        # plt.plot(1, label='播放量', color="cyan", linewidth=5.0)
+        # plt.plot(1, label='评论数', color="green", linewidth=1.0, linestyle="-")
+        # plt.legend()
+        # plt.savefig(r'E:4.png', dpi=1000, bbox_inches='tight')
+        # the comment-count bar chart must use dm_review, not dm_play
+        plt.show()
+    def print(self):
+        print(len(self.name))
+        print(len(self.bfl))
+        print(len(self.pls))
+        print(len(self.scs))
+        print(len(self.TScore))
+    def sort(self, i, j):
+        temp = self.name[i]
+        self.name[i] = self.name[j]
+        self.name[j] = temp
+
+        temp = self.bfl[i]
+        self.bfl[i] = self.bfl[j]
+        self.bfl[j] = temp
+
+        temp = self.pls[i]
+        self.pls[i] = self.pls[j]
+        self.pls[j] = temp
+
+        temp = self.scs[i]
+        self.scs[i] = self.scs[j]
+        self.scs[j] = temp
+
+        temp = self.TScore[i]
+        self.TScore[i] = self.TScore[j]
+        self.TScore[j] = temp
+
+def main():
+    a = show()  # create the object
+    a.save()  # load the data from the database
+    a.print()  # print how many entries each list holds
+    a.view()  # visualise
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
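Note on class.py above: the same '万' / '亿' string-to-number conversion is repeated for plays, favourites and comments inside save(). A small helper along these lines would factor it out (a sketch only; to_wan is not part of this diff, and it assumes the database stores the counts as strings such as '1.2万' or '3亿'):

def to_wan(value: str) -> float:
    """Normalise a Bilibili count string to units of 万 (10,000)."""
    if '亿' in value:
        return float(value.replace('亿', '')) * 10000  # 1 亿 = 10000 万
    if '万' in value:
        return float(value.replace('万', ''))
    return float(value)  # already a bare number

Inside save(), the three parsing blocks would then reduce to calls such as self.bfl.append(to_wan(data[2])).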
diff --git a/Bilispider _redis/ecxel.py b/Bilispider _redis/ecxel.py
new file mode 100644
index 0000000..50974f8
--- /dev/null
+++ b/Bilispider _redis/ecxel.py
@@ -0,0 +1,37 @@
+import pymysql
+import xlwt as xlwt
+
+
+def Toexcel(path, sql, title):
+    conn = pymysql.connect(host='47.106.183.36', port=3306,
+                           user='fuchuang', password='fuchuang',
+                           database='fuchuang', charset='utf8mb4')
+    curs = conn.cursor()
+    curs.execute(sql)
+    rows = curs.fetchall()
+    w = xlwt.Workbook(encoding='utf-8')
+    style = xlwt.XFStyle()  # initialise a cell style
+    font = xlwt.Font()  # create a font for the style
+    font.name = "微软雅黑"  # on Python 2 this would have to be written as u"微软雅黑"
+    style.font = font  # attach the font to the style
+    ws = w.add_sheet("视频信息", cell_overwrite_ok=True)
+    # use title as the Excel column headers
+    title = title.split(",")
+    for i in range(len(title)):
+        ws.write(0, i, title[i], style)
+    # write the rows returned by the database query
+    for i in range(len(rows)):
+        row = rows[i]
+        for j in range(len(row)):
+            if row[j]:
+                item = row[j]
+                ws.write(i + 1, j, item, style)
+    # writing finished, save the .xls file
+    w.save(path)
+    conn.close()
+
+
+sql_1 = '''select * from video'''
+Toexcel('视频信息.xls', sql_1, "id,标题,播放量,弹幕数,发布者")
+sql_2 = '''select * from bangumi'''
+Toexcel('番剧信息.xls', sql_2, "id,番名,播放量,评分,弹幕数")
diff --git a/Bilispider _redis/geckodriver.log b/Bilispider _redis/geckodriver.log
new file mode 100644
index 0000000..e69de29
diff --git a/Bilispider _redis/requestments.txt b/Bilispider _redis/requestments.txt
new file mode 100644
index 0000000..2ea4406
Binary files /dev/null and b/Bilispider _redis/requestments.txt differ
diff --git a/Bilispider _redis/scrapy.cfg b/Bilispider _redis/scrapy.cfg
new file mode 100644
index 0000000..90734e2
--- /dev/null
+++ b/Bilispider _redis/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = spider.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = spider
diff --git a/Bilispider _redis/spider/__init__.py b/Bilispider _redis/spider/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Bilispider _redis/spider/__pycache__/__init__.cpython-310.pyc b/Bilispider _redis/spider/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..db150bd
Binary files /dev/null and b/Bilispider _redis/spider/__pycache__/__init__.cpython-310.pyc differ
diff --git a/Bilispider _redis/spider/__pycache__/items.cpython-310.pyc b/Bilispider _redis/spider/__pycache__/items.cpython-310.pyc
new file mode 100644
index 0000000..f06886f
Binary files /dev/null and b/Bilispider _redis/spider/__pycache__/items.cpython-310.pyc differ
diff --git a/Bilispider _redis/spider/__pycache__/middlewares.cpython-310.pyc b/Bilispider _redis/spider/__pycache__/middlewares.cpython-310.pyc
new file mode 100644
index 0000000..0fe2bd6
Binary files /dev/null and b/Bilispider _redis/spider/__pycache__/middlewares.cpython-310.pyc differ
diff --git a/Bilispider _redis/spider/__pycache__/pipelines.cpython-310.pyc b/Bilispider _redis/spider/__pycache__/pipelines.cpython-310.pyc
new file mode 100644
index 0000000..5fdb4a4
Binary files /dev/null and b/Bilispider _redis/spider/__pycache__/pipelines.cpython-310.pyc differ
diff --git a/Bilispider _redis/spider/__pycache__/settings.cpython-310.pyc b/Bilispider _redis/spider/__pycache__/settings.cpython-310.pyc
new file mode 100644
index 0000000..1544cc3
Binary files /dev/null and b/Bilispider _redis/spider/__pycache__/settings.cpython-310.pyc differ
diff --git a/Bilispider _redis/spider/__pycache__/utils.cpython-310.pyc b/Bilispider _redis/spider/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000..1bdfbbe
Binary files /dev/null and b/Bilispider _redis/spider/__pycache__/utils.cpython-310.pyc differ
diff --git a/Bilispider _redis/spider/cookie.py b/Bilispider _redis/spider/cookie.py
new file mode 100644
index 0000000..ac915e2
--- /dev/null
+++ b/Bilispider _redis/spider/cookie.py
@@ -0,0 +1,20 @@
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from time import sleep
+import json
+
+s = Service("../chromedriver.exe")
+bro = webdriver.Chrome(service=s)
+# open bilibili
+bro.get('https://www.bilibili.com')
+bro.delete_all_cookies()  # clear any existing cookies first
+# you get 60 seconds to log in manually
+sleep(60)
+dictcookies = bro.get_cookies()  # read the browser cookies after logging in
+jsoncookies = json.dumps(dictcookies)  # convert the dict data to JSON so it can be saved
+
+# write the cookies.txt file
+with open('cookies.txt', 'w') as f:  # save to a text file
+    f.write(jsoncookies)
+print('cookies is ok')
\ No newline at end of file
diff --git a/Bilispider _redis/spider/cookies.txt b/Bilispider _redis/spider/cookies.txt
new file mode 100644
index 0000000..053e100
--- /dev/null
+++ b/Bilispider _redis/spider/cookies.txt
@@ -0,0 +1 @@
+[{"domain": ".bilibili.com", "httpOnly": false, "name": "innersign", "path": "/", "secure": false, "value": "0"}, {"domain": ".bilibili.com", "expiry": 1700969682, "httpOnly": false, "name": "i-wanna-go-back", "path": "/", "secure": false, "value": "-1"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "bili_jct", "path": "/", "secure": false, "value": "6c88a668c7442fa148fc9d06d6e40849"}, {"domain": ".bilibili.com", "expiry": 1700969680, "httpOnly": false, "name": "sid", "path": "/", "secure": false, "value": "qauykkrb"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": true, "name": "SESSDATA", "path": "/", "secure": true, "value": "8e595145%2C1684985681%2C785d7%2Ab1"}, {"domain": ".bilibili.com", "expiry": 1764041679, "httpOnly": false, "name": "buvid_fp", "path": "/", "secure": false, "value": "3fdd662d6b3f6d9fd8b2a5f1834013d4"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "DedeUserID", "path": "/", "secure": false, "value": "178665301"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "b_nut", "path": "/", "secure": false, "value": "1669433638"}, {"domain": ".bilibili.com", "expiry": 1700969682, "httpOnly": false, "name": "b_ut", "path": "/", "secure": false, "value": "5"}, {"domain": ".bilibili.com", "httpOnly": false, "name": "b_lsid", "path": "/", "secure": false, "value": "B3E3109EB_184B2000F25"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "DedeUserID__ckMd5", "path": "/", "secure": false, "value": "3f4304303449401f"}, {"domain": ".bilibili.com", "expiry": 1700969654, "httpOnly": false, "name": "_uuid", "path": "/", "secure": false, "value": "F0B6051B-46F1-CEFA-D47B-30415304BFA054324infoc"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "buvid3", "path": "/", "secure": false, "value": "69BFBC33-CEEC-E4A5-9E0A-7D9DB491BA0738742infoc"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "buvid4", "path": "/", "secure": false, "value": "7FF54BEA-5D27-6E79-874D-2092619183B623196-022112611-0kiN2FN18wB5k/cGcFypkA%3D%3D"}]
\ No newline at end of file
diff --git a/Bilispider _redis/spider/items.py b/Bilispider _redis/spider/items.py
new file mode 100644
index 0000000..b3c2ccb
--- /dev/null
+++ b/Bilispider _redis/spider/items.py
@@ -0,0 +1,21 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class VideoItem(scrapy.Item):
+    title = scrapy.Field()  # video title
+    view_counts = scrapy.Field()  # play count
+    barrage = scrapy.Field()  # danmaku (bullet comment) count
+    up = scrapy.Field()  # uploader
+
+
+class BiliItem(scrapy.Item):
+    title = scrapy.Field()  # series title
+    view_counts = scrapy.Field()  # play count
+    evaluate = scrapy.Field()  # rating
+    attention = scrapy.Field()  # follower count
+    barrage = scrapy.Field()  # danmaku count
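The downloader middleware below logs in by replaying cookies.txt through Selenium, and it also carries a commented-out hint about attaching cookies directly to requests. If that route were ever taken, the Selenium-format list written by cookie.py would first have to be flattened into the {name: value} mapping Scrapy expects. A sketch, assuming cookies.txt sits in the working directory:

import json

with open('cookies.txt', 'r', encoding='utf-8') as f:
    selenium_cookies = json.load(f)  # list of cookie dicts written by cookie.py
cookies_dict = {c['name']: c['value'] for c in selenium_cookies}
# usable as, e.g., scrapy.Request(url, cookies=cookies_dict)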
diff --git a/Bilispider _redis/spider/middlewares.py b/Bilispider _redis/spider/middlewares.py
new file mode 100644
index 0000000..a18b685
--- /dev/null
+++ b/Bilispider _redis/spider/middlewares.py
@@ -0,0 +1,123 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals, Request
+from scrapy.http import HtmlResponse
+from spider.utils import create_chrome_driver, add_cookies
+
+
+# useful for handling different item types with a single interface
+
+class SpiderSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class SpiderDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def __init__(self):  # simulate a logged-in user when the middleware is created
+        self.browser = create_chrome_driver(headless=True)  # headless=True
+        self.browser.get('https://www.bilibili.com/v/popular/rank/all')
+        add_cookies(self.browser, "../cookies.txt")
+
+    def __del__(self):  # runs when the middleware object is destroyed
+        self.browser.close()
+
+    def process_request(self, request: Request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        # request.cookies = COOKIES_DICT[randint(0, len(COOKIES_DICT))]
+        # request.meta = {'proxy': 'http://127.0.0.1:7080'}  # use a proxy to avoid an IP ban
+        self.browser.get(request.url)
+        return HtmlResponse(url=request.url, body=self.browser.page_source,  # dynamically rendered page source
+                            request=request, encoding='utf-8')
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class ProxyMiddleware(object):
+    def process_request(self, request, spider):
+        proxy = "http://172.20.10.2:7080"
+        request.meta["proxy"] = proxy
+        print(f"ProxyMiddleware --> {proxy}")
+
diff --git a/Bilispider _redis/spider/pipelines.py b/Bilispider _redis/spider/pipelines.py
new file mode 100644
index 0000000..28e4ab3
--- /dev/null
+++ b/Bilispider _redis/spider/pipelines.py
@@ -0,0 +1,87 @@
+
+import pymysql
+from spider.items import VideoItem, BiliItem
+
+# import openpyxl
+
+
+# class ExcelPipeline:
+#     def __init__(self):
+#         self.wb = openpyxl.Workbook()
+#         self.ws = self.wb.active
+#         self.ws.title = 'Goods'
+#         self.ws.append(('标题', '价格', '销量', '图片', '店铺', '位置'))
+#
+#     def close_spider(self, spider):
+#         self.wb.save('商品数据.xlsx')
+#
+#     def process_item(self, item, spider):
+#         title = item.get('title', '')  # fall back to an empty value if the field is missing
+#         price = item.get('price', 0)
+#         deal_count = item.get('deal_count', 0)
+#         picture = item.get('picture', '')
+#         location = item.get('location', '')
+#         shop = item.get('shop', '')
+#         self.ws.append((title, price, deal_count, picture, shop, location))
+#         return item
+
+
+class MysqlPipeline:
+    def __init__(self):
+        self.conn = pymysql.connect(host='47.106.183.36', port=3306,
+                                    user='fuchuang', password='fuchuang',
+                                    database='fuchuang', charset='utf8mb4')
+        self.cursor = self.conn.cursor()
+        self.data_bangumi = []
+        self.data_video = []
+
+    def close_spider(self, spider):
+        if len(self.data_bangumi) > 0:
+            self._write_to_mysql_bangumi()
+            self.data_bangumi.clear()
+        if len(self.data_video) > 0:
+            self._write_to_mysql_video()
+            self.data_video.clear()
+        self.conn.commit()
+        self.conn.close()
+
+    def process_item(self, item, spider):
+        if type(item) == VideoItem:
+            title = item.get('title', '')  # fall back to an empty value if the field is missing
+            view_counts = item.get('view_counts', '0')
+            barrage = item.get('barrage', '0')
+            up = item.get('up', '')
+            self.data_video.append((title, view_counts, barrage, up))
+        if type(item) == BiliItem:
+            title = item.get('title', '')  # fall back to an empty value if the field is missing
+            view_counts = item.get('view_counts', '0')
+            evaluate = item.get('evaluate', '0')
+            attention = item.get('attention', '0')
+            barrage = item.get('barrage', '0')
+            self.data_bangumi.append((title, view_counts, evaluate, attention, barrage))
+        if len(self.data_bangumi) >= 20:
+            self._write_to_mysql_bangumi()
+            self.data_bangumi.clear()
+        if len(self.data_video) >= 20:
+            self._write_to_mysql_video()
+            self.data_video.clear()
+        return item
+
+    def _write_to_mysql_bangumi(self):
+        for item in self.data_bangumi:
+            self.cursor.execute(
+                'insert into bangumi (title, view_counts, evaluate, attention, barrage) values (%s, %s, %s, %s, %s)',
+                item
+            )
+        self.conn.commit()
+
+    def _write_to_mysql_video(self):
+        for item in self.data_video:
+            self.cursor.execute(
+                'insert into video (title, view_counts, barrage, up) values (%s, %s, %s, %s)',
+                item
+            )
+        self.conn.commit()
+
+
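MysqlPipeline buffers items and flushes them twenty at a time into the bangumi and video tables, but the table definitions themselves are not part of this diff. A schema consistent with the INSERT statements above (the column types are assumptions; the spider yields every count as a raw string) might look like this:

import pymysql

DDL_BANGUMI = '''create table if not exists bangumi (
    id int auto_increment primary key,
    title varchar(255), view_counts varchar(64), evaluate varchar(32),
    attention varchar(64), barrage varchar(64)
) default charset=utf8mb4'''
DDL_VIDEO = '''create table if not exists video (
    id int auto_increment primary key,
    title varchar(255), view_counts varchar(64), barrage varchar(64), up varchar(255)
) default charset=utf8mb4'''

conn = pymysql.connect(host='47.106.183.36', port=3306, user='fuchuang',
                       password='fuchuang', database='fuchuang', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute(DDL_BANGUMI)
    cursor.execute(DDL_VIDEO)
conn.commit()
conn.close()

This column order also matches how class.py indexes the bangumi rows (data[2] plays, data[3] rating, data[4] follows, data[5] danmaku).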
diff --git a/Bilispider _redis/spider/settings.py b/Bilispider _redis/spider/settings.py
new file mode 100644
index 0000000..b4fd705
--- /dev/null
+++ b/Bilispider _redis/spider/settings.py
@@ -0,0 +1,101 @@
+
+BOT_NAME = 'spider'
+
+SPIDER_MODULES = ['spider.spiders']
+NEWSPIDER_MODULE = 'spider.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# USER_AGENT = 'spider (+http://www.yourdomain.com)'
+
+# scrapy-redis
+# de-duplication
+DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
+# use the scrapy_redis scheduler
+SCHEDULER = "scrapy_redis.scheduler.Scheduler"
+# allow the crawl to be paused and resumed (keep the queue in Redis)
+SCHEDULER_PERSIST = True
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+
+DOWNLOAD_DELAY = 3  # delay between requests
+RANDOMIZE_DOWNLOAD_DELAY = True  # randomise the delay
+
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+# COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+# DEFAULT_REQUEST_HEADERS = {
+#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#     'Accept-Language': 'en',
+# }
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#     'spider.middlewares.SpiderSpiderMiddleware': 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    'spider.middlewares.SpiderDownloaderMiddleware': 543,
+    # 'spider.middlewares.ProxyMiddleware': 600,
+
+
+}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+#     'scrapy.extensions.telnet.TelnetConsole': None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    # 'spider.pipelines.ExcelPipeline': 300,
+    'spider.pipelines.MysqlPipeline': 300,
+
+    # scrapy_redis pipeline (also keeps the scraped items in Redis)
+    'scrapy_redis.pipelines.RedisPipeline': 400,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
+TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
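settings.py enables the scrapy-redis dupefilter, scheduler and pipeline but does not set a Redis connection, so scrapy-redis falls back to its defaults (localhost:6379). If Redis runs on another host, the connection would be configured here as well; the values below are illustrative assumptions, not part of the diff:

# settings.py (sketch): point scrapy-redis at the shared Redis instance
REDIS_HOST = '127.0.0.1'   # assumed address
REDIS_PORT = 6379
# or, equivalently: REDIS_URL = 'redis://127.0.0.1:6379'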
diff --git a/Bilispider _redis/spider/spiders/Bili.py b/Bilispider _redis/spider/spiders/Bili.py
new file mode 100644
index 0000000..ee1c410
--- /dev/null
+++ b/Bilispider _redis/spider/spiders/Bili.py
@@ -0,0 +1,54 @@
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+from scrapy_redis.spiders import RedisCrawlSpider
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import Rule
+from scrapy import Request
+
+from spider.items import VideoItem, BiliItem
+
+
+class BiliSpider(RedisCrawlSpider):
+    name = 'Bili'
+    redis_key = 'Bili'
+
+    rules = [
+        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/.*?"), callback='parse_Item', follow=True),
+        Rule(LinkExtractor(allow=r"https://www.bilibili.com/video/BV.*?"), callback='parse_Videoitem', follow=True),
+        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/media/md.*?"), callback='parse_BiliItem',
+             follow=True),
+    ]
+
+    def parse_Videoitem(self, response, **kwargs):
+        Video_item = VideoItem()
+        Video_item['title'] = response.xpath('//*[@id="viewbox_report"]/h1/@title').extract()[0]
+        Video_item['view_counts'] = str(
+            response.xpath('//*[@id="viewbox_report"]/div/div/span[1]/@title').extract()[0]).replace("总播放数", "")
+        Video_item['barrage'] = str(
+            response.xpath('//*[@id="viewbox_report"]/div/div/span[2]/@title').extract()[0]).replace(
+            "历史累计弹幕数", "")
+        Video_item['up'] = str(response.xpath('//*[@id="v_upinfo"]/div[2]/div[1]/a[1]/text()').extract()[0]).replace(
+            "\\n",
+            "").strip()
+        yield Video_item
+
+    def parse_BiliItem(self, response, **kwargs):
+        bangumi_item = BiliItem()
+        bangumi_item['title'] = response.xpath(
+            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()').extract()[0]
+        bangumi_item['view_counts'] = response.xpath(
+            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()').extract()[0]
+        bangumi_item['attention'] = response.xpath(
+            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()').extract()[0]
+        bangumi_item['barrage'] = response.xpath(
+            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()').extract()[0]
+        bangumi_item['evaluate'] = response.xpath(
+            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/text()').extract()[0]
+        yield bangumi_item
+
+    def parse_Item(self, response, **kwargs):
+        url = 'https:' + response.xpath('//*[@id="media_module"]/div/a/@href').extract()[0]
+        yield Request(url=url, callback=self.parse_BiliItem)
+
+
+
diff --git a/Bilispider _redis/spider/spiders/__init__.py b/Bilispider _redis/spider/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/Bilispider _redis/spider/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
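Because BiliSpider above is a RedisCrawlSpider, `scrapy crawl Bili` starts up idle and waits for start URLs to appear under its redis_key. A crawl is kicked off by pushing a seed URL into that key, for example with redis-py against the same Redis instance the scheduler uses (connection details assumed):

import redis

r = redis.Redis(host='127.0.0.1', port=6379)
r.lpush('Bili', 'https://www.bilibili.com/v/popular/rank/all')  # seed the 'Bili' redis_key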
diff --git a/Bilispider _redis/spider/spiders/__pycache__/Bili.cpython-310.pyc b/Bilispider _redis/spider/spiders/__pycache__/Bili.cpython-310.pyc
new file mode 100644
index 0000000..92365cf
Binary files /dev/null and b/Bilispider _redis/spider/spiders/__pycache__/Bili.cpython-310.pyc differ
diff --git a/Bilispider _redis/spider/spiders/__pycache__/__init__.cpython-310.pyc b/Bilispider _redis/spider/spiders/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..42f745b
Binary files /dev/null and b/Bilispider _redis/spider/spiders/__pycache__/__init__.cpython-310.pyc differ
diff --git a/Bilispider _redis/spider/spiders/bili.csv b/Bilispider _redis/spider/spiders/bili.csv
new file mode 100644
index 0000000..e69de29
diff --git a/Bilispider _redis/spider/utils.py b/Bilispider _redis/spider/utils.py
new file mode 100644
index 0000000..e921b83
--- /dev/null
+++ b/Bilispider _redis/spider/utils.py
@@ -0,0 +1,24 @@
+import json
+
+from selenium import webdriver
+
+
+def create_chrome_driver(*, headless=False):
+    options = webdriver.ChromeOptions()
+    if headless:
+        options.add_argument('--headless')  # do not show a browser window
+    options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the automation hint
+    options.add_experimental_option('useAutomationExtension', False)
+    browser = webdriver.Chrome(options=options, executable_path=r'chromedriver.exe')
+    browser.execute_cdp_cmd(
+        'Page.addScriptToEvaluateOnNewDocument',
+        {'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'}  # mask the webdriver flag
+    )
+    return browser
+
+
+def add_cookies(browser, cookie_file):
+    with open(cookie_file, 'r') as file:
+        cookies_list = json.loads(file.read())
+    for cookie_dict in cookies_list:
+        browser.add_cookie(cookie_dict)
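utils.py still passes executable_path to webdriver.Chrome, which Selenium 4 deprecated and later removed, while cookie.py already uses the Service wrapper. On a current Selenium release, create_chrome_driver would need the same treatment; a sketch, keeping the chromedriver.exe path assumption from utils.py:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def create_chrome_driver_v4(*, headless=False):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    service = Service(r'chromedriver.exe')  # path carried over from utils.py
    return webdriver.Chrome(service=service, options=options)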