"""Bilibili danmaku (bullet-comment) crawler and word-cloud generator.

Recovered from the deletion patch 8fdee85a5a27dc59e2e1f5efc43b53ef85af5268
("Delete 'b站弹幕爬虫.py'").

Pipeline:
  1. Walk the first ten Bilibili search-result pages for a fixed keyword and
     collect the video ids (bvid) found in each page.
  2. Open every video page, extract its cid value(s), download the danmaku
     list for each cid and append the comment texts to ``DANMAKU_FILE``.
  3. Segment the collected text with jieba and render a masked word cloud.
"""
import html
import re

# Browser-like request headers shared by every HTTP request of the script.
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}
# Seconds to wait for a response; without it one stalled connection would
# block the whole crawl forever.
REQUEST_TIMEOUT = 10

# Search-result URL; the keyword is the URL-encoded form of "2024巴黎奥运会".
SEARCH_URL = "https://search.bilibili.com/all?vt=93020172&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5"
VIDEO_URL_TEMPLATE = "https://www.bilibili.com/video/{bvid}/?spm_id_from=333.337.search-card.all.click&vd_source=516714ff716c382225c801afa2c87d8d"
DANMAKU_URL_TEMPLATE = "https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"

DANMAKU_FILE = '总弹幕.txt'
STOPWORDS_FILE = "中文常见停用词.text"
MASK_IMAGE = '地球.png'
WORDCLOUD_FILE = '词云图.png'

# Each danmaku is expected as <d p="...metadata...">text</d>.  The bare
# pattern '(.*?)' only ever yields empty strings, so the element delimiters
# are required to capture any text at all.
DANMAKU_RE = re.compile(r'<d p=".*?">(.*?)</d>')
BVID_RE = re.compile(r'bvid:"(.*?)",title:')
CID_RE = re.compile(r'"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),')


def parse_danmaku(xml_text):
    """Return the comment texts contained in a danmaku XML document.

    XML character entities (``&amp;``, ``&lt;`` ...) are converted back to
    the characters they stand for.  Returns an empty list when nothing
    matches.
    """
    return [html.unescape(text) for text in DANMAKU_RE.findall(xml_text)]


def extract_bvids(search_html):
    """Return every bvid string found in a search-result page."""
    return BVID_RE.findall(search_html)


def extract_cids(video_html):
    """Return every cid value (as a string) found in a video page."""
    return CID_RE.findall(video_html)


def build_search_url(page):
    """Return the search URL for a 1-based result page.

    Page 1 is requested without a ``page`` parameter, as in the original
    script; later pages append ``&page=N``.
    """
    if page == 1:
        return SEARCH_URL
    return f"{SEARCH_URL}&page={page}"


def fetch_text(url, encoding=None):
    """GET *url* with the shared headers and return the response body as text.

    When *encoding* is given it overrides the encoding requests would
    otherwise pick for decoding the body.
    """
    # Imported lazily so the parsing helpers above stay importable in an
    # environment that does not have requests installed.
    import requests

    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if encoding is not None:
        resp.encoding = encoding
    return resp.text


def huoqudanmu(cid):
    """Download the danmaku for *cid* and append them to ``DANMAKU_FILE``.

    The texts are printed, written one per line (UTF-8, append mode) and
    returned as a list.  The output file is not touched when no danmaku
    were found.
    """
    xml_text = fetch_text(DANMAKU_URL_TEMPLATE.format(cid=cid), encoding="utf-8")
    context = parse_danmaku(xml_text)
    print(context)
    if context:
        # Open the file once per call instead of once per comment.
        with open(DANMAKU_FILE, mode='a', encoding='utf-8') as f:
            f.writelines(f"{line}\n" for line in context)
    return context


def crawl(pages=range(1, 11)):
    """Collect the danmaku of every video listed on the given search pages."""
    for page in pages:
        search_html = fetch_text(build_search_url(page))
        for bvid in extract_bvids(search_html):
            video_html = fetch_text(VIDEO_URL_TEMPLATE.format(bvid=bvid.strip()))
            for cid in extract_cids(video_html):
                huoqudanmu(cid)


def make_wordcloud():
    """Render the word cloud image from the collected danmaku text."""
    # Third-party packages are only needed for this step.
    import imageio
    import jieba
    import wordcloud

    with open(DANMAKU_FILE, encoding='utf-8') as f:
        text = f.read()
    # WordCloud splits on whitespace, so join the jieba tokens with spaces.
    text_str = ' '.join(jieba.lcut(text))

    with open(STOPWORDS_FILE, encoding='utf-8') as f:
        stopwords = set(f.read().splitlines())  # one stop word per line

    mask = imageio.v2.imread(MASK_IMAGE)  # picture that shapes the cloud
    wc = wordcloud.WordCloud(
        width=1160,
        height=800,
        background_color='white',
        mask=mask,
        stopwords=stopwords,
        font_path='msyh.ttc',  # a font with CJK glyphs
    )
    wc.generate(text_str)
    wc.to_file(WORDCLOUD_FILE)


def main():
    """Run the crawl, then build the word cloud."""
    crawl()
    make_wordcloud()


if __name__ == "__main__":
    main()