You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
pai/b站弹幕爬虫.py

63 lines
2.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
# Part 1: define huoqudanmu, which downloads the danmaku (bullet comments) of one video
def huoqudanmu(cid):
    """Download the danmaku of one video and append them to '总弹幕.txt'.

    Args:
        cid: Identifier of the video part; it is interpolated into the
            ``oid`` query parameter of the danmaku list endpoint.

    Returns:
        None. Every comment found in the response is appended as one line to
        '总弹幕.txt' (UTF-8). If the request fails, a message is printed and
        nothing is written, so one bad video does not abort the whole crawl.
    """
    import html  # local import: keeps this block independent of the file's import section

    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    Hddf = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        resp = requests.get(url, headers=Hddf, timeout=10)
    except requests.RequestException as exc:
        print(f'Failed to fetch danmaku for cid {cid}: {exc}')
        return
    resp.encoding = "utf-8"  # decode the body as UTF-8 so Chinese text is read correctly
    Data = resp.text
    # Each comment is the text content of a <d p="..."> element; the text is
    # XML-escaped, so entities such as &amp; are decoded before saving.
    context = [html.unescape(item) for item in re.findall(r'<d p=.*?>(.*?)</d>', Data)]
    print(context)
    if not context:
        # Nothing matched: leave the output file untouched (and uncreated).
        return
    # Open the output file once per call instead of once per comment.
    with open('总弹幕.txt', mode='a', encoding='utf-8') as f:
        f.writelines(f'{index}\n' for index in context)
# Part 2: find the videos whose danmaku should be downloaded
headers0 = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}  # request headers (browser-like user agent)
# Search-results URL; the keyword is the URL-encoded form of "2024巴黎奥运会".
# Page 1 is requested without a page parameter, later pages append "&page=N".
search_base = "https://search.bilibili.com/all?vt=93020172&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5"
for page0 in range(1, 11):  # first ten result pages (the original author notes ~30 videos per page)
    url = search_base if page0 == 1 else f"{search_base}&page={page0}"
    try:
        # A timeout keeps one stalled request from hanging the whole crawl.
        res0 = requests.get(url, headers=headers0, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to fetch search page {page0}: {exc}")
        continue
    # Pull every video's bvid out of the data embedded in the search page.
    bvid = re.findall(r'bvid:"(.*?)",title:', res0.text)
    for bvid1 in bvid:
        video_url = f"https://www.bilibili.com/video/{bvid1.strip()}/?spm_id_from=333.337.search-card.all.click&vd_source=516714ff716c382225c801afa2c87d8d"
        try:
            video_res = requests.get(video_url, headers=headers0, timeout=10)
        except requests.RequestException as exc:
            print(f"Failed to fetch video page {bvid1}: {exc}")
            continue
        # The cid sits in the page's embedded "embedPlayer" JSON. The pattern
        # contains no spaces on purpose: the original author found it does not
        # match otherwise.
        oid = re.findall(r'"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),', video_res.text)
        for oid1 in oid:
            huoqudanmu(oid1)  # download and store this video's danmaku
# Part 3: render the collected danmaku as a word cloud
import jieba
import wordcloud
import imageio
# 1. Read the collected danmaku (the file is closed as soon as it has been read)
with open("总弹幕.txt", encoding='utf-8') as f:
    text = f.read()
# 2. Segment the text into words and join them with spaces for WordCloud
text_list = jieba.lcut(text)
text_str = ' '.join(text_list)
# 3. Build the word cloud
# Stop-word list, one word per line
with open("中文常见停用词.text", encoding='utf-8') as f2:
    text2 = f2.read().splitlines()
img = imageio.v2.imread('地球.png')  # image used as the shape mask of the cloud
wc = wordcloud.WordCloud(
    # NOTE(review): the wordcloud docs state that width/height are ignored
    # when a mask is given; they are kept here unchanged.
    width=1160,
    height=800,
    background_color='white',
    mask=img,  # cloud shape
    stopwords=set(text2),  # words to exclude; the parameter is documented as a set
    font_path='msyh.ttc',  # font file used to render the text
)
wc.generate(text_str)
wc.to_file('词云图.png')