Delete 'danmu.py'

main
pkf2ew7r5 11 months ago
parent 1c34280e4b
commit 05fd55c11b

@ -1,119 +0,0 @@
import re
import requests
import json
from wordcloud import WordCloud
from collections import Counter
import pandas as pd
import jieba
# Shared HTTP request headers used to get past Bilibili's anti-crawler checks
# (a browser User-Agent, a plausible Referer, and a logged-in session Cookie).
# NOTE(review): the Cookie embeds live SESSDATA/bili_jct session credentials —
# they expire and should not be committed to source control; move to config.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "Referer": "https://www.bilibili.com/video/BV1ji421r7PB/?spm_id_from=333.337.search-card.all.click&vd_source=6d043574d81a3f216c41cc281767c59e",
    "Cookie": "buvid4=F793982E-B1BE-6615-D504-4A3BF6DC079274572-022081216-zrNi2ZH69Dhc8fOw09IIaA%3D%3D; buvid_fp_plain=undefined; DedeUserID=254629681; DedeUserID__ckMd5=23182279794f6f43; header_theme_version=CLOSE; enable_web_push=DISABLE; rpdid=0zbfVHb84m|bujW4qg1|H3r|3w1RkXNz; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; hit-dyn-v2=1; CURRENT_FNVAL=4048; LIVE_BUVID=AUTO3317219069887705; buvid3=575DAC67-0FE1-B3C3-8A9F-32190F231A1F24035infoc; b_nut=1723377925; _uuid=24521A4B-895E-10E6A-2555-A9EF812A1AC136123infoc; fingerprint=45b7e451fad392b858fc6e95bbb4231d; PVID=1; SESSDATA=7bb24682%2C1741874111%2Cf9490%2A92CjB3NDHhJCIr34LBlg2TTS7vPGQ72Kh3s3DVcaEn0JkQ_hXc7q6MgsK9Wjb-Z6NTf68SVnlnTkJQZmtmRXMxZnc1RDZpYUFnS1BlTmptUklUSkcyQUNxaWg4Smpla1VCZHdkLVc4dW9OWU4zZm1PaVhuMzFtcnNvQ2JMQzFiTFVleC1HbXc0Sm53IIEC; bili_jct=895f9e091815ee492e57345dc62b8757; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1ODEzMjMsImlhdCI6MTcyNjMyMjA2MywicGx0IjotMX0.47Js9n97GQecalPWdc-MNrDVUOz2-FfY_DlB72DUlZM; bili_ticket_expires=1726581263; CURRENT_QUALITY=32; bp_t_offset_254629681=977402825627664384; buvid_fp=45b7e451fad392b858fc6e95bbb4231d; home_feed_column=5; browser_resolution=1528-704; b_lsid=18C6BCCB_191F83A85C1"
}
def get_bvlist():
    """Collect video BV ids for the search keyword "2024巴黎奥运会".

    Walks the first 10 pages of Bilibili's web search API and extracts
    every ``bvid`` field from the raw JSON response text with a regex.

    Returns:
        list[str]: BV ids, de-duplicated across all pages, in first-seen
        order (the original de-duplicated only within a single page and
        via ``set()`` lost ordering).
    """
    bvlist = []
    seen = set()  # track ids already emitted so duplicates across pages are dropped
    for page in range(1, 11):
        # Search-API URL; keyword is the URL-encoded "2024巴黎奥运会".
        url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page={page}&page_size=30&pubtime_begin_s=0&pubtime_end_s=0&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&#39'
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'  # fix: original had the typo 'urf-8'
        # Pull every BV id out of the JSON payload without parsing it.
        for bvid in re.findall('"bvid":"(.*?)"', response.text):
            if bvid not in seen:
                seen.add(bvid)
                bvlist.append(bvid)
    print("BV号获取完成")
    return bvlist
def get_cidlist(bvlist):
    """Resolve each BV id to the cid of its first part.

    Queries the player pagelist API per BV id and takes the first
    entry's ``cid`` (``data[0]["cid"]``).

    Args:
        bvlist: iterable of BV id strings (from ``get_bvlist``).

    Returns:
        list: one cid per BV id, in input order.
    """
    cidlist = []
    for bvid in bvlist:
        # Pagelist endpoint that maps a BV id to its parts (cids).
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'  # fix: original had the typo 'urf-8'
        # Parse the JSON body and keep only the first part's cid.
        json_dict = json.loads(res.text)
        cidlist.append(json_dict["data"][0]["cid"])
    print("cid获取完成")
    return cidlist
def get_danmu(cidlist):
    """Download the danmaku XML for every cid and append the text to 弹幕.txt.

    Args:
        cidlist: iterable of cids (from ``get_cidlist``).

    Side effects:
        Appends one danmaku per line to '弹幕.txt' (UTF-8) and prints a
        progress message per video.
    """
    for count, cid in enumerate(cidlist, start=1):
        # Public endpoint serving a video's danmaku as XML.
        url = f'http://comment.bilibili.com/{cid}.xml'
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        # Each danmaku is the text inside a <d p="..."> element.
        content_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
        # Fix: open the output file once per video instead of re-opening it
        # for every single danmaku line (the original opened it in the inner loop).
        with open('弹幕.txt', 'a', encoding='utf-8') as f:
            f.writelines(danmu + '\n' for danmu in content_list)
        print(f'----------第{count}个视频弹幕获取完毕!----------')
def get_ai_related_danmu():
    """Filter 弹幕.txt down to AI-related danmaku and tabulate the top 8.

    Reads '弹幕.txt' (written by ``get_danmu``), keeps lines containing any
    AI-related keyword, writes the 8 most frequent kept lines to
    'AI弹幕统计.xlsx', and returns the full kept list.

    Returns:
        list[str]: all danmaku lines judged AI-related, in file order.
    """
    ai_related_danmu = []
    # Keyword set used for the (case-insensitive) substring match below.
    ai_keywords = {'ai', 'ai生成', 'ai合成', 'ai配音', 'gpt', '人工智能', '机器学习', '深度学习', '神经网络',
                   '自然语言处理', '计算机视觉', '语音识别', '人脸识别', '无人驾驶'}
    # Load the danmaku collected earlier.
    with open('弹幕.txt', 'r', encoding='utf-8') as file:
        danmu_lines = file.readlines()
    # Strip newlines/surrounding whitespace so identical danmaku compare equal.
    danmu_cleaned = [line.strip() for line in danmu_lines]
    # Keep lines that contain any keyword, minus false 'ai' hits.
    for danmu in danmu_cleaned:
        if any(keyword in danmu.lower() for keyword in ai_keywords):
            # Exclusion pass: a line matched via bare 'ai' is dropped when the
            # regex finds an a/i/A/I character NOT adjacent to a CJK character
            # (i.e. 'ai' inside Latin text such as "wait" rather than mixed
            # into Chinese). NOTE(review): the class [aiAI] matches a single
            # letter, not the pair "ai" — possibly intended as (ai|AI); confirm.
            if 'ai' in danmu.lower() and re.search(r'(?<![\u4e00-\u9fa5])[aiAI](?![\u4e00-\u9fa5])', danmu):
                continue
            ai_related_danmu.append(danmu)
    # Frequency count of each distinct kept danmaku.
    danmu_count = Counter(ai_related_danmu)
    # Take the 8 most common (danmu, count) pairs.
    top_8_danmu = danmu_count.most_common(8)
    # Persist the ranking as a two-column Excel sheet.
    df = pd.DataFrame(top_8_danmu, columns=['弹幕内容', '数量'])
    df.to_excel('AI弹幕统计.xlsx', index=False)
    return(ai_related_danmu)
def get_wordcloud(ai_danmu):
    """Render the AI-related danmaku list as a word cloud saved to 词云图.png.

    Args:
        ai_danmu: list of danmaku strings (from ``get_ai_related_danmu``).

    Side effects:
        Writes the rendered image to '词云图.png'.
    """
    # Join all danmaku into one text, segment it with jieba, and re-join the
    # tokens with spaces — WordCloud tokenizes its input on whitespace.
    tokenized = ' '.join(jieba.cut(' '.join(ai_danmu)))
    cloud = WordCloud(
        background_color="white",   # white canvas
        width=800,                  # image width in pixels
        height=600,                 # image height in pixels
        max_words=100,              # cap on rendered words
        max_font_size=80,           # largest font size used
        font_path='msyh.ttc',       # font with CJK glyphs so Chinese renders
        contour_width=2,            # outline thickness
        contour_color='steelblue',  # outline color
    )
    cloud.generate(tokenized)
    # Save the rendered cloud as a PNG file.
    cloud.to_file('词云图.png')
if __name__ == '__main__':
    # Pipeline: search results -> BV ids -> cids -> raw danmaku file ->
    # AI-related subset (plus Excel stats) -> word-cloud image.
    bvlist = get_bvlist()
    cidlist = get_cidlist(bvlist)
    get_danmu(cidlist)
    ai_danmu = get_ai_related_danmu()
    get_wordcloud(ai_danmu)
Loading…
Cancel
Save