Update danmu.py

main
pkf2ew7r5 11 months ago
parent 46bec89339
commit 3de1cd7a90

@@ -1,120 +1,124 @@
import re
import requests
import json
from wordcloud import WordCloud
from collections import Counter
import pandas as pd
import jieba

# Request headers used to get past Bilibili's anti-crawler checks
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "Referer": "https://www.bilibili.com/video/BV1ji421r7PB/?spm_id_from=333.337.search-card.all.click&vd_source=6d043574d81a3f216c41cc281767c59e",
    "Cookie": "buvid4=F793982E-B1BE-6615-D504-4A3BF6DC079274572-022081216-zrNi2ZH69Dhc8fOw09IIaA%3D%3D; buvid_fp_plain=undefined; DedeUserID=254629681; DedeUserID__ckMd5=23182279794f6f43; header_theme_version=CLOSE; enable_web_push=DISABLE; rpdid=0zbfVHb84m|bujW4qg1|H3r|3w1RkXNz; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; hit-dyn-v2=1; CURRENT_FNVAL=4048; LIVE_BUVID=AUTO3317219069887705; buvid3=575DAC67-0FE1-B3C3-8A9F-32190F231A1F24035infoc; b_nut=1723377925; _uuid=24521A4B-895E-10E6A-2555-A9EF812A1AC136123infoc; fingerprint=45b7e451fad392b858fc6e95bbb4231d; PVID=1; SESSDATA=7bb24682%2C1741874111%2Cf9490%2A92CjB3NDHhJCIr34LBlg2TTS7vPGQ72Kh3s3DVcaEn0JkQ_hXc7q6MgsK9Wjb-Z6NTf68SVnlnTkJQZmtmRXMxZnc1RDZpYUFnS1BlTmptUklUSkcyQUNxaWg4Smpla1VCZHdkLVc4dW9OWU4zZm1PaVhuMzFtcnNvQ2JMQzFiTFVleC1HbXc0Sm53IIEC; bili_jct=895f9e091815ee492e57345dc62b8757; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1ODEzMjMsImlhdCI6MTcyNjMyMjA2MywicGx0IjotMX0.47Js9n97GQecalPWdc-MNrDVUOz2-FfY_DlB72DUlZM; bili_ticket_expires=1726581263; CURRENT_QUALITY=32; bp_t_offset_254629681=977402825627664384; buvid_fp=45b7e451fad392b858fc6e95bbb4231d; home_feed_column=5; browser_resolution=1528-704; b_lsid=18C6BCCB_191F83A85C1"
}
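# Hedged aside, not part of the original flow: the same headers could be attached
# to a requests.Session once so every later call reuses them (and the underlying
# connection). `session` is an illustrative name; the functions below still pass
# headers=headers explicitly and do not use this object.
session = requests.Session()
session.headers.update(headers)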
def get_bvlist():
    # Collect BV ids from the search results
    bvlist = []
    for page in range(1, 11):
        # Search URL for the keyword "2024巴黎奥运会" (2024 Paris Olympics)
        url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page={page}&page_size=30&pubtime_begin_s=0&pubtime_end_s=0&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
        # Send the request
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        # Pull the BV ids out of the response with a regular expression
        tem_bvlist = re.findall('"bvid":"(.*?)"', response.text)
        bvlist.extend(list(set(tem_bvlist)))
    print("BV ids fetched")
    return bvlist
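# Hedged sketch, assuming the search endpoint may occasionally rate-limit: a
# small retry wrapper around requests.get with exponential backoff.
# `fetch_with_retry` and `max_retries` are hypothetical names, and nothing in
# the original pipeline calls this helper.
import time

def fetch_with_retry(url, max_retries=3):
    for attempt in range(max_retries):
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            return resp
        time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between attempts
    resp.raise_for_status()  # surface the last failure if every retry missed
    return resp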
def get_cidlist(bvlist):
    # Look up the cid that corresponds to each BV id
    cidlist = []
    for bvid in bvlist:
        # URL for fetching the cid
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        # Send the request
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        # Extract the cid from the JSON payload
        json_dict = json.loads(res.text)
        cidlist.append(json_dict["data"][0]["cid"])
    print("cids fetched")
    return cidlist
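# Note: json_dict["data"] above holds one entry per video part, and get_cidlist
# keeps only the first part's cid. A hedged sketch that collects every part's
# cid instead (`get_all_cids` is a hypothetical helper, not called anywhere):
def get_all_cids(bvid):
    url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    # each page dict in "data" carries its own cid
    return [page["cid"] for page in json.loads(res.text)["data"]]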
def get_danmu(cidlist):
    # Download the danmu for each cid
    count = 1
    for cid in cidlist:
        # URL hosting the danmu XML
        url = f'http://comment.bilibili.com/{cid}.xml'
        # Send the request
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        # Pull the danmu text out of the XML with a regular expression
        content_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
        # Open the output file once per video rather than once per danmu
        with open('弹幕.txt', 'a', encoding='utf-8') as f:
            for danmu in content_list:
                f.write(danmu + '\n')
        print(f'---------- Danmu for video {count} fetched! ----------')
        count += 1
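# The p attribute of each <d> element is a comma-separated metadata string whose
# first field is the danmu's offset into the video in seconds. A hedged sketch
# that keeps that timestamp alongside the text (`parse_danmu_with_time` is a
# hypothetical helper, not used by the pipeline above):
def parse_danmu_with_time(xml_text):
    pairs = re.findall('<d p="(.*?)">(.*?)</d>', xml_text)
    return [(float(p.split(',')[0]), text) for p, text in pairs]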
def get_ai_related_danmu():
    # Collect the danmu related to AI applications
    ai_related_danmu = []
    # AI-related keywords (kept in Chinese so they match the danmu text)
    ai_keywords = {'ai', 'ai生成', 'ai合成', 'ai配音', 'gpt', '人工智能', '机器学习', '深度学习', '神经网络',
                   '自然语言处理', '计算机视觉', '语音识别', '人脸识别', '无人驾驶'}
    # Read the danmu file
    with open('弹幕.txt', 'r', encoding='utf-8') as file:
        danmu_lines = file.readlines()
    # Strip newlines and surrounding whitespace to simplify counting
    danmu_cleaned = [line.strip() for line in danmu_lines]
    # Keep only the danmu related to AI applications
    for danmu in danmu_cleaned:
        if any(keyword in danmu.lower() for keyword in ai_keywords):
            # Danmu that contain the "ai" keyword but are not actually about AI
            # applications need a separate check
            if 'ai' in danmu.lower() and re.search(r'(?<![\u4e00-\u9fa5])[aiAI](?![\u4e00-\u9fa5])', danmu):
                continue
            ai_related_danmu.append(danmu)
    # Count how often each distinct danmu occurs
    danmu_count = Counter(ai_related_danmu)
    # Take the eight most frequent danmu
    top_8_danmu = danmu_count.most_common(8)
    # Store the result in a DataFrame
    df = pd.DataFrame(top_8_danmu, columns=['弹幕内容', '数量'])
    # Write the DataFrame to an Excel file
    df.to_excel('AI弹幕统计.xlsx', index=False)
    return ai_related_danmu
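# Hedged alternative to the false-positive check above: Python's \b treats
# Chinese characters as word characters, so r'\bai\b' matches "ai" only when it
# stands alone, not when it is embedded in a longer Latin or Chinese token.
# `is_standalone_ai` is an illustrative helper, not part of the original logic.
def is_standalone_ai(danmu):
    return re.search(r'\bai\b', danmu, re.IGNORECASE) is not None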
def get_wordcloud(ai_danmu):
    # First join every danmu in the list into one long string, separated by spaces
    ai = ' '.join(ai_danmu)
    # Segment the joined string with jieba, then rejoin the tokens with spaces
    cut_t = ' '.join(jieba.cut(ai))
    # Load stop words to cut down on low-information tokens
    stop_words = set()
    with open('停用词.txt', 'r', encoding='utf-8') as f:
        stop_words.update(line.strip() for line in f)
    # Create a WordCloud object and configure the rendering
    wordcloud = WordCloud(background_color="white",  # white background
                          width=800,                 # image width
                          height=600,                # image height
                          max_words=150,             # maximum number of words shown
                          max_font_size=80,          # maximum font size
                          font_path='msyh.ttc',      # font path, needed to render Chinese
                          contour_width=2,           # outline width
                          contour_color='steelblue', # outline color
                          repeat=True,               # allow words to repeat
                          stopwords=stop_words,      # apply the stop-word list
                          ).generate(cut_t)
    # Save the generated word cloud as a PNG file
    wordcloud.to_file('词云图.png')

if __name__ == '__main__':
    bvlist = get_bvlist()
    cidlist = get_cidlist(bvlist)
    get_danmu(cidlist)
    ai_danmu = get_ai_related_danmu()
    get_wordcloud(ai_danmu)