parent
caefda24ca
commit
bcf3d9fe4c
@ -0,0 +1,195 @@
|
||||
import requests
|
||||
import re
|
||||
import jieba
|
||||
from collections import Counter
|
||||
from wordcloud import WordCloud
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class SimpleDanmuAnalyzer:
    """Fetch, filter, and analyze Bilibili danmu (bullet comments).

    Workflow: collect danmu text for one or more videos via the public
    Bilibili web APIs, strip low-information noise, run jieba word-frequency
    analysis, render a word cloud, and export the results to Excel.
    """

    # Bilibili's API tends to reject clients without a browser User-Agent,
    # and an explicit timeout keeps a dead connection from hanging forever.
    _HEADERS = {'User-Agent': 'Mozilla/5.0'}
    _TIMEOUT = 10  # seconds

    def __init__(self):
        # Accumulated danmu strings across all fetched videos.
        self.danmu_list = []
        # Filler phrases to drop before analysis (spam/meme noise).
        self.noise_words = ['666', '哈哈哈', '233', '点赞', '关注', '来了']

    def get_danmu(self, bvid):
        """Fetch the danmu of one video (by BV id) into ``self.danmu_list``.

        Failures (network, bad JSON, API error) are reported and swallowed
        so one bad video does not abort a batch run.

        :param bvid: Bilibili BV video id, e.g. ``'BV1fp4y1q7E9'``.
        """
        try:
            print(f"正在获取视频 {bvid} 的弹幕...")

            # Step 1: resolve the BV id to the internal cid via the
            # video-info endpoint.
            info_url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
            info_response = requests.get(
                info_url, headers=self._HEADERS, timeout=self._TIMEOUT)
            info_data = info_response.json()

            if info_data['code'] != 0:
                print(f"获取视频信息失败: {info_data.get('message', '未知错误')}")
                return

            cid = info_data['data']['cid']

            # Step 2: the danmu endpoint returns XML; force UTF-8 so the
            # Chinese text decodes correctly regardless of reported charset.
            danmu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
            response = requests.get(
                danmu_url, headers=self._HEADERS, timeout=self._TIMEOUT)
            response.encoding = 'utf-8'

            # Each danmu is wrapped in <d p="...">text</d>.
            danmus = re.findall(r'<d p=".*?">(.*?)</d>', response.text)

            # Keep only danmu containing none of the noise phrases.
            filtered_danmus = [
                danmu for danmu in danmus
                if not any(noise in danmu for noise in self.noise_words)
            ]

            self.danmu_list.extend(filtered_danmus)
            print(f"获取到 {len(filtered_danmus)} 条有效弹幕")

        except Exception as e:
            # Best-effort: report and continue with the next video.
            print(f"获取弹幕失败: {e}")

    def analyze_words(self, top_n=8):
        """Return the ``top_n`` most common words as ``(word, count)`` pairs.

        Uses jieba segmentation over all collected danmu; single-character
        tokens (and the join spaces) are discarded as low-signal.

        :returns: list of ``(word, count)``, empty if no danmu collected.
        """
        if not self.danmu_list:
            print("没有弹幕数据可供分析")
            return []

        # Merge all danmu into one text and segment it.
        text = ' '.join(self.danmu_list)
        words = jieba.cut(text)

        # Drop one-character tokens (also removes the joining spaces).
        filtered_words = [word for word in words if len(word) > 1]

        word_count = Counter(filtered_words)
        return word_count.most_common(top_n)

    def make_wordcloud(self, filename='wordcloud.png'):
        """Render a word cloud of the collected danmu and save it to disk.

        :param filename: output image path (PNG).
        """
        if not self.danmu_list:
            print("没有弹幕数据生成词云")
            return

        text = ' '.join(self.danmu_list)

        try:
            wc = WordCloud(
                font_path='simhei.ttf',  # a Chinese font must be installed
                width=800,
                height=600,
                background_color='white',
                max_words=100
            ).generate(text)

            # Display and persist the figure.
            plt.figure(figsize=(10, 8))
            plt.imshow(wc)
            plt.axis('off')
            plt.title('弹幕词云图')
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            # Fixed: the message previously lacked the filename placeholder.
            print(f"词云图已保存为: {filename}")

        except Exception as e:
            print(f"生成词云失败: {e}")
            print("请确保系统中安装了中文字体")

    def save_to_excel(self, filename='result.xlsx'):
        """Save raw danmu and the top-8 word frequencies to an Excel file.

        :param filename: output workbook path (.xlsx).
        """
        if not self.danmu_list:
            print("没有数据可保存")
            return

        try:
            # Sheet 1: raw danmu text.
            df_danmu = pd.DataFrame(self.danmu_list, columns=['弹幕内容'])

            # Sheet 2: word-frequency ranking.
            top_words = self.analyze_words(8)
            df_words = pd.DataFrame(top_words, columns=['词语', '出现次数'])

            with pd.ExcelWriter(filename) as writer:
                df_danmu.to_excel(writer, sheet_name='弹幕数据', index=False)
                df_words.to_excel(writer, sheet_name='词频统计', index=False)

            # Fixed: the message previously lacked the filename placeholder.
            print(f"数据已保存到: {filename}")

        except Exception as e:
            print(f"保存Excel失败: {e}")

    def get_conclusions(self):
        """Return a short multi-line Chinese summary of the analysis.

        :returns: summary string (total count + top-5 words), or an
            explanatory message when no data was collected.
        """
        if not self.danmu_list:
            return "没有足够数据进行分析"

        total = len(self.danmu_list)
        top_words = self.analyze_words(5)

        conclusions = []
        conclusions.append(f"共分析 {total} 条弹幕")
        conclusions.append("高频词TOP5:")
        for word, count in top_words:
            conclusions.append(f"  - {word}: {count}次")

        return '\n'.join(conclusions)
|
||||
|
||||
|
||||
def main():
    """Drive the full pipeline: fetch, analyze, visualize, export, summarize."""
    app = SimpleDanmuAnalyzer()

    # BV ids of LLM-related videos; swap in any videos you want analyzed.
    bv_ids = (
        'BV1fp4y1q7E9',  # intro to large language models
        'BV1nV41127AV',  # LLM application cases
        'BV1Ru41127XB',  # large-model technology deep dive
    )

    print("开始获取弹幕数据...")
    for bv in bv_ids:
        app.get_danmu(bv)

    if not app.danmu_list:
        # Fall back to canned danmu so the demo still produces output.
        print("没有获取到弹幕数据,使用示例数据演示")
        app.danmu_list = [
            '大语言模型很强大',
            'AI改变世界',
            '机器学习很有趣',
            '深度学习技术',
            '自然语言处理',
            '大模型应用广泛',
            '人工智能未来',
            'LLM发展很快',
            '智能助手很方便',
            '代码生成很实用',
        ]

    print("\n进行词频分析...")
    ranking = app.analyze_words(8)
    print("高频词TOP8:")
    for rank, (word, count) in enumerate(ranking, 1):
        print(f"{rank}. {word}: {count}次")

    print("\n生成词云图...")
    app.make_wordcloud()

    print("\n保存数据到Excel...")
    app.save_to_excel()

    print("\n分析结论:")
    print(app.get_conclusions())

    print("\n任务完成!")
|
||||
|
||||
|
||||
# Run the demo pipeline only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in new issue