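"""Simple Bilibili danmu (bullet-comment) analyzer.

Fetches danmu for a list of videos via Bilibili's public web API, runs a
basic word-frequency analysis with jieba, renders a word cloud, and exports
the results to an Excel workbook.

Third-party packages used: requests, jieba, wordcloud, matplotlib, pandas
(pandas' .xlsx export typically also needs openpyxl installed).
"""
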
import requests
import re
import jieba
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd


class SimpleDanmuAnalyzer:

    def __init__(self):
        self.danmu_list = []
        # Common filler danmu ("666", laughter, "like", "follow", "I'm here");
        # kept in Chinese because they are matched against the raw danmu text.
        self.noise_words = ['666', '哈哈哈', '233', '点赞', '关注', '来了']
        # Browser-like User-Agent; Bilibili's web API tends to reject the
        # default python-requests one.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

    def get_danmu(self, bvid):
        """Fetch the danmu for a single video, identified by its BV id."""
        try:
            print(f"Fetching danmu for video {bvid}...")

            # Look up the video's cid (the id the danmu API is keyed on).
            info_url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
            info_response = requests.get(info_url, headers=self.headers, timeout=10)
            info_data = info_response.json()

            if info_data['code'] != 0:
                print(f"Failed to get video info: {info_data.get('message', 'unknown error')}")
                return

            cid = info_data['data']['cid']

            # Fetch the danmu XML for that cid.
            danmu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
            response = requests.get(danmu_url, headers=self.headers, timeout=10)
            response.encoding = 'utf-8'

            # Each danmu sits inside a <d p="..."> ... </d> element, so a
            # non-greedy regex is enough to pull out the text.
            danmus = re.findall(r'<d p=".*?">(.*?)</d>', response.text)

            # Drop danmu that contain any of the noise words.
            filtered_danmus = []
            for danmu in danmus:
                if not any(noise in danmu for noise in self.noise_words):
                    filtered_danmus.append(danmu)

            self.danmu_list.extend(filtered_danmus)
            print(f"Got {len(filtered_danmus)} usable danmu")

        except Exception as e:
            print(f"Failed to fetch danmu: {e}")

    def analyze_words(self, top_n=8):
        """Basic word-frequency analysis over the collected danmu."""
        if not self.danmu_list:
            print("No danmu data to analyze")
            return []

        # Merge all danmu into one string.
        text = ' '.join(self.danmu_list)

        # Segment the Chinese text with jieba.
        words = jieba.cut(text)

        # Keep only tokens longer than one character; this also drops most
        # single-character stopwords and punctuation.
        filtered_words = [word for word in words if len(word) > 1]

        # Count occurrences and return the top_n most common words.
        word_count = Counter(filtered_words)
        return word_count.most_common(top_n)

    def make_wordcloud(self, filename='wordcloud.png'):
        """Generate and save a word cloud from the collected danmu."""
        if not self.danmu_list:
            print("No danmu data for the word cloud")
            return

        # Segment with jieba first so the cloud is built from words rather
        # than whole danmu lines (WordCloud's built-in tokenizer does not
        # segment Chinese).
        text = ' '.join(jieba.cut(' '.join(self.danmu_list)))

        try:
            # Build the word cloud. A Chinese font is required, otherwise the
            # characters render as empty boxes; simhei.ttf must be available
            # on the system (adjust the path if it lives elsewhere).
            wc = WordCloud(
                font_path='simhei.ttf',
                width=800,
                height=600,
                background_color='white',
                max_words=100
            ).generate(text)

            # Display and save the word cloud.
            plt.figure(figsize=(10, 8))
            plt.imshow(wc)
            plt.axis('off')
            plt.title('Danmu Word Cloud')
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            print(f"Word cloud saved to: {filename}")

        except Exception as e:
            print(f"Failed to generate word cloud: {e}")
            print("Make sure a Chinese font is installed on the system")

    def save_to_excel(self, filename='result.xlsx'):
        """Save the danmu and the word-frequency table to an Excel file."""
        if not self.danmu_list:
            print("No data to save")
            return

        try:
            # Raw danmu.
            df_danmu = pd.DataFrame(self.danmu_list, columns=['Danmu'])

            # Word-frequency table.
            top_words = self.analyze_words(8)
            df_words = pd.DataFrame(top_words, columns=['Word', 'Count'])

            # Write both sheets (pandas uses openpyxl for .xlsx output).
            with pd.ExcelWriter(filename) as writer:
                df_danmu.to_excel(writer, sheet_name='Danmu', index=False)
                df_words.to_excel(writer, sheet_name='Word Frequency', index=False)

            print(f"Data saved to: {filename}")

        except Exception as e:
            print(f"Failed to save Excel file: {e}")

    def get_conclusions(self):
        """Summarize the analysis in a few lines of text."""
        if not self.danmu_list:
            return "Not enough data to analyze"

        total = len(self.danmu_list)
        top_words = self.analyze_words(5)

        conclusions = []
        conclusions.append(f"Analyzed {total} danmu in total")
        conclusions.append("Top 5 words:")
        for word, count in top_words:
            conclusions.append(f"  - {word}: {count} times")

        return '\n'.join(conclusions)


def main():
    """Entry point: fetch, analyze, visualize, and export."""
    analyzer = SimpleDanmuAnalyzer()

    # Replace these with the BV ids of the videos you want to analyze.
    video_list = [
        'BV1fp4y1q7E9',  # introduction to large language models
        'BV1nV41127AV',  # LLM application examples
        'BV1Ru41127XB',  # large-model technology breakdown
    ]

    print("Fetching danmu data...")
    for bvid in video_list:
        analyzer.get_danmu(bvid)

    if not analyzer.danmu_list:
        print("No danmu retrieved, falling back to sample data")
        # Sample danmu (kept in Chinese so jieba segmentation still applies).
        analyzer.danmu_list = [
            '大语言模型很强大',
            'AI改变世界',
            '机器学习很有趣',
            '深度学习技术',
            '自然语言处理',
            '大模型应用广泛',
            '人工智能未来',
            'LLM发展很快',
            '智能助手很方便',
            '代码生成很实用'
        ]

    print("\nRunning word-frequency analysis...")
    top_words = analyzer.analyze_words(8)
    print("Top 8 words:")
    for i, (word, count) in enumerate(top_words, 1):
        print(f"{i}. {word}: {count} times")

    print("\nGenerating word cloud...")
    analyzer.make_wordcloud()

    print("\nSaving data to Excel...")
    analyzer.save_to_excel()

    print("\nConclusions:")
    conclusions = analyzer.get_conclusions()
    print(conclusions)

    print("\nDone!")


if __name__ == "__main__":
    main()