parent
caefda24ca
commit
bcf3d9fe4c
@ -0,0 +1,195 @@
|
||||
import requests
|
||||
import re
|
||||
import jieba
|
||||
from collections import Counter
|
||||
from wordcloud import WordCloud
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class SimpleDanmuAnalyzer:
    """Fetch, filter, and analyze Bilibili danmu (bullet comments).

    Workflow: collect danmu text for one or more videos via the public
    Bilibili web APIs, strip low-information noise, run jieba word-frequency
    analysis, render a word cloud, and export the results to Excel.
    """

    # Bilibili's API tends to reject clients without a browser User-Agent,
    # and an explicit timeout keeps a dead connection from hanging forever.
    _HEADERS = {'User-Agent': 'Mozilla/5.0'}
    _TIMEOUT = 10  # seconds

    def __init__(self):
        # Accumulated danmu strings across all fetched videos.
        self.danmu_list = []
        # Filler phrases to drop before analysis (spam/meme noise).
        self.noise_words = ['666', '哈哈哈', '233', '点赞', '关注', '来了']

    def get_danmu(self, bvid):
        """Fetch the danmu of one video (by BV id) into ``self.danmu_list``.

        Failures (network, bad JSON, API error) are reported and swallowed
        so one bad video does not abort a batch run.

        :param bvid: Bilibili BV video id, e.g. ``'BV1fp4y1q7E9'``.
        """
        try:
            print(f"正在获取视频 {bvid} 的弹幕...")

            # Step 1: resolve the BV id to the internal cid via the
            # video-info endpoint.
            info_url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
            info_response = requests.get(
                info_url, headers=self._HEADERS, timeout=self._TIMEOUT)
            info_data = info_response.json()

            if info_data['code'] != 0:
                print(f"获取视频信息失败: {info_data.get('message', '未知错误')}")
                return

            cid = info_data['data']['cid']

            # Step 2: the danmu endpoint returns XML; force UTF-8 so the
            # Chinese text decodes correctly regardless of reported charset.
            danmu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
            response = requests.get(
                danmu_url, headers=self._HEADERS, timeout=self._TIMEOUT)
            response.encoding = 'utf-8'

            # Each danmu is wrapped in <d p="...">text</d>.
            danmus = re.findall(r'<d p=".*?">(.*?)</d>', response.text)

            # Keep only danmu containing none of the noise phrases.
            filtered_danmus = [
                danmu for danmu in danmus
                if not any(noise in danmu for noise in self.noise_words)
            ]

            self.danmu_list.extend(filtered_danmus)
            print(f"获取到 {len(filtered_danmus)} 条有效弹幕")

        except Exception as e:
            # Best-effort: report and continue with the next video.
            print(f"获取弹幕失败: {e}")

    def analyze_words(self, top_n=8):
        """Return the ``top_n`` most common words as ``(word, count)`` pairs.

        Uses jieba segmentation over all collected danmu; single-character
        tokens (and the join spaces) are discarded as low-signal.

        :returns: list of ``(word, count)``, empty if no danmu collected.
        """
        if not self.danmu_list:
            print("没有弹幕数据可供分析")
            return []

        # Merge all danmu into one text and segment it.
        text = ' '.join(self.danmu_list)
        words = jieba.cut(text)

        # Drop one-character tokens (also removes the joining spaces).
        filtered_words = [word for word in words if len(word) > 1]

        word_count = Counter(filtered_words)
        return word_count.most_common(top_n)

    def make_wordcloud(self, filename='wordcloud.png'):
        """Render a word cloud of the collected danmu and save it to disk.

        :param filename: output image path (PNG).
        """
        if not self.danmu_list:
            print("没有弹幕数据生成词云")
            return

        text = ' '.join(self.danmu_list)

        try:
            wc = WordCloud(
                font_path='simhei.ttf',  # a Chinese font must be installed
                width=800,
                height=600,
                background_color='white',
                max_words=100
            ).generate(text)

            # Display and persist the figure.
            plt.figure(figsize=(10, 8))
            plt.imshow(wc)
            plt.axis('off')
            plt.title('弹幕词云图')
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            # Fixed: the message previously lacked the filename placeholder.
            print(f"词云图已保存为: {filename}")

        except Exception as e:
            print(f"生成词云失败: {e}")
            print("请确保系统中安装了中文字体")

    def save_to_excel(self, filename='result.xlsx'):
        """Save raw danmu and the top-8 word frequencies to an Excel file.

        :param filename: output workbook path (.xlsx).
        """
        if not self.danmu_list:
            print("没有数据可保存")
            return

        try:
            # Sheet 1: raw danmu text.
            df_danmu = pd.DataFrame(self.danmu_list, columns=['弹幕内容'])

            # Sheet 2: word-frequency ranking.
            top_words = self.analyze_words(8)
            df_words = pd.DataFrame(top_words, columns=['词语', '出现次数'])

            with pd.ExcelWriter(filename) as writer:
                df_danmu.to_excel(writer, sheet_name='弹幕数据', index=False)
                df_words.to_excel(writer, sheet_name='词频统计', index=False)

            # Fixed: the message previously lacked the filename placeholder.
            print(f"数据已保存到: {filename}")

        except Exception as e:
            print(f"保存Excel失败: {e}")

    def get_conclusions(self):
        """Return a short multi-line Chinese summary of the analysis.

        :returns: summary string (total count + top-5 words), or an
            explanatory message when no data was collected.
        """
        if not self.danmu_list:
            return "没有足够数据进行分析"

        total = len(self.danmu_list)
        top_words = self.analyze_words(5)

        conclusions = []
        conclusions.append(f"共分析 {total} 条弹幕")
        conclusions.append("高频词TOP5:")
        for word, count in top_words:
            conclusions.append(f"  - {word}: {count}次")

        return '\n'.join(conclusions)
|
||||
|
||||
|
||||
def main():
    """Drive the full pipeline: fetch, analyze, visualize, export, summarize."""
    app = SimpleDanmuAnalyzer()

    # BV ids of LLM-related videos; swap in any videos you want analyzed.
    bv_ids = (
        'BV1fp4y1q7E9',  # intro to large language models
        'BV1nV41127AV',  # LLM application cases
        'BV1Ru41127XB',  # large-model technology deep dive
    )

    print("开始获取弹幕数据...")
    for bv in bv_ids:
        app.get_danmu(bv)

    if not app.danmu_list:
        # Fall back to canned danmu so the demo still produces output.
        print("没有获取到弹幕数据,使用示例数据演示")
        app.danmu_list = [
            '大语言模型很强大',
            'AI改变世界',
            '机器学习很有趣',
            '深度学习技术',
            '自然语言处理',
            '大模型应用广泛',
            '人工智能未来',
            'LLM发展很快',
            '智能助手很方便',
            '代码生成很实用',
        ]

    print("\n进行词频分析...")
    ranking = app.analyze_words(8)
    print("高频词TOP8:")
    for rank, (word, count) in enumerate(ranking, 1):
        print(f"{rank}. {word}: {count}次")

    print("\n生成词云图...")
    app.make_wordcloud()

    print("\n保存数据到Excel...")
    app.save_to_excel()

    print("\n分析结论:")
    print(app.get_conclusions())

    print("\n任务完成!")
|
||||
|
||||
|
||||
# Run the demo pipeline only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in new issue