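"""Simple Bilibili danmu (bullet-comment) analyzer.

Fetches danmu for a list of videos via Bilibili's public web API, runs a
basic word-frequency analysis with jieba, renders a word cloud, and exports
the results to an Excel workbook.

Third-party packages used: requests, jieba, wordcloud, matplotlib, pandas
(pandas' .xlsx export typically also needs openpyxl installed).
"""
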
import requests
import re
import jieba
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd


class SimpleDanmuAnalyzer:

    def __init__(self):
        self.danmu_list = []
        # Common filler danmu ("666", laughter, "like", "follow", "I'm here");
        # kept in Chinese because they are matched against the raw danmu text.
        self.noise_words = ['666', '哈哈哈', '233', '点赞', '关注', '来了']
        # Browser-like User-Agent; Bilibili's web API tends to reject the
        # default python-requests one.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

    def get_danmu(self, bvid):
        """Fetch the danmu for a single video, identified by its BV id."""
        try:
            print(f"Fetching danmu for video {bvid}...")

            # Look up the video's cid (the id the danmu API is keyed on).
            info_url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
            info_response = requests.get(info_url, headers=self.headers, timeout=10)
            info_data = info_response.json()

            if info_data['code'] != 0:
                print(f"Failed to get video info: {info_data.get('message', 'unknown error')}")
                return

            cid = info_data['data']['cid']

            # Fetch the danmu XML for that cid.
            danmu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
            response = requests.get(danmu_url, headers=self.headers, timeout=10)
            response.encoding = 'utf-8'

            # Each danmu sits inside a <d p="..."> ... </d> element, so a
            # non-greedy regex is enough to pull out the text.
            danmus = re.findall(r'<d p=".*?">(.*?)</d>', response.text)

            # Drop danmu that contain any of the noise words.
            filtered_danmus = []
            for danmu in danmus:
                if not any(noise in danmu for noise in self.noise_words):
                    filtered_danmus.append(danmu)

            self.danmu_list.extend(filtered_danmus)
            print(f"Got {len(filtered_danmus)} usable danmu")

        except Exception as e:
            print(f"Failed to fetch danmu: {e}")

    def analyze_words(self, top_n=8):
        """Basic word-frequency analysis over the collected danmu."""
        if not self.danmu_list:
            print("No danmu data to analyze")
            return []

        # Merge all danmu into one string.
        text = ' '.join(self.danmu_list)

        # Segment the Chinese text with jieba.
        words = jieba.cut(text)

        # Keep only tokens longer than one character; this also drops most
        # single-character stopwords and punctuation.
        filtered_words = [word for word in words if len(word) > 1]

        # Count occurrences and return the top_n most common words.
        word_count = Counter(filtered_words)
        return word_count.most_common(top_n)

    def make_wordcloud(self, filename='wordcloud.png'):
        """Generate and save a word cloud from the collected danmu."""
        if not self.danmu_list:
            print("No danmu data for the word cloud")
            return

        # Segment with jieba first so the cloud is built from words rather
        # than whole danmu lines (WordCloud's built-in tokenizer does not
        # segment Chinese).
        text = ' '.join(jieba.cut(' '.join(self.danmu_list)))

        try:
            # Build the word cloud. A Chinese font is required, otherwise the
            # characters render as empty boxes; simhei.ttf must be available
            # on the system (adjust the path if it lives elsewhere).
            wc = WordCloud(
                font_path='simhei.ttf',
                width=800,
                height=600,
                background_color='white',
                max_words=100
            ).generate(text)

            # Display and save the word cloud.
            plt.figure(figsize=(10, 8))
            plt.imshow(wc)
            plt.axis('off')
            plt.title('Danmu Word Cloud')
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            print(f"Word cloud saved to: {filename}")

        except Exception as e:
            print(f"Failed to generate word cloud: {e}")
            print("Make sure a Chinese font is installed on the system")

    def save_to_excel(self, filename='result.xlsx'):
        """Save the danmu and the word-frequency table to an Excel file."""
        if not self.danmu_list:
            print("No data to save")
            return

        try:
            # Raw danmu.
            df_danmu = pd.DataFrame(self.danmu_list, columns=['Danmu'])

            # Word-frequency table.
            top_words = self.analyze_words(8)
            df_words = pd.DataFrame(top_words, columns=['Word', 'Count'])

            # Write both sheets (pandas uses openpyxl for .xlsx output).
            with pd.ExcelWriter(filename) as writer:
                df_danmu.to_excel(writer, sheet_name='Danmu', index=False)
                df_words.to_excel(writer, sheet_name='Word Frequency', index=False)

            print(f"Data saved to: {filename}")

        except Exception as e:
            print(f"Failed to save Excel file: {e}")

    def get_conclusions(self):
        """Summarize the analysis in a few lines of text."""
        if not self.danmu_list:
            return "Not enough data to analyze"

        total = len(self.danmu_list)
        top_words = self.analyze_words(5)

        conclusions = []
        conclusions.append(f"Analyzed {total} danmu in total")
        conclusions.append("Top 5 words:")
        for word, count in top_words:
            conclusions.append(f"  - {word}: {count} times")

        return '\n'.join(conclusions)


def main():
    """Entry point: fetch, analyze, visualize, and export."""
    analyzer = SimpleDanmuAnalyzer()

    # Replace these with the BV ids of the videos you want to analyze.
    video_list = [
        'BV1fp4y1q7E9',  # introduction to large language models
        'BV1nV41127AV',  # LLM application examples
        'BV1Ru41127XB',  # large-model technology breakdown
    ]

    print("Fetching danmu data...")
    for bvid in video_list:
        analyzer.get_danmu(bvid)

    if not analyzer.danmu_list:
        print("No danmu retrieved, falling back to sample data")
        # Sample danmu (kept in Chinese so jieba segmentation still applies).
        analyzer.danmu_list = [
            '大语言模型很强大',
            'AI改变世界',
            '机器学习很有趣',
            '深度学习技术',
            '自然语言处理',
            '大模型应用广泛',
            '人工智能未来',
            'LLM发展很快',
            '智能助手很方便',
            '代码生成很实用'
        ]

    print("\nRunning word-frequency analysis...")
    top_words = analyzer.analyze_words(8)
    print("Top 8 words:")
    for i, (word, count) in enumerate(top_words, 1):
        print(f"{i}. {word}: {count} times")

    print("\nGenerating word cloud...")
    analyzer.make_wordcloud()

    print("\nSaving data to Excel...")
    analyzer.save_to_excel()

    print("\nConclusions:")
    conclusions = analyzer.get_conclusions()
    print(conclusions)

    print("\nDone!")


if __name__ == "__main__":
    main()