ADD file via upload

dev
fzu102301341 6 months ago
parent 9809d390d8
commit 36447749b4

@@ -0,0 +1,536 @@
import requests
import re
import jieba
import jieba.analyse
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import time
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
# Configure fonts so Chinese characters render correctly in matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
class AdvancedBilibiliAnalyzer:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Referer': 'https://www.bilibili.com',
        }
        self.danmu_data = []
        self.video_info = {}
        # Initialize the sentiment lexicons (Chinese words, matched against danmaku text)
        self.positive_words = set([
            '厉害', '优秀', '精彩', '喜欢', '支持', '感谢', '不错', '可以',
            '强大', '实用', '方便', '惊喜', '进步', '提升', '效率', '创新', '革命', '牛逼', '太强了',
            '完美', '出色', '美好', '幸福', '快乐', '开心', '满意', '赞同', '认可',
            '佩服', '崇拜', '羡慕', '期待', '希望', '梦想', '成功', '胜利', '冠军', '第一', '最佳'
        ])
        self.negative_words = set([
            '垃圾', '讨厌', '恶心', '无聊', '失望', '反对', '抵制', '错误', '问题',
            '困难', '复杂', '昂贵', '糟糕', '缺陷', '不足', '局限', '风险', '偏差',
            '失败', '痛苦', '悲伤', '难过', '愤怒', '可恶', '鄙视', '轻视', '嘲笑',
            '批评', '指责', '抱怨', '后悔', '遗憾', '绝望', '难受', '不舒服'
        ])
        # Degree adverbs scale the score of the sentiment word that follows them
        self.intensifiers = {
            '非常': 1.5, '特别': 1.5, '极其': 1.8, '超级': 1.5, '十分': 1.4,
            '相当': 1.3, '比较': 1.1, '有点': 0.8,
            '稍微': 0.7, '略微': 0.7, '极度': 1.8, '确实': 1.2
        }
        # Common Chinese negation words; they flip the polarity of the sentiment word that follows
        self.negations = {'不', '没', '没有', '别', '未', '无'}
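    # How the lexicon pieces combine on hypothetical danmaku (illustrative values,
    # not output from the original file):
    #   '非常 喜欢'  -> +1.0 * 1.5 = +1.5 before normalization (intensifier scales it)
    #   '不 喜欢'    -> +1.0 flipped by the negation to -1.0
    #   '有点 糟糕'  -> -1.0 * 0.8 = -0.8 (weak intensifier dampens the hit)
    # advanced_sentiment_analysis() below implements exactly this single pass over
    # jieba tokens, then normalizes by the number of sentiment words and clamps to [-1, 1].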
    def get_multiple_videos(self, bvid_list):
        """Fetch data for multiple videos"""
        all_danmu = []
        all_info = []
        for bvid in bvid_list:
            print(f"\nAnalyzing video: {bvid}")
            video_info = self.get_video_info(bvid)
            if video_info:
                danmu_data = self.get_danmu_data(video_info['cid'])
                if danmu_data:
                    filtered_danmu = self.filter_noise(danmu_data)
                    video_info['filtered_danmu_count'] = len(filtered_danmu)
                    video_info['danmu_data'] = filtered_danmu
                    all_danmu.extend(filtered_danmu)
                    all_info.append(video_info)
            time.sleep(2)  # Throttle requests to avoid hitting the API too fast
        return all_danmu, all_info
    def advanced_sentiment_analysis(self, text):
        """Improved rule-based sentiment analysis for Chinese text"""
        words = list(jieba.cut(text))
        sentiment_score = 0
        intensity = 1.0
        negation = False
        for word in words:
            # Degree adverb: scale the next sentiment word
            if word in self.intensifiers:
                intensity *= self.intensifiers[word]
                continue
            # Negation word: flip the polarity of the next sentiment word
            if word in self.negations:
                negation = not negation
                continue
            # Sentiment word: apply accumulated intensity and negation, then reset
            if word in self.positive_words:
                word_score = 1.0
                if negation:
                    word_score = -word_score
                sentiment_score += word_score * intensity
                intensity = 1.0
                negation = False
            elif word in self.negative_words:
                word_score = -1.0
                if negation:
                    word_score = -word_score
                sentiment_score += word_score * intensity
                intensity = 1.0
                negation = False
        # Normalize by the number of sentiment words, then clamp to [-1, 1]
        word_count = len([w for w in words if w in self.positive_words or w in self.negative_words])
        if word_count > 0:
            final_score = sentiment_score / word_count
            final_score = max(-1.0, min(1.0, final_score))
        else:
            final_score = 0
        return final_score
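    # Minimal usage sketch (hypothetical inputs and results, not from the original file):
    #   analyzer = AdvancedBilibiliAnalyzer()
    #   analyzer.advanced_sentiment_analysis('这个教程非常实用')  # > 0: '实用' boosted by '非常'
    #   analyzer.advanced_sentiment_analysis('不喜欢这种内容')    # < 0: '喜欢' flipped by '不'
    #   analyzer.advanced_sentiment_analysis('今天天气一般')      # == 0: no lexicon hits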
    def analyze_emotion_trend(self, danmu_list):
        """Analyze the sentiment distribution of a danmaku list"""
        emotions = []
        for danmu in danmu_list[:500]:  # Cap at 500 danmaku to avoid overload
            score = self.advanced_sentiment_analysis(danmu)
            emotions.append(score)
        # Bucket scores into positive / negative / neutral
        positive = len([e for e in emotions if e > 0.1])
        negative = len([e for e in emotions if e < -0.1])
        neutral = len([e for e in emotions if -0.1 <= e <= 0.1])
        return {
            'positive': positive,
            'negative': negative,
            'neutral': neutral,
            'sentiment_scores': emotions
        }
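    # Bucketing example (hypothetical scores): [0.5, 0.05, -0.3, 0.0] yields
    # positive=1 (0.5 > 0.1), negative=1 (-0.3 < -0.1), and neutral=2
    # (0.05 and 0.0 fall inside the [-0.1, 0.1] band).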
    def extract_key_phrases(self, text_list, top_n=10):
        """Extract key phrases via TextRank"""
        text = ' '.join(text_list)
        # jieba's TextRank implementation, restricted to nouns and verbs
        keywords = jieba.analyse.textrank(text, topK=top_n, withWeight=True, allowPOS=('n', 'vn', 'v'))
        return keywords
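    # textrank only considers nouns ('n'), verbal nouns ('vn'), and verbs ('v') here,
    # and returns (phrase, weight) pairs sorted by descending TextRank weight, e.g.
    # [('视频', 1.0), ('效果', 0.83), ...] (illustrative weights, not real output).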
    def create_interactive_dashboard(self, video_data, danmu_data, emotion_data, keywords):
        """Build the interactive visualization dashboard"""
        # Subplot grid layout
        fig = make_subplots(
            rows=3, cols=3,
            subplot_titles=(
                'Video Popularity', 'Sentiment Distribution', 'Keyword Weights',
                'Danmaku Counts', 'Sentiment Trend', 'Hot Topics',
                'Video Overview', 'Interaction Analysis', 'Sentiment Histogram'
            ),
            specs=[
                [{"type": "bar"}, {"type": "pie"}, {"type": "bar"}],
                [{"type": "bar"}, {"type": "scatter"}, {"type": "bar"}],
                [{"type": "table"}, {"type": "bar"}, {"type": "histogram"}]
            ]
        )
        # 1. Video popularity comparison
        titles = [v['title'][:20] + '...' if len(v['title']) > 20 else v['title'] for v in video_data]
        views = [v['view'] for v in video_data]
        danmu_counts = [v['danmaku_count'] for v in video_data]
        fig.add_trace(
            go.Bar(name='Views', x=titles, y=views, marker_color='lightblue'),
            row=1, col=1
        )
        fig.add_trace(
            go.Bar(name='Danmaku', x=titles, y=danmu_counts, marker_color='lightcoral'),
            row=1, col=1
        )
        # 2. Sentiment distribution
        sentiment_labels = ['Positive', 'Neutral', 'Negative']
        sentiment_values = [emotion_data['positive'], emotion_data['neutral'], emotion_data['negative']]
        fig.add_trace(
            go.Pie(labels=sentiment_labels, values=sentiment_values,
                   marker_colors=['#2ecc71', '#f39c12', '#e74c3c']),
            row=1, col=2
        )
        # 3. Keyword weights
        if keywords:
            keyword_phrases = [kw[0] for kw in keywords]
            keyword_weights = [kw[1] for kw in keywords]
            fig.add_trace(
                go.Bar(x=keyword_weights, y=keyword_phrases, orientation='h',
                       marker_color='lightgreen', name='Keywords'),
                row=1, col=3
            )
        # 4. Filtered danmaku counts
        filtered_counts = [v['filtered_danmu_count'] for v in video_data]
        fig.add_trace(
            go.Bar(x=titles, y=filtered_counts, marker_color='purple', name='Valid danmaku'),
            row=2, col=1
        )
        # 5. Sentiment trend
        sentiment_scores = emotion_data['sentiment_scores'][:50]  # First 50 danmaku show the trend
        fig.add_trace(
            go.Scatter(y=sentiment_scores, mode='lines+markers',
                       line=dict(color='orange'), name='Sentiment trend'),
            row=2, col=2
        )
        # 6. Hot topics (by word frequency)
        if danmu_data:
            word_freq = self.get_word_frequency(danmu_data)
            top_words = word_freq.most_common(10)
            if top_words:
                words, counts = zip(*top_words)
                fig.add_trace(
                    go.Bar(x=list(words), y=list(counts), marker_color='lightseagreen', name='Hot words'),
                    row=2, col=3
                )
        # 7. Video info table
        table_data = [
            [v['title'][:15] + '...' for v in video_data],
            [v['owner'] for v in video_data],
            [f"{v['view']:,}" for v in video_data],
            [v['danmaku_count'] for v in video_data]
        ]
        fig.add_trace(
            go.Table(
                header=dict(values=['Title', 'Uploader', 'Views', 'Danmaku']),
                cells=dict(values=table_data)
            ),
            row=3, col=1
        )
        # 8. Interaction analysis (danmaku per thousand views)
        interaction_ratio = [v['danmaku_count'] / v['view'] * 1000 if v['view'] > 0 else 0 for v in video_data]
        fig.add_trace(
            go.Bar(x=titles, y=interaction_ratio, marker_color='brown',
                   name='Danmaku per 1k views'),
            row=3, col=2
        )
        # 9. Sentiment score histogram
        fig.add_trace(
            go.Histogram(x=emotion_data['sentiment_scores'], nbinsx=20,
                         marker_color='lightseagreen', name='Sentiment scores'),
            row=3, col=3
        )
        fig.update_layout(
            height=1200,
            title_text="Bilibili Danmaku Deep-Analysis Dashboard",
            showlegend=True,
            template="plotly_white"
        )
        fig.show()
        return fig
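    # fig.show() renders the dashboard in the browser; to keep a shareable artifact
    # as well, a call such as fig.write_html('dashboard.html') (standard Plotly API;
    # the filename is just an example) could be added before returning.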
    def get_word_frequency(self, text_list):
        """Compute word frequencies over all danmaku"""
        all_text = ' '.join(text_list)
        words = jieba.cut(all_text)
        # Chinese stop words; single-character tokens are already dropped by the
        # len(word) > 1 filter below
        stop_words = {'一个', '没有'}
        filtered_words = [
            word for word in words
            if len(word) > 1
            and word not in stop_words
            and not re.match(r'^\d+$', word)
        ]
        return Counter(filtered_words)
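    # The returned Counter feeds both the dashboard's hot-topics panel (via
    # word_freq.most_common(10)) and WordCloud.generate_from_frequencies() below.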
    def create_wordcloud_advanced(self, text_list, filename='advanced_wordcloud.png'):
        """Create an advanced word cloud"""
        text = ' '.join(text_list)
        # Build the cloud from explicit word frequencies
        word_freq = self.get_word_frequency(text_list)
        try:
            wordcloud = WordCloud(
                font_path='simhei.ttf',
                width=1600,
                height=800,
                background_color='white',
                max_words=200,
                colormap='viridis',
                relative_scaling=0.5,
                collocations=False
            ).generate_from_frequencies(word_freq)
            plt.figure(figsize=(20, 10))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Advanced Danmaku Word Cloud', fontsize=24, pad=20)
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            return wordcloud
        except Exception as e:
            print(f"Error while generating word cloud: {e}")
            # Fallback: generate directly from raw text without the Chinese font
            wordcloud = WordCloud(
                width=1600,
                height=800,
                background_color='white',
                max_words=200,
                colormap='viridis'
            ).generate(text)
            plt.figure(figsize=(20, 10))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Advanced Danmaku Word Cloud', fontsize=24, pad=20)
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            return wordcloud
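    # Note: font_path='simhei.ttf' assumes a SimHei font file sits in the working
    # directory. If it is missing, WordCloud raises an OSError, so the except branch
    # above regenerates the cloud without a CJK font and Chinese words may render
    # as empty boxes; pointing font_path at any installed CJK .ttf avoids this.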
    def generate_comprehensive_report(self, video_data, emotion_data, keywords):
        """Generate the comprehensive analysis report"""
        print("\n" + "=" * 80)
        print("📊 Bilibili Video Danmaku Deep-Analysis Report")
        print("=" * 80)
        # Overall statistics
        total_views = sum(v['view'] for v in video_data)
        total_danmu = sum(v['danmaku_count'] for v in video_data)
        total_videos = len(video_data)
        print(f"\n📈 Overall statistics:")
        print(f"  • Videos analyzed: {total_videos}")
        print(f"  • Total views: {total_views:,}")
        print(f"  • Total danmaku: {total_danmu}")
        print(f"  • Average views: {total_views / total_videos:,.0f}")
        # Sentiment report
        total_sentiments = emotion_data['positive'] + emotion_data['neutral'] + emotion_data['negative']
        if total_sentiments > 0:
            positive_ratio = emotion_data['positive'] / total_sentiments * 100
            negative_ratio = emotion_data['negative'] / total_sentiments * 100
            print(f"\n😊 Sentiment analysis:")
            print(f"  • Positive: {emotion_data['positive']} ({positive_ratio:.1f}%)")
            print(f"  • Neutral: {emotion_data['neutral']} ({(100 - positive_ratio - negative_ratio):.1f}%)")
            print(f"  • Negative: {emotion_data['negative']} ({negative_ratio:.1f}%)")
            if positive_ratio > 60:
                sentiment_verdict = "🌟 Very positive community atmosphere!"
            elif positive_ratio > 40:
                sentiment_verdict = "👍 Generally healthy community atmosphere"
            else:
                sentiment_verdict = "⚠️ Community atmosphere needs attention"
            print(f"  • Verdict: {sentiment_verdict}")
        # Hot topics
        print(f"\n🔥 Top 10 hot topics:")
        if keywords:
            for i, (phrase, weight) in enumerate(keywords[:10], 1):
                print(f"  {i:2d}. {phrase}: {weight:.3f}")
        else:
            print("  No keywords extracted")
        # Video ranking
        print(f"\n🏆 Video popularity ranking:")
        sorted_videos = sorted(video_data, key=lambda x: x['view'], reverse=True)
        for i, video in enumerate(sorted_videos, 1):
            title_short = video['title'][:25] + '...' if len(video['title']) > 25 else video['title']
            print(f"  {i:2d}. {title_short}")
            print(f"      Views: {video['view']:,} | Danmaku: {video['danmaku_count']} | Uploader: {video['owner']}")
        # Interaction analysis
        print(f"\n💬 Interaction quality:")
        for video in video_data:
            interaction_rate = video['danmaku_count'] / video['view'] * 1000 if video['view'] > 0 else 0
            title_short = video['title'][:20] + '...' if len(video['title']) > 20 else video['title']
            print(f"  • {title_short}: {interaction_rate:.2f}‰ (danmaku per thousand views)")
        # Content quality assessment
        if emotion_data['sentiment_scores']:
            avg_sentiment = np.mean(emotion_data['sentiment_scores'])
            sentiment_std = np.std(emotion_data['sentiment_scores'])
            print(f"\n📊 Content quality assessment:")
            print(f"  • Mean sentiment score: {avg_sentiment:.3f}")
            print(f"  • Sentiment volatility (std): {sentiment_std:.3f}")
            if avg_sentiment > 0.1:
                quality_verdict = "🎯 Excellent content, positive audience feedback"
            elif avg_sentiment > -0.1:
                quality_verdict = "✅ Good content, neutral audience feedback"
            else:
                quality_verdict = "💡 Content needs work, feedback skews negative"
            print(f"  • Verdict: {quality_verdict}")
    # Basic data-fetching methods kept from the original version
    def get_video_info(self, bvid):
        """Fetch video metadata from the Bilibili web API"""
        url = "https://api.bilibili.com/x/web-interface/view"
        params = {'bvid': bvid}
        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=10)
            if response.status_code == 200:
                data = response.json()
                if data.get('code') == 0:
                    video_data = data['data']
                    return {
                        'title': video_data['title'],
                        'cid': video_data['cid'],
                        'bvid': bvid,
                        'owner': video_data['owner']['name'],
                        'view': video_data['stat']['view'],
                        'danmaku_count': video_data['stat']['danmaku']
                    }
        except Exception as e:
            print(f"Failed to fetch video info: {e}")
        return None
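    # The endpoint responds with JSON roughly shaped like
    #   {"code": 0, "data": {"title": ..., "cid": ..., "owner": {"name": ...},
    #    "stat": {"view": ..., "danmaku": ...}}}
    # where a non-zero "code" signals an API-level error; both that case and any
    # request exception fall through to the final return None.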
    def get_danmu_data(self, cid):
        """Fetch danmaku for a video part (cid) from the legacy XML endpoint"""
        url = "https://api.bilibili.com/x/v1/dm/list.so"
        params = {'oid': cid}
        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=10)
            if response.status_code == 200:
                try:
                    import xml.etree.ElementTree as ET
                    root = ET.fromstring(response.content)
                    return [d.text for d in root.findall('d')]
                except Exception:
                    # Fallback: scrape <d> elements with a regex if XML parsing fails
                    content = response.content.decode('utf-8')
                    danmu_pattern = r'<d[^>]*>([^<]+)</d>'
                    return re.findall(danmu_pattern, content)
        except Exception as e:
            print(f"Failed to fetch danmaku: {e}")
        return []
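    # The legacy list.so endpoint returns XML roughly like
    #   <i>...<d p="23.50,1,25,16777215,...">danmaku text</d>...</i>
    # so each <d> element's text is one danmaku; the regex fallback scrapes the same
    # <d>...</d> bodies if XML parsing fails (e.g. on malformed markup).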
    def filter_noise(self, danmu_list):
        """Filter out noise danmaku (spam, memes, single characters)"""
        noise_words = ['666', '哈哈哈', '233', 'awsl', '哈哈哈哈', '妙啊', '好活', '点赞', '支持']
        filtered_danmu = []
        for danmu in danmu_list:
            if not danmu or len(danmu.strip()) <= 1:
                continue
            if any(noise in danmu for noise in noise_words):
                continue
            if danmu.strip().isdigit():
                continue
            if len(set(danmu)) <= 2:  # Too few distinct characters, e.g. 'aaaa'
                continue
            filtered_danmu.append(danmu.strip())
        return filtered_danmu
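# Filtering sketch on hypothetical danmaku: '666666' (contains a noise word and is
# all digits), '哈哈哈哈哈' (noise word), 'ababab' (only two distinct characters)
# are all dropped, while '这个讲解真的很清楚' passes through stripped.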
def main():
    """Entry point: analyze several trending videos"""
    # BV ids of the Bilibili videos to analyze
    hot_videos = [
        "BV1kg4y1T7PA",  # AI-related
        "BV1uC41177bh",  # Trending tech
        "BV1Gu4y1W7iK",  # Popular science
    ]
    print("🚀 Bilibili Trending-Video Danmaku Deep-Analysis System")
    print("=" * 50)
    analyzer = AdvancedBilibiliAnalyzer()
    # 1. Fetch data for all videos
    print("📥 Fetching video data...")
    all_danmu, video_data = analyzer.get_multiple_videos(hot_videos)
    if not all_danmu:
        print("❌ Could not fetch any danmaku, exiting")
        return
    print(f"✅ Fetched {len(all_danmu)} danmaku from {len(video_data)} videos")
    # 2. Sentiment analysis
    print("\n😊 Running sentiment analysis...")
    emotion_data = analyzer.analyze_emotion_trend(all_danmu)
    # 3. Keyword extraction
    print("\n🔍 Extracting key phrases...")
    keywords = analyzer.extract_key_phrases(all_danmu, 15)
    # 4. Advanced word cloud
    print("\n🎨 Generating word cloud...")
    analyzer.create_wordcloud_advanced(all_danmu, 'hot_videos_wordcloud.png')
    # 5. Interactive dashboard
    print("\n📊 Building the interactive dashboard...")
    analyzer.create_interactive_dashboard(video_data, all_danmu, emotion_data, keywords)
    # 6. Comprehensive report
    print("\n📋 Generating the comprehensive report...")
    analyzer.generate_comprehensive_report(video_data, emotion_data, keywords)
    print("\n🎉 Analysis complete!")
    print("📁 Generated artifacts:")
    print("  • hot_videos_wordcloud.png - advanced word cloud")
    print("  • interactive dashboard (opens in the browser)")


if __name__ == "__main__":
    main()