ADD file via upload

dev
fzu102301341 6 months ago
parent 9809d390d8
commit 36447749b4

@@ -0,0 +1,536 @@
import requests
import re
import jieba
import jieba.analyse
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import time
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
# Configure fonts so Chinese characters render correctly in matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
class AdvancedBilibiliAnalyzer:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Referer': 'https://www.bilibili.com',
        }
        self.danmu_data = []
        self.video_info = {}
        # Initialize the sentiment lexicons (Chinese words, matched against danmaku text)
        self.positive_words = set([
            '厉害', '优秀', '精彩', '喜欢', '支持', '感谢', '不错', '可以',
            '强大', '实用', '方便', '惊喜', '进步', '提升', '效率', '创新', '革命', '牛逼', '太强了',
            '完美', '出色', '美好', '幸福', '快乐', '开心', '满意', '赞同', '认可',
            '佩服', '崇拜', '羡慕', '期待', '希望', '梦想', '成功', '胜利', '冠军', '第一', '最佳'
        ])
        self.negative_words = set([
            '垃圾', '讨厌', '恶心', '无聊', '失望', '反对', '抵制', '错误', '问题',
            '困难', '复杂', '昂贵', '糟糕', '缺陷', '不足', '局限', '风险', '偏差',
            '失败', '痛苦', '悲伤', '难过', '愤怒', '可恶', '鄙视', '轻视', '嘲笑',
            '批评', '指责', '抱怨', '后悔', '遗憾', '绝望', '难受', '不舒服'
        ])
        # Degree adverbs scale the score of the sentiment word that follows them
        self.intensifiers = {
            '非常': 1.5, '特别': 1.5, '极其': 1.8, '超级': 1.5, '十分': 1.4,
            '相当': 1.3, '比较': 1.1, '有点': 0.8,
            '稍微': 0.7, '略微': 0.7, '极度': 1.8, '确实': 1.2
        }
        # Common Chinese negation words; they flip the polarity of the sentiment word that follows
        self.negations = {'不', '没', '没有', '别', '未', '无'}
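    # How the lexicon pieces combine on hypothetical danmaku (illustrative values,
    # not output from the original file):
    #   '非常 喜欢'  -> +1.0 * 1.5 = +1.5 before normalization (intensifier scales it)
    #   '不 喜欢'    -> +1.0 flipped by the negation to -1.0
    #   '有点 糟糕'  -> -1.0 * 0.8 = -0.8 (weak intensifier dampens the hit)
    # advanced_sentiment_analysis() below implements exactly this single pass over
    # jieba tokens, then normalizes by the number of sentiment words and clamps to [-1, 1].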
    def get_multiple_videos(self, bvid_list):
        """Fetch data for multiple videos"""
        all_danmu = []
        all_info = []
        for bvid in bvid_list:
            print(f"\nAnalyzing video: {bvid}")
            video_info = self.get_video_info(bvid)
            if video_info:
                danmu_data = self.get_danmu_data(video_info['cid'])
                if danmu_data:
                    filtered_danmu = self.filter_noise(danmu_data)
                    video_info['filtered_danmu_count'] = len(filtered_danmu)
                    video_info['danmu_data'] = filtered_danmu
                    all_danmu.extend(filtered_danmu)
                    all_info.append(video_info)
            time.sleep(2)  # Throttle requests to avoid hitting the API too fast
        return all_danmu, all_info
    def advanced_sentiment_analysis(self, text):
        """Improved rule-based sentiment analysis for Chinese text"""
        words = list(jieba.cut(text))
        sentiment_score = 0
        intensity = 1.0
        negation = False
        for word in words:
            # Degree adverb: scale the next sentiment word
            if word in self.intensifiers:
                intensity *= self.intensifiers[word]
                continue
            # Negation word: flip the polarity of the next sentiment word
            if word in self.negations:
                negation = not negation
                continue
            # Sentiment word: apply accumulated intensity and negation, then reset
            if word in self.positive_words:
                word_score = 1.0
                if negation:
                    word_score = -word_score
                sentiment_score += word_score * intensity
                intensity = 1.0
                negation = False
            elif word in self.negative_words:
                word_score = -1.0
                if negation:
                    word_score = -word_score
                sentiment_score += word_score * intensity
                intensity = 1.0
                negation = False
        # Normalize by the number of sentiment words, then clamp to [-1, 1]
        word_count = len([w for w in words if w in self.positive_words or w in self.negative_words])
        if word_count > 0:
            final_score = sentiment_score / word_count
            final_score = max(-1.0, min(1.0, final_score))
        else:
            final_score = 0
        return final_score
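    # Minimal usage sketch (hypothetical inputs and results, not from the original file):
    #   analyzer = AdvancedBilibiliAnalyzer()
    #   analyzer.advanced_sentiment_analysis('这个教程非常实用')  # > 0: '实用' boosted by '非常'
    #   analyzer.advanced_sentiment_analysis('不喜欢这种内容')    # < 0: '喜欢' flipped by '不'
    #   analyzer.advanced_sentiment_analysis('今天天气一般')      # == 0: no lexicon hits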
    def analyze_emotion_trend(self, danmu_list):
        """Analyze the sentiment distribution of a danmaku list"""
        emotions = []
        for danmu in danmu_list[:500]:  # Cap at 500 danmaku to avoid overload
            score = self.advanced_sentiment_analysis(danmu)
            emotions.append(score)
        # Bucket scores into positive / negative / neutral
        positive = len([e for e in emotions if e > 0.1])
        negative = len([e for e in emotions if e < -0.1])
        neutral = len([e for e in emotions if -0.1 <= e <= 0.1])
        return {
            'positive': positive,
            'negative': negative,
            'neutral': neutral,
            'sentiment_scores': emotions
        }
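    # Bucketing example (hypothetical scores): [0.5, 0.05, -0.3, 0.0] yields
    # positive=1 (0.5 > 0.1), negative=1 (-0.3 < -0.1), and neutral=2
    # (0.05 and 0.0 fall inside the [-0.1, 0.1] band).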
    def extract_key_phrases(self, text_list, top_n=10):
        """Extract key phrases via TextRank"""
        text = ' '.join(text_list)
        # jieba's TextRank implementation, restricted to nouns and verbs
        keywords = jieba.analyse.textrank(text, topK=top_n, withWeight=True, allowPOS=('n', 'vn', 'v'))
        return keywords
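    # textrank only considers nouns ('n'), verbal nouns ('vn'), and verbs ('v') here,
    # and returns (phrase, weight) pairs sorted by descending TextRank weight, e.g.
    # [('视频', 1.0), ('效果', 0.83), ...] (illustrative weights, not real output).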
    def create_interactive_dashboard(self, video_data, danmu_data, emotion_data, keywords):
        """Build the interactive visualization dashboard"""
        # Subplot grid layout
        fig = make_subplots(
            rows=3, cols=3,
            subplot_titles=(
                'Video Popularity', 'Sentiment Distribution', 'Keyword Weights',
                'Danmaku Counts', 'Sentiment Trend', 'Hot Topics',
                'Video Overview', 'Interaction Analysis', 'Sentiment Histogram'
            ),
            specs=[
                [{"type": "bar"}, {"type": "pie"}, {"type": "bar"}],
                [{"type": "bar"}, {"type": "scatter"}, {"type": "bar"}],
                [{"type": "table"}, {"type": "bar"}, {"type": "histogram"}]
            ]
        )
        # 1. Video popularity comparison
        titles = [v['title'][:20] + '...' if len(v['title']) > 20 else v['title'] for v in video_data]
        views = [v['view'] for v in video_data]
        danmu_counts = [v['danmaku_count'] for v in video_data]
        fig.add_trace(
            go.Bar(name='Views', x=titles, y=views, marker_color='lightblue'),
            row=1, col=1
        )
        fig.add_trace(
            go.Bar(name='Danmaku', x=titles, y=danmu_counts, marker_color='lightcoral'),
            row=1, col=1
        )
        # 2. Sentiment distribution
        sentiment_labels = ['Positive', 'Neutral', 'Negative']
        sentiment_values = [emotion_data['positive'], emotion_data['neutral'], emotion_data['negative']]
        fig.add_trace(
            go.Pie(labels=sentiment_labels, values=sentiment_values,
                   marker_colors=['#2ecc71', '#f39c12', '#e74c3c']),
            row=1, col=2
        )
        # 3. Keyword weights
        if keywords:
            keyword_phrases = [kw[0] for kw in keywords]
            keyword_weights = [kw[1] for kw in keywords]
            fig.add_trace(
                go.Bar(x=keyword_weights, y=keyword_phrases, orientation='h',
                       marker_color='lightgreen', name='Keywords'),
                row=1, col=3
            )
        # 4. Filtered danmaku counts
        filtered_counts = [v['filtered_danmu_count'] for v in video_data]
        fig.add_trace(
            go.Bar(x=titles, y=filtered_counts, marker_color='purple', name='Valid danmaku'),
            row=2, col=1
        )
        # 5. Sentiment trend
        sentiment_scores = emotion_data['sentiment_scores'][:50]  # First 50 danmaku show the trend
        fig.add_trace(
            go.Scatter(y=sentiment_scores, mode='lines+markers',
                       line=dict(color='orange'), name='Sentiment trend'),
            row=2, col=2
        )
        # 6. Hot topics (by word frequency)
        if danmu_data:
            word_freq = self.get_word_frequency(danmu_data)
            top_words = word_freq.most_common(10)
            if top_words:
                words, counts = zip(*top_words)
                fig.add_trace(
                    go.Bar(x=list(words), y=list(counts), marker_color='lightseagreen', name='Hot words'),
                    row=2, col=3
                )
        # 7. Video info table
        table_data = [
            [v['title'][:15] + '...' for v in video_data],
            [v['owner'] for v in video_data],
            [f"{v['view']:,}" for v in video_data],
            [v['danmaku_count'] for v in video_data]
        ]
        fig.add_trace(
            go.Table(
                header=dict(values=['Title', 'Uploader', 'Views', 'Danmaku']),
                cells=dict(values=table_data)
            ),
            row=3, col=1
        )
        # 8. Interaction analysis (danmaku per thousand views)
        interaction_ratio = [v['danmaku_count'] / v['view'] * 1000 if v['view'] > 0 else 0 for v in video_data]
        fig.add_trace(
            go.Bar(x=titles, y=interaction_ratio, marker_color='brown',
                   name='Danmaku per 1k views'),
            row=3, col=2
        )
        # 9. Sentiment score histogram
        fig.add_trace(
            go.Histogram(x=emotion_data['sentiment_scores'], nbinsx=20,
                         marker_color='lightseagreen', name='Sentiment scores'),
            row=3, col=3
        )
        fig.update_layout(
            height=1200,
            title_text="Bilibili Danmaku Deep-Analysis Dashboard",
            showlegend=True,
            template="plotly_white"
        )
        fig.show()
        return fig
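    # fig.show() renders the dashboard in the browser; to keep a shareable artifact
    # as well, a call such as fig.write_html('dashboard.html') (standard Plotly API;
    # the filename is just an example) could be added before returning.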
    def get_word_frequency(self, text_list):
        """Compute word frequencies over all danmaku"""
        all_text = ' '.join(text_list)
        words = jieba.cut(all_text)
        # Chinese stop words; single-character tokens are already dropped by the
        # len(word) > 1 filter below
        stop_words = {'一个', '没有'}
        filtered_words = [
            word for word in words
            if len(word) > 1
            and word not in stop_words
            and not re.match(r'^\d+$', word)
        ]
        return Counter(filtered_words)
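    # The returned Counter feeds both the dashboard's hot-topics panel (via
    # word_freq.most_common(10)) and WordCloud.generate_from_frequencies() below.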
    def create_wordcloud_advanced(self, text_list, filename='advanced_wordcloud.png'):
        """Create an advanced word cloud"""
        text = ' '.join(text_list)
        # Build the cloud from explicit word frequencies
        word_freq = self.get_word_frequency(text_list)
        try:
            wordcloud = WordCloud(
                font_path='simhei.ttf',
                width=1600,
                height=800,
                background_color='white',
                max_words=200,
                colormap='viridis',
                relative_scaling=0.5,
                collocations=False
            ).generate_from_frequencies(word_freq)
            plt.figure(figsize=(20, 10))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Advanced Danmaku Word Cloud', fontsize=24, pad=20)
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            return wordcloud
        except Exception as e:
            print(f"Error while generating word cloud: {e}")
            # Fallback: generate directly from raw text without the Chinese font
            wordcloud = WordCloud(
                width=1600,
                height=800,
                background_color='white',
                max_words=200,
                colormap='viridis'
            ).generate(text)
            plt.figure(figsize=(20, 10))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Advanced Danmaku Word Cloud', fontsize=24, pad=20)
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            return wordcloud
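    # Note: font_path='simhei.ttf' assumes a SimHei font file sits in the working
    # directory. If it is missing, WordCloud raises an OSError, so the except branch
    # above regenerates the cloud without a CJK font and Chinese words may render
    # as empty boxes; pointing font_path at any installed CJK .ttf avoids this.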
    def generate_comprehensive_report(self, video_data, emotion_data, keywords):
        """Generate the comprehensive analysis report"""
        print("\n" + "=" * 80)
        print("📊 Bilibili Video Danmaku Deep-Analysis Report")
        print("=" * 80)
        # Overall statistics
        total_views = sum(v['view'] for v in video_data)
        total_danmu = sum(v['danmaku_count'] for v in video_data)
        total_videos = len(video_data)
        print(f"\n📈 Overall statistics:")
        print(f"  • Videos analyzed: {total_videos}")
        print(f"  • Total views: {total_views:,}")
        print(f"  • Total danmaku: {total_danmu}")
        print(f"  • Average views: {total_views / total_videos:,.0f}")
        # Sentiment report
        total_sentiments = emotion_data['positive'] + emotion_data['neutral'] + emotion_data['negative']
        if total_sentiments > 0:
            positive_ratio = emotion_data['positive'] / total_sentiments * 100
            negative_ratio = emotion_data['negative'] / total_sentiments * 100
            print(f"\n😊 Sentiment analysis:")
            print(f"  • Positive: {emotion_data['positive']} ({positive_ratio:.1f}%)")
            print(f"  • Neutral: {emotion_data['neutral']} ({(100 - positive_ratio - negative_ratio):.1f}%)")
            print(f"  • Negative: {emotion_data['negative']} ({negative_ratio:.1f}%)")
            if positive_ratio > 60:
                sentiment_verdict = "🌟 Very positive community atmosphere!"
            elif positive_ratio > 40:
                sentiment_verdict = "👍 Generally healthy community atmosphere"
            else:
                sentiment_verdict = "⚠️ Community atmosphere needs attention"
            print(f"  • Verdict: {sentiment_verdict}")
        # Hot topics
        print(f"\n🔥 Top 10 hot topics:")
        if keywords:
            for i, (phrase, weight) in enumerate(keywords[:10], 1):
                print(f"  {i:2d}. {phrase}: {weight:.3f}")
        else:
            print("  No keywords extracted")
        # Video ranking
        print(f"\n🏆 Video popularity ranking:")
        sorted_videos = sorted(video_data, key=lambda x: x['view'], reverse=True)
        for i, video in enumerate(sorted_videos, 1):
            title_short = video['title'][:25] + '...' if len(video['title']) > 25 else video['title']
            print(f"  {i:2d}. {title_short}")
            print(f"      Views: {video['view']:,} | Danmaku: {video['danmaku_count']} | Uploader: {video['owner']}")
        # Interaction analysis
        print(f"\n💬 Interaction quality:")
        for video in video_data:
            interaction_rate = video['danmaku_count'] / video['view'] * 1000 if video['view'] > 0 else 0
            title_short = video['title'][:20] + '...' if len(video['title']) > 20 else video['title']
            print(f"  • {title_short}: {interaction_rate:.2f}‰ (danmaku per thousand views)")
        # Content quality assessment
        if emotion_data['sentiment_scores']:
            avg_sentiment = np.mean(emotion_data['sentiment_scores'])
            sentiment_std = np.std(emotion_data['sentiment_scores'])
            print(f"\n📊 Content quality assessment:")
            print(f"  • Mean sentiment score: {avg_sentiment:.3f}")
            print(f"  • Sentiment volatility (std): {sentiment_std:.3f}")
            if avg_sentiment > 0.1:
                quality_verdict = "🎯 Excellent content, positive audience feedback"
            elif avg_sentiment > -0.1:
                quality_verdict = "✅ Good content, neutral audience feedback"
            else:
                quality_verdict = "💡 Content needs work, feedback skews negative"
            print(f"  • Verdict: {quality_verdict}")
    # Basic data-fetching methods kept from the original version
    def get_video_info(self, bvid):
        """Fetch video metadata from the Bilibili web API"""
        url = "https://api.bilibili.com/x/web-interface/view"
        params = {'bvid': bvid}
        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=10)
            if response.status_code == 200:
                data = response.json()
                if data.get('code') == 0:
                    video_data = data['data']
                    return {
                        'title': video_data['title'],
                        'cid': video_data['cid'],
                        'bvid': bvid,
                        'owner': video_data['owner']['name'],
                        'view': video_data['stat']['view'],
                        'danmaku_count': video_data['stat']['danmaku']
                    }
        except Exception as e:
            print(f"Failed to fetch video info: {e}")
        return None
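    # The endpoint responds with JSON roughly shaped like
    #   {"code": 0, "data": {"title": ..., "cid": ..., "owner": {"name": ...},
    #    "stat": {"view": ..., "danmaku": ...}}}
    # where a non-zero "code" signals an API-level error; both that case and any
    # request exception fall through to the final return None.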
    def get_danmu_data(self, cid):
        """Fetch danmaku for a video part (cid) from the legacy XML endpoint"""
        url = "https://api.bilibili.com/x/v1/dm/list.so"
        params = {'oid': cid}
        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=10)
            if response.status_code == 200:
                try:
                    import xml.etree.ElementTree as ET
                    root = ET.fromstring(response.content)
                    return [d.text for d in root.findall('d')]
                except Exception:
                    # Fallback: scrape <d> elements with a regex if XML parsing fails
                    content = response.content.decode('utf-8')
                    danmu_pattern = r'<d[^>]*>([^<]+)</d>'
                    return re.findall(danmu_pattern, content)
        except Exception as e:
            print(f"Failed to fetch danmaku: {e}")
        return []
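    # The legacy list.so endpoint returns XML roughly like
    #   <i>...<d p="23.50,1,25,16777215,...">danmaku text</d>...</i>
    # so each <d> element's text is one danmaku; the regex fallback scrapes the same
    # <d>...</d> bodies if XML parsing fails (e.g. on malformed markup).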
    def filter_noise(self, danmu_list):
        """Filter out noise danmaku (spam, memes, single characters)"""
        noise_words = ['666', '哈哈哈', '233', 'awsl', '哈哈哈哈', '妙啊', '好活', '点赞', '支持']
        filtered_danmu = []
        for danmu in danmu_list:
            if not danmu or len(danmu.strip()) <= 1:
                continue
            if any(noise in danmu for noise in noise_words):
                continue
            if danmu.strip().isdigit():
                continue
            if len(set(danmu)) <= 2:  # Too few distinct characters, e.g. 'aaaa'
                continue
            filtered_danmu.append(danmu.strip())
        return filtered_danmu
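# Filtering sketch on hypothetical danmaku: '666666' (contains a noise word and is
# all digits), '哈哈哈哈哈' (noise word), 'ababab' (only two distinct characters)
# are all dropped, while '这个讲解真的很清楚' passes through stripped.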
def main():
    """Entry point: analyze several trending videos"""
    # BV ids of the Bilibili videos to analyze
    hot_videos = [
        "BV1kg4y1T7PA",  # AI-related
        "BV1uC41177bh",  # Trending tech
        "BV1Gu4y1W7iK",  # Popular science
    ]
    print("🚀 Bilibili Trending-Video Danmaku Deep-Analysis System")
    print("=" * 50)
    analyzer = AdvancedBilibiliAnalyzer()
    # 1. Fetch data for all videos
    print("📥 Fetching video data...")
    all_danmu, video_data = analyzer.get_multiple_videos(hot_videos)
    if not all_danmu:
        print("❌ Could not fetch any danmaku, exiting")
        return
    print(f"✅ Fetched {len(all_danmu)} danmaku from {len(video_data)} videos")
    # 2. Sentiment analysis
    print("\n😊 Running sentiment analysis...")
    emotion_data = analyzer.analyze_emotion_trend(all_danmu)
    # 3. Keyword extraction
    print("\n🔍 Extracting key phrases...")
    keywords = analyzer.extract_key_phrases(all_danmu, 15)
    # 4. Advanced word cloud
    print("\n🎨 Generating word cloud...")
    analyzer.create_wordcloud_advanced(all_danmu, 'hot_videos_wordcloud.png')
    # 5. Interactive dashboard
    print("\n📊 Building the interactive dashboard...")
    analyzer.create_interactive_dashboard(video_data, all_danmu, emotion_data, keywords)
    # 6. Comprehensive report
    print("\n📋 Generating the comprehensive report...")
    analyzer.generate_comprehensive_report(video_data, emotion_data, keywords)
    print("\n🎉 Analysis complete!")
    print("📁 Generated artifacts:")
    print("  • hot_videos_wordcloud.png - advanced word cloud")
    print("  • interactive dashboard (opens in the browser)")


if __name__ == "__main__":
    main()