|
|
import requests
|
|
|
import pandas as pd
|
|
|
import re
|
|
|
import jieba
|
|
|
from collections import Counter
|
|
|
from wordcloud import WordCloud
|
|
|
import matplotlib.pyplot as plt
|
|
|
import time
|
|
|
import xml.etree.ElementTree as ET
|
|
|
import warnings
|
|
|
warnings.filterwarnings('ignore')
|
|
|
import time
|
|
|
import functools
|
|
|
from collections import defaultdict
|
|
|
|
|
|
# Global store of per-function performance samples, keyed by function name.
performance_data = defaultdict(list)


def _safe_memory_usage():
    """Best-effort current memory usage in MB; 0.0 when unavailable.

    get_memory_usage depends on the optional psutil package — monitoring
    must never crash the monitored call, so swallow any failure here.
    """
    try:
        return get_memory_usage()
    except Exception:
        return 0.0


def performance_monitor(func):
    """Decorator that records wall time and memory delta of each call.

    Samples are appended to the module-level ``performance_data`` under
    ``ClassName.method`` (when the first positional arg looks like an
    instance) or the bare function name, and a one-line summary is printed.
    The wrapped function's return value and exceptions pass through
    unchanged; timing is recorded even when the call raises.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution — unlike time.time(),
        # it cannot go backwards under clock adjustments.
        start_time = time.perf_counter()
        start_memory = _safe_memory_usage()

        try:
            return func(*args, **kwargs)
        finally:
            execution_time = time.perf_counter() - start_time
            memory_used = _safe_memory_usage() - start_memory

            # NOTE: for plain functions called with positional args this
            # labels the sample with the first argument's class (original
            # behavior, kept for compatibility of recorded keys).
            func_name = (
                f"{args[0].__class__.__name__}.{func.__name__}"
                if args else func.__name__
            )
            performance_data[func_name].append({
                'execution_time': execution_time,
                'memory_used': memory_used,
                'timestamp': time.time(),  # wall-clock timestamp of the sample
            })

            print(f"⏱️ {func_name}: {execution_time:.4f}s, 内存: {memory_used:.2f}MB")

    return wrapper
|
|
|
|
|
|
def get_memory_usage():
    """Return the current process resident set size (RSS) in MB.

    Returns 0.0 when psutil is not installed: psutil is an optional
    third-party dependency, and raising ImportError here would break every
    call wrapped by the performance-monitoring decorator.
    """
    import os
    try:
        import psutil
    except ImportError:
        return 0.0
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024
|
|
|
|
|
|
def generate_performance_report():
    """Print a per-function summary of collected performance samples.

    For every entry in the module-level ``performance_data`` this reports
    call count, average/max/total wall time and average memory delta, then
    a grand total of all recorded time.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("📊 详细性能分析报告")
    print(rule)

    grand_total = 0
    for name, records in performance_data.items():
        if not records:
            continue

        durations = [rec['execution_time'] for rec in records]
        mem_deltas = [rec['memory_used'] for rec in records]
        subtotal = sum(durations)
        grand_total += subtotal

        print(f"\n{name}:")
        print(f" 调用次数: {len(records)}")
        print(f" 平均时间: {subtotal / len(durations):.4f}s")
        print(f" 最长时间: {max(durations):.4f}s")
        print(f" 总时间: {subtotal:.4f}s")
        print(f" 平均内存: {sum(mem_deltas) / len(mem_deltas):.2f}MB")

    print(f"\n🎯 总执行时间: {grand_total:.4f}秒")
|
|
|
|
|
|
def apply_performance_monitoring():
    """Wrap every public BilibiliVideoAnalyzer method with performance_monitor.

    A "public" method is any callable attribute whose name does not start
    with an underscore; each is replaced in place on the class so all
    subsequent calls are timed.
    """
    public_names = [
        name for name in dir(BilibiliVideoAnalyzer)
        if not name.startswith('_')
        and callable(getattr(BilibiliVideoAnalyzer, name))
    ]
    for name in public_names:
        wrapped = performance_monitor(getattr(BilibiliVideoAnalyzer, name))
        setattr(BilibiliVideoAnalyzer, name, wrapped)
|
|
|
|
|
|
# Font fallbacks so matplotlib can render the Chinese chart labels used below.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
# With a CJK font active, keep the ASCII hyphen as minus so axes render correctly.
plt.rcParams['axes.unicode_minus'] = False
|
|
|
|
|
|
class BilibiliVideoAnalyzer:
    """Fetch a Bilibili video's danmu (bullet comments) and analyze how
    viewers discuss large language models.

    Pipeline pieces: metadata/CID lookup, danmu download, noise filtering,
    jieba word segmentation, application-area and opinion scoring, word
    cloud, matplotlib charts and an Excel/CSV export.
    """

    def __init__(self):
        # Browser-like headers; Bilibili's API rejects anonymous requests
        # without a Referer.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Referer': 'https://www.bilibili.com',
        }
        # Raw danmu strings; assigned by the caller (see main()) and read
        # back by save_to_excel().
        self.danmu_data = []

    def get_video_info(self, bvid):
        """Fetch basic metadata (including the CID) for video *bvid*.

        Returns a dict with title/cid/bvid/owner/view/danmaku_count, or
        None on any failure (network error, non-200 status, API error code).
        """
        url = "https://api.bilibili.com/x/web-interface/view"
        params = {'bvid': bvid}

        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=10)
            if response.status_code == 200:
                data = response.json()
                if data.get('code') == 0:
                    video_data = data['data']
                    print(f"视频标题: {video_data['title']}")
                    print(f"视频作者: {video_data['owner']['name']}")
                    print(f"播放量: {video_data['stat']['view']}")
                    print(f"弹幕数: {video_data['stat']['danmaku']}")

                    # The CID identifies the danmu stream of this video part.
                    cid = video_data['cid']
                    print(f"视频CID: {cid}")

                    return {
                        'title': video_data['title'],
                        'cid': cid,
                        'bvid': bvid,
                        'owner': video_data['owner']['name'],
                        'view': video_data['stat']['view'],
                        'danmaku_count': video_data['stat']['danmaku'],
                    }
                else:
                    print(f"API返回错误: {data.get('message')}")
            else:
                print(f"HTTP请求失败,状态码: {response.status_code}")
        except Exception as e:
            print(f"获取视频信息失败: {e}")

        return None

    def get_danmu_data(self, cid):
        """Download the danmu XML for *cid* and return a list of comment strings.

        Tries the stdlib XML parser first; falls back to a regex when the
        payload is not well-formed XML. Returns [] on any failure.
        """
        # fix: plain string (was an f-string with no placeholders)
        url = "https://api.bilibili.com/x/v1/dm/list.so"
        params = {'oid': cid}

        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=10)
            if response.status_code == 200:
                # Method 1: built-in XML parser (most reliable).
                try:
                    root = ET.fromstring(response.content)
                    # fix: skip <d> elements with empty/None text so the
                    # returned list never contains None entries.
                    danmu_list = [d.text for d in root.findall('d') if d.text]
                    print(f"使用内置XML解析器获取 {len(danmu_list)} 条弹幕")
                    return danmu_list
                except ET.ParseError:
                    # Method 2: regex over the raw body as a fallback.
                    try:
                        content = response.content.decode('utf-8')
                        danmu_pattern = r'<d[^>]*>([^<]+)</d>'
                        danmu_list = re.findall(danmu_pattern, content)
                        print(f"使用正则表达式获取 {len(danmu_list)} 条弹幕")
                        return danmu_list
                    except Exception as e:
                        print(f"正则表达式解析失败: {e}")
                except Exception as e:
                    print(f"XML解析失败: {e}")
            else:
                print(f"获取弹幕HTTP请求失败,状态码: {response.status_code}")
        except Exception as e:
            print(f"获取弹幕失败: {e}")

        return []

    def filter_noise(self, danmu_list):
        """Drop low-information danmu and return the stripped survivors.

        Filters: empty/one-char strings, entries containing meme/noise
        phrases, pure digits, and strings built from at most two distinct
        characters (e.g. "啊啊啊啊").
        """
        # Meme/noise phrases (duplicate '哈哈哈' from the original removed —
        # membership testing is unaffected).
        noise_words = [
            '666', '哈哈哈', '233', 'awsl', '哈哈哈哈', '妙啊', '好活',
            '点赞', '支持', '顶', '签到', '来了', '第一', '前排',
            '打卡', '报道', '路过', '围观', '沙发', '板凳',
            '笑死', 'hhhh', 'hhh', '啊啊啊', '哇', '哦', '嗯', '呃',
            '不错', '可以', '挺好', '好的', '谢谢', '感谢', '牛逼', '太强了'
        ]

        filtered_danmu = []

        for danmu in danmu_list:
            # Empty / too-short entries (also tolerates None items).
            if not danmu or len(danmu.strip()) <= 1:
                continue

            # Contains any noise phrase.
            if any(noise in danmu for noise in noise_words):
                continue

            # Pure digits.
            if danmu.strip().isdigit():
                continue

            # Mostly one repeated character.
            if len(set(danmu)) <= 2:
                continue

            filtered_danmu.append(danmu.strip())

        print(f"过滤后剩余 {len(filtered_danmu)} 条有效弹幕")
        return filtered_danmu

    def segment_and_count_words(self, danmu_list):
        """Tokenize the danmu with jieba and return (Counter, token list).

        LLM-domain phrases are registered in jieba's dictionary first so
        they survive segmentation; stop words, single characters and pure
        numbers are dropped from the result.
        """
        # Domain phrases jieba should treat as single tokens.
        custom_words = [
            '大语言模型', 'LLM', 'GPT', 'ChatGPT', '文心一言', '通义千问',
            '智谱', 'AI模型', '智能客服', '代码生成', '深度学习', '神经网络',
            '人工智能', '自然语言', '机器学习', 'AI技术', '模型训练', '应用成本',
            '数据安全', '隐私保护', '就业影响', '技术门槛', '内容创作', '智能助手',
            'AIGC', '多模态', '算法优化', '训练数据', '模型部署', 'API调用'
        ]
        for word in custom_words:
            jieba.add_word(word)

        all_text = ' '.join(danmu_list)
        words = jieba.cut(all_text)

        # Common Chinese function words to discard.
        stop_words = {
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个',
            '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好',
            '自己', '这个', '那个', '就是', '可以', '怎么', '什么', '这样', '这种', '这些',
            '还有', '一下', '一点', '一种', '一些', '那种',
            '那样', '这么', '那么', '为啥', '为什么', '怎么样', '如何'
        }

        filtered_words = [
            word for word in words
            if len(word) > 1
            and word not in stop_words
            and not re.match(r'^\d+$', word)
        ]

        word_freq = Counter(filtered_words)
        return word_freq, filtered_words

    def extract_llm_applications(self, word_freq, top_n=8):
        """Score LLM application areas by keyword frequency.

        Returns (top_applications, application_scores): the *top_n* areas as
        ``(name, score)`` pairs, and the full Counter of non-zero scores.
        """
        # Keyword buckets per application area.
        application_keywords = {
            '智能客服': ['客服', '客户服务', '问答', '咨询', '服务机器人', '智能问答', '在线客服'],
            '代码编程': ['编程', '代码', '程序员', '开发', 'Copilot', '代码生成', '编程助手', '软件开发', '程序'],
            '内容创作': ['写作', '创作', '文案', '文章', '内容生成', '写作文', '创作助手', '文案生成', '内容'],
            'AI翻译': ['翻译', '多语言', '语言翻译', '翻译工具', '跨语言', '机器翻译', '翻译软件'],
            '教育学习': ['教育', '学习', '教学', '辅导', '个性化学习', '学习助手', '教育AI', '在线教育', '老师'],
            '创意设计': ['创意', '设计', '艺术', '绘画', '音乐', '创意生成', '设计助手', '艺术创作', '美术'],
            '数据分析': ['数据', '分析', '报表', '报告生成', '数据处理', '数据分析', '数据挖掘', '统计'],
            '医疗健康': ['医疗', '诊断', '健康', '病历', '医学', '医疗AI', '健康咨询', '智能诊断', '医生'],
            '金融服务': ['金融', '风控', '投资', '银行', '保险', '金融分析', '风险控制', '量化交易', '理财'],
            '智能助手': ['助手', '语音助手', '个人助理', '智能助理', 'AI助手', '虚拟助手', '助理'],
            '游戏娱乐': ['游戏', 'NPC', '对话', '娱乐', '游戏AI', '角色对话', '游戏开发', '玩家'],
            '科研学术': ['科研', '学术', '论文', '文献', '研究', '学术助手', '科学计算', '科学家']
        }

        application_scores = Counter()

        # An area's score is the summed frequency of all its keywords;
        # zero-score areas are omitted entirely.
        for app_name, keywords in application_keywords.items():
            score = sum(word_freq.get(keyword, 0) for keyword in keywords)
            if score > 0:
                application_scores[app_name] = score

        top_applications = application_scores.most_common(top_n)
        return top_applications, application_scores

    def analyze_user_views(self, word_freq, danmu_list):
        """Score opinion dimensions and collect example danmu per dimension.

        Returns (views_analysis, specific_views): dimension → summed keyword
        frequency, and dimension → list of danmu containing any keyword.
        """
        # Keyword sets per opinion dimension.
        cost_keywords = ['成本', '价格', '昂贵', '便宜', '免费', '收费', '性价比', '投入', '预算', '费用', '花钱', '价值']
        positive_keywords = ['好用', '实用', '方便', '强大', '厉害', '优秀', '精准', '准确', '惊喜', '进步', '提升', '效率', '创新', '革命']
        negative_keywords = ['不行', '不好', '错误', '问题', '困难', '复杂', '昂贵', '糟糕', '缺陷', '不足', '局限', '风险', '错误', '偏差']
        security_keywords = ['安全', '隐私', '泄露', '保护', '风险', '威胁', '危险', '伦理', '道德', '监管', '规范']
        employment_keywords = ['失业', '工作', '岗位', '就业', '替代', '取代', '职业', '裁员', '淘汰', '人力']
        future_keywords = ['未来', '发展', '趋势', '前景', '潜力', '机会', '创新', '变革', '革命', '突破']
        technical_keywords = ['技术', '算法', '模型', '训练', '参数', '架构', '优化', '调参', '算力']

        # Total mentions per dimension (keyword frequencies, not danmu counts).
        views_analysis = {
            '应用成本': sum(word_freq.get(word, 0) for word in cost_keywords),
            '正面评价': sum(word_freq.get(word, 0) for word in positive_keywords),
            '负面评价': sum(word_freq.get(word, 0) for word in negative_keywords),
            '安全隐私': sum(word_freq.get(word, 0) for word in security_keywords),
            '就业影响': sum(word_freq.get(word, 0) for word in employment_keywords),
            '发展前景': sum(word_freq.get(word, 0) for word in future_keywords),
            '技术关注': sum(word_freq.get(word, 0) for word in technical_keywords),
        }

        # Example danmu containing at least one dimension keyword.
        specific_views = {
            '成本相关弹幕': [danmu for danmu in danmu_list if any(word in danmu for word in cost_keywords)],
            '安全问题弹幕': [danmu for danmu in danmu_list if any(word in danmu for word in security_keywords)],
            '就业影响弹幕': [danmu for danmu in danmu_list if any(word in danmu for word in employment_keywords)],
            '技术讨论弹幕': [danmu for danmu in danmu_list if any(word in danmu for word in technical_keywords)],
        }

        return views_analysis, specific_views

    def generate_wordcloud(self, words_list, filename='llm_wordcloud.png'):
        """Render, save and display a word cloud of *words_list*.

        Probes a list of CJK fonts and uses the first one WordCloud accepts
        (falling back to the default font). Returns the WordCloud object or
        None on failure.
        """
        text = ' '.join(words_list)

        # Candidate CJK fonts, Windows first, then macOS.
        font_paths = [
            'simhei.ttf',
            'msyh.ttc',
            'simsun.ttc',
            'Arial Unicode.ttf'  # macOS
        ]

        font_path = None
        for fp in font_paths:
            try:
                # Probe: generation succeeds only if the font is usable.
                WordCloud(font_path=fp).generate(text)
                font_path = fp
                print(f"使用字体: {fp}")
                break
            except Exception:  # fix: was a bare except
                continue

        try:
            # Shared settings; add the font only when a usable one was found.
            wc_kwargs = {
                'width': 1200,
                'height': 800,
                'background_color': 'white',
                'max_words': 200,
                'colormap': 'viridis',
                'relative_scaling': 0.5,
                'collocations': False,  # avoid duplicated bigrams
            }
            if font_path:
                wc_kwargs['font_path'] = font_path
            wordcloud = WordCloud(**wc_kwargs).generate(text)

            plt.figure(figsize=(15, 10))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('大语言模型应用弹幕词云分析', fontsize=20, pad=20)
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()

            return wordcloud
        except Exception as e:
            print(f"生成词云失败: {e}")
            return None

    def save_to_excel(self, word_freq, top_applications, application_scores, video_info, views_analysis, specific_views, filename='llm_analysis.xlsx'):
        """Write all analysis results to a multi-sheet Excel workbook.

        Falls back to CSV files when the Excel write fails (e.g. openpyxl
        missing). Returns True when either path succeeds, False otherwise.
        """
        try:
            video_df = pd.DataFrame([video_info])
            word_df = pd.DataFrame(word_freq.most_common(50), columns=['词语', '频次'])
            app_df = pd.DataFrame(application_scores.most_common(), columns=['应用领域', '出现次数'])
            top8_df = pd.DataFrame(top_applications, columns=['应用领域', '出现次数'])
            danmu_df = pd.DataFrame(self.danmu_data, columns=['弹幕内容'])

            # Percentage share per application area.
            if len(app_df) > 0 and app_df['出现次数'].sum() > 0:
                app_df['百分比'] = (app_df['出现次数'] / app_df['出现次数'].sum() * 100).round(2)

            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                video_df.to_excel(writer, sheet_name='视频信息', index=False)
                word_df.to_excel(writer, sheet_name='词频统计', index=False)
                app_df.to_excel(writer, sheet_name='应用领域统计', index=False)
                top8_df.to_excel(writer, sheet_name='TOP8应用案例', index=False)

                # Opinion-dimension sheet with percentage shares.
                views_df = pd.DataFrame(list(views_analysis.items()), columns=['观点维度', '提及次数'])
                if len(views_df) > 0 and views_df['提及次数'].sum() > 0:
                    views_df['百分比'] = (views_df['提及次数'] / views_df['提及次数'].sum() * 100).round(2)
                views_df.to_excel(writer, sheet_name='用户观点分析', index=False)

                # Up to 10 example danmu per opinion type.
                for view_type, examples in specific_views.items():
                    if examples:
                        example_df = pd.DataFrame(examples[:10], columns=[f'{view_type}示例'])
                        example_df.to_excel(writer, sheet_name=f'{view_type[:5]}示例', index=False)

                danmu_df.to_excel(writer, sheet_name='原始弹幕数据', index=False)

            # fix: message previously printed the literal "(unknown)" instead
            # of the actual output filename.
            print(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            print(f"保存Excel文件失败: {e}")
            # Fallback: plain CSV exports of the two key tables.
            try:
                word_df = pd.DataFrame(word_freq.most_common(50), columns=['词语', '频次'])
                word_df.to_csv('llm_word_freq.csv', index=False, encoding='utf-8-sig')
                pd.DataFrame(top_applications, columns=['应用领域', '出现次数']).to_csv('llm_applications.csv', index=False, encoding='utf-8-sig')
                print("数据已保存到CSV文件")
                return True
            except Exception as e2:
                print(f"保存CSV文件也失败: {e2}")
                return False

    def plot_top_applications(self, top_applications):
        """Show a bar chart of the top application areas (no-op when empty)."""
        if not top_applications:
            print("没有找到应用领域数据")
            return

        apps, counts = zip(*top_applications)

        plt.figure(figsize=(12, 8))
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD', '#98D8C8', '#F7DC6F']
        bars = plt.bar(apps, counts, color=colors[:len(apps)])

        # Value labels above each bar.
        for bar, count in zip(bars, counts):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                     f'{count}', ha='center', va='bottom', fontsize=12, fontweight='bold')

        plt.title('大语言模型应用领域TOP8分布', fontsize=16, pad=20)
        plt.xlabel('应用领域', fontsize=14)
        plt.ylabel('出现频次', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

    def plot_user_views(self, views_analysis):
        """Show a bar chart of opinion-dimension counts (no-op when all zero)."""
        if not views_analysis or sum(views_analysis.values()) == 0:
            print("没有用户观点数据可展示")
            return

        # Drop zero-count dimensions before plotting.
        filtered_views = {k: v for k, v in views_analysis.items() if v > 0}

        if not filtered_views:
            print("所有观点维度提及次数都为0")
            return

        categories, counts = zip(*filtered_views.items())

        plt.figure(figsize=(12, 8))
        colors = ['#FF9999', '#66B2FF', '#99FF99', '#FFD700', '#FFB6C1', '#87CEEB', '#98FB98']
        bars = plt.bar(categories, counts, color=colors[:len(categories)])

        # Value labels above each bar.
        for bar, count in zip(bars, counts):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                     f'{count}', ha='center', va='bottom', fontsize=12, fontweight='bold')

        plt.title('用户对大语言模型的观点分布', fontsize=16, pad=20)
        plt.xlabel('观点维度', fontsize=14)
        plt.ylabel('提及次数', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

    def analyze_conclusions(self, word_freq, top_applications, views_analysis, specific_views, video_title):
        """Print a structured, human-readable summary of all findings."""
        print("\n" + "="*60)
        print("大语言模型应用分析结论")
        print("="*60)

        print(f"\n分析视频: {video_title}")

        # 1. Application-area distribution.
        if top_applications:
            # fix: generator instead of an intermediate list inside sum().
            total_mentions = sum(count for _, count in top_applications)
            print(f"\n1. 主要应用领域分布:")
            for app, count in top_applications:
                percentage = (count / total_mentions) * 100 if total_mentions > 0 else 0
                print(f" - {app}: {count}次 ({percentage:.1f}%)")

        # 2. Opinion-dimension shares.
        print(f"\n2. 用户观点综合分析:")
        total_views = sum(views_analysis.values())
        if total_views > 0:
            for category, count in views_analysis.items():
                percentage = (count / total_views) * 100
                print(f" - {category}: {count}次 ({percentage:.1f}%)")

        # 3. Deep dives into specific concerns.
        print(f"\n3. 具体观点深入分析:")

        # Cost concerns.
        if total_views > 0:
            cost_ratio = views_analysis['应用成本'] / total_views * 100
            print(f" - 应用成本关注度: {cost_ratio:.1f}%")
            if specific_views['成本相关弹幕']:
                print(f" 代表性观点: {specific_views['成本相关弹幕'][0][:50]}...")

        # Security / privacy concerns.
        if total_views > 0:
            security_ratio = views_analysis['安全隐私'] / total_views * 100
            print(f" - 安全隐私关注度: {security_ratio:.1f}%")
            if specific_views['安全问题弹幕']:
                print(f" 代表性观点: {specific_views['安全问题弹幕'][0][:50]}...")

        # Employment impact.
        if total_views > 0:
            employment_ratio = views_analysis['就业影响'] / total_views * 100
            print(f" - 就业影响关注度: {employment_ratio:.1f}%")
            if specific_views['就业影响弹幕']:
                print(f" 代表性观点: {specific_views['就业影响弹幕'][0][:50]}...")

        # 4. Overall sentiment (positive vs negative mentions only).
        if (views_analysis['正面评价'] + views_analysis['负面评价']) > 0:
            positive_ratio = views_analysis['正面评价'] / (views_analysis['正面评价'] + views_analysis['负面评价']) * 100
            print(f"\n4. 总体评价倾向:")
            print(f" - 正面评价占比: {positive_ratio:.1f}%")
            if positive_ratio > 60:
                print(" - 用户态度: 总体积极乐观")
            elif positive_ratio < 40:
                print(" - 用户态度: 存在较多担忧")
            else:
                print(" - 用户态度: 理性看待,既有期待也有担忧")

        # 5. Future-development attention.
        if total_views > 0:
            future_ratio = views_analysis['发展前景'] / total_views * 100
            print(f"\n5. 技术发展趋势:")
            print(f" - 未来发展关注度: {future_ratio:.1f}%")
            if future_ratio > 15:
                print(" - 用户对LLM未来发展保持高度关注")

        # 6. Headline findings.
        print(f"\n6. 主要发现总结:")
        if views_analysis['应用成本'] > views_analysis['安全隐私']:
            print(" - 用户更关注应用成本而非安全问题")
        else:
            print(" - 用户对安全隐私问题的关注超过成本问题")

        if views_analysis['就业影响'] > 0:
            print(" - 就业替代效应已引起用户关注")

        if views_analysis['正面评价'] > views_analysis['负面评价']:
            print(" - 总体上用户对LLM技术持积极态度")
        else:
            print(" - 用户对LLM技术存在较多担忧")

        # 7. Technical-detail attention.
        if total_views > 0:
            tech_ratio = views_analysis['技术关注'] / total_views * 100
            print(f" - 技术细节讨论占比: {tech_ratio:.1f}%")
|
|
|
|
|
|
def extract_bvid_from_url(url):
    """Return the first BV id (``BV`` + 10 alphanumerics) in *url*, else None."""
    found = re.search(r'BV[0-9A-Za-z]{10}', url)
    return found.group() if found else None
|
|
|
|
|
|
def main():
    """Entry point: fetch one Bilibili video's danmu and run the full
    LLM-topic analysis pipeline (filter → segment → score → plot → export).

    The target video URL is hard-coded below; each stage aborts early with
    a message when its input is empty.
    """
    # Wrap every public analyzer method with the timing decorator.
    apply_performance_monitoring()
    # Extract the BV id from the (hard-coded) video URL.
    url = "https://www.bilibili.com/video/BV1kg4y1T7PA/?spm_id_from=333.337.search-card.all.click&vd_source=15df046f7c6c0dbb574611c9d3e4d5ef/"
    bvid = extract_bvid_from_url(url)

    if not bvid:
        print("无法从URL中提取BV号")
        return

    print(f"提取的BV号: {bvid}")

    analyzer = BilibiliVideoAnalyzer()

    # 1. Fetch video metadata and the CID of its danmu stream.
    print("\n获取视频信息...")
    video_info = analyzer.get_video_info(bvid)

    if not video_info:
        print("无法获取视频信息,程序结束")
        return

    # 2. Download the danmu.
    print("\n获取弹幕数据...")
    danmu_data = analyzer.get_danmu_data(video_info['cid'])

    if not danmu_data:
        print("无法获取弹幕数据,程序结束")
        return

    # Keep the raw danmu on the analyzer for the Excel export.
    analyzer.danmu_data = danmu_data

    # 3. Filter noise danmu.
    print("\n过滤噪声弹幕...")
    filtered_danmu = analyzer.filter_noise(danmu_data)

    if not filtered_danmu:
        print("过滤后无有效弹幕,程序结束")
        return

    # 4. Segment words and count frequencies.
    print("\n进行分词和词频统计...")
    word_freq, all_words = analyzer.segment_and_count_words(filtered_danmu)

    # Show the 20 most frequent words.
    print("\n前20个高频词:")
    for word, count in word_freq.most_common(20):
        print(f" {word}: {count}")

    # 5. Extract LLM application cases.
    print("\n提取LLM应用案例...")
    top_applications, application_scores = analyzer.extract_llm_applications(word_freq, 8)

    # 6. Analyze user opinions.
    print("\n分析用户观点...")
    views_analysis, specific_views = analyzer.analyze_user_views(word_freq, filtered_danmu)

    # 7. Show the TOP-8 application areas.
    if top_applications:
        print("\nTOP 8 大语言模型应用领域:")
        for i, (app, count) in enumerate(top_applications, 1):
            print(f"{i}. {app}: {count}次")

        analyzer.plot_top_applications(top_applications)
    else:
        print("未识别到明显的LLM应用领域")

    # 8. Show the opinion analysis.
    print("\n用户观点分析:")
    for category, count in views_analysis.items():
        print(f" {category}: {count}次")

    analyzer.plot_user_views(views_analysis)

    # 9. Generate the word cloud.
    print("\n生成词云图...")
    analyzer.generate_wordcloud(all_words)

    # 10. Save everything to Excel (CSV fallback inside).
    print("\n保存数据到Excel...")
    success = analyzer.save_to_excel(word_freq, top_applications, application_scores, video_info, views_analysis, specific_views)

    if success:
        print("数据分析完成!")
    else:
        print("数据分析完成,但数据保存失败")

    # 11. Print the analysis conclusions.
    analyzer.analyze_conclusions(word_freq, top_applications, views_analysis, specific_views, video_info['title'])
|
|
|
|
|
|
# Run the pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()