# Source exported from a git web UI; original upload header:
#   "ADD file via upload" — branch dev — fzu102301341 (6 months ago)
#   parent 7b87171f78, commit 6c2fb95c32, hunk @@ -0,0 +1,838 @@
import requests
import pandas as pd
import re
import jieba
from collections import Counter, defaultdict
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import time
import xml.etree.ElementTree as ET
import warnings
import functools
import psutil
import os
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass
warnings.filterwarnings('ignore')
# 常量定义
class Constants:
    """All tunables and keyword dictionaries used by the analyser."""

    # HTTP settings for the Bilibili web API.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Referer': 'https://www.bilibili.com',
    }
    TIMEOUT = 10            # seconds per HTTP request
    TOP_N_APPLICATIONS = 8  # bars in the application-domain chart
    TOP_N_WORDS = 20        # high-frequency words printed to the console
    MAX_WORDS_CLOUD = 200   # word-cloud vocabulary cap

    # Filler danmaku fragments: a danmaku containing ANY of these is dropped.
    # BUGFIX: this set previously contained empty strings '' (characters lost
    # in an encoding round-trip).  Membership is tested with `noise in danmu`,
    # and `'' in s` is True for every string, so EVERY danmaku was classified
    # as noise and filtered out.  The empty entries are removed.
    NOISE_WORDS = {
        '666', '哈哈哈', '233', 'awsl', '哈哈哈哈', '妙啊', '好活',
        '点赞', '支持', '签到', '来了', '第一', '前排',
        '打卡', '报道', '路过', '围观', '沙发', '板凳',
        '笑死', 'hhhh', 'hhh', '啊啊啊',
        '不错', '可以', '挺好', '好的', '谢谢', '感谢', '牛逼', '太强了'
    }

    # Tokens excluded from word-frequency statistics.
    # NOTE(review): many single-character stop words (e.g. 的/了/是/我) were
    # lost in the same encoding accident (they appeared as '') and could not
    # be recovered from this copy — re-add them if word clouds look noisy.
    STOP_WORDS = {
        '一个', '没有',
        '自己', '这个', '那个', '就是', '可以', '怎么', '什么', '这样', '这种', '这些',
        '还有', '一下', '一点', '一种', '一些', '那种',
        '那样', '这么', '那么', '为啥', '为什么', '怎么样', '如何'
    }

    # Domain vocabulary registered with jieba so these terms are not split.
    CUSTOM_WORDS = [
        '大语言模型', 'LLM', 'GPT', 'ChatGPT', '文心一言', '通义千问',
        '智谱', 'AI模型', '智能客服', '代码生成', '深度学习', '神经网络',
        '人工智能', '自然语言', '机器学习', 'AI技术', '模型训练', '应用成本',
        '数据安全', '隐私保护', '就业影响', '技术门槛', '内容创作', '智能助手',
        'AIGC', '多模态', '算法优化', '训练数据', '模型部署', 'API调用'
    ]

    # Application domains and the keywords that score towards each.
    APPLICATION_KEYWORDS = {
        '智能客服': ['客服', '客户服务', '问答', '咨询', '服务机器人', '智能问答', '在线客服'],
        '代码编程': ['编程', '代码', '程序员', '开发', 'Copilot', '代码生成', '编程助手', '软件开发', '程序'],
        '内容创作': ['写作', '创作', '文案', '文章', '内容生成', '写作文', '创作助手', '文案生成', '内容'],
        'AI翻译': ['翻译', '多语言', '语言翻译', '翻译工具', '跨语言', '机器翻译', '翻译软件'],
        '教育学习': ['教育', '学习', '教学', '辅导', '个性化学习', '学习助手', '教育AI', '在线教育', '老师'],
        '创意设计': ['创意', '设计', '艺术', '绘画', '音乐', '创意生成', '设计助手', '艺术创作', '美术'],
        '数据分析': ['数据', '分析', '报表', '报告生成', '数据处理', '数据分析', '数据挖掘', '统计'],
        '医疗健康': ['医疗', '诊断', '健康', '病历', '医学', '医疗AI', '健康咨询', '智能诊断', '医生'],
        '金融服务': ['金融', '风控', '投资', '银行', '保险', '金融分析', '风险控制', '量化交易', '理财'],
        '智能助手': ['助手', '语音助手', '个人助理', '智能助理', 'AI助手', '虚拟助手', '助理'],
        '游戏娱乐': ['游戏', 'NPC', '对话', '娱乐', '游戏AI', '角色对话', '游戏开发', '玩家'],
        '科研学术': ['科研', '学术', '论文', '文献', '研究', '学术助手', '科学计算', '科学家']
    }

    # Viewpoint dimensions and the keywords that score towards each.
    VIEW_KEYWORDS = {
        '应用成本': ['成本', '价格', '昂贵', '便宜', '免费', '收费', '性价比', '投入', '预算', '费用', '花钱', '价值'],
        '正面评价': ['好用', '实用', '方便', '强大', '厉害', '优秀', '精准', '准确', '惊喜', '进步', '提升', '效率', '创新', '革命'],
        '负面评价': ['不行', '不好', '错误', '问题', '困难', '复杂', '昂贵', '糟糕', '缺陷', '不足', '局限', '风险', '错误', '偏差'],
        '安全隐私': ['安全', '隐私', '泄露', '保护', '风险', '威胁', '危险', '伦理', '道德', '监管', '规范'],
        '就业影响': ['失业', '工作', '岗位', '就业', '替代', '取代', '职业', '裁员', '淘汰', '人力'],
        '发展前景': ['未来', '发展', '趋势', '前景', '潜力', '机会', '创新', '变革', '革命', '突破'],
        '技术关注': ['技术', '算法', '模型', '训练', '参数', '架构', '优化', '调参', '算力']
    }

    # Candidate CJK-capable fonts for the word cloud, tried in order.
    FONT_PATHS = ['simhei.ttf', 'msyh.ttc', 'simsun.ttc', 'Arial Unicode.ttf']
    # Bar colours for the application chart.
    COLORS = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD', '#98D8C8', '#F7DC6F']
@dataclass
class VideoInfo:
    """Metadata for one Bilibili video, as extracted from the web API payload."""
    title: str          # video title
    cid: int            # content id, used to fetch the danmaku stream
    bvid: str           # BV identifier ('' when absent from the payload)
    owner: str          # uploader display name
    view: int           # play count
    danmaku_count: int  # danmaku count reported by the API
@dataclass
class AnalysisResult:
    """Aggregated outputs of one full danmaku-analysis run."""
    word_freq: Counter                       # token -> frequency
    top_applications: List[Tuple[str, int]]  # top domains as (name, score)
    application_scores: Counter              # all domain scores
    views_analysis: Dict[str, int]           # viewpoint dimension -> mention count
    specific_views: Dict[str, List[str]]     # topic -> example danmaku
class PerformanceMonitor:
    """Records execution time and memory usage of instrumented callables."""

    def __init__(self):
        # func label -> list of {'execution_time', 'memory_used', 'timestamp'} records
        self.performance_data = defaultdict(list)

    def monitor(self, func):
        """Decorator that times *func* and logs its memory delta per call."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            t0 = time.time()
            mem0 = self._get_memory_usage()
            try:
                return func(*args, **kwargs)
            finally:
                # Recorded even when func raises, so failures still show up.
                elapsed = time.time() - t0
                mem_delta = self._get_memory_usage() - mem0
                label = self._get_function_name(func, args)
                self.performance_data[label].append({
                    'execution_time': elapsed,
                    'memory_used': mem_delta,
                    'timestamp': time.time()
                })
                print(f"⏱️ {label}: {elapsed:.4f}s, 内存: {mem_delta:.2f}MB")
        return wrapper

    def _get_memory_usage(self) -> float:
        """Return the resident set size of this process, in MB."""
        return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024

    def _get_function_name(self, func, args) -> str:
        """Qualify *func* with the class name of its first argument, if any."""
        if args and hasattr(args[0], '__class__'):
            return f"{args[0].__class__.__name__}.{func.__name__}"
        return func.__name__

    def generate_report(self):
        """Print aggregate timing/memory statistics for every monitored call."""
        print("\n" + "="*60)
        print("📊 详细性能分析报告")
        print("="*60)
        total_time = 0
        for label, records in self.performance_data.items():
            if not records:
                continue
            times = [rec['execution_time'] for rec in records]
            mems = [rec['memory_used'] for rec in records]
            total_time += sum(times)
            print(f"\n{label}:")
            print(f" 调用次数: {len(records)}")
            print(f" 平均时间: {sum(times) / len(times):.4f}s")
            print(f" 最长时间: {max(times):.4f}s")
            print(f" 总时间: {sum(times):.4f}s")
            print(f" 平均内存: {sum(mems) / len(mems):.2f}MB")
        print(f"\n🎯 总执行时间: {total_time:.4f}")
class DataValidator:
    """Validation helpers for API payloads and danmaku lists."""

    @staticmethod
    def validate_video_info(data: Dict) -> Optional[VideoInfo]:
        """Turn a raw API response dict into a VideoInfo, or None when invalid."""
        if not data or data.get('code') != 0:
            return None
        payload = data['data']
        stat = payload['stat']
        return VideoInfo(
            title=payload['title'],
            cid=payload['cid'],
            bvid=payload.get('bvid', ''),
            owner=payload['owner']['name'],
            view=stat['view'],
            danmaku_count=stat['danmaku']
        )

    @staticmethod
    def validate_danmu_data(danmu_list: List[str]) -> List[str]:
        """Drop entries that are empty or whitespace-only; keep originals as-is."""
        return [item for item in danmu_list if item and item.strip()]
class DanmuParser:
    """Extracts danmaku text from Bilibili's XML danmaku payload."""

    @staticmethod
    def parse_with_xml(content: bytes) -> List[str]:
        """Parse via ElementTree; returns [] on malformed XML."""
        try:
            tree = ET.fromstring(content)
        except ET.ParseError:
            return []
        return [node.text for node in tree.findall('d') if node.text]

    @staticmethod
    def parse_with_regex(content: bytes) -> List[str]:
        """Fallback parser: regex-scan the decoded document for <d> elements."""
        try:
            return re.findall(r'<d[^>]*>([^<]+)</d>', content.decode('utf-8'))
        except Exception:
            return []
class TextProcessor:
    """Cleans danmaku text and produces word-frequency statistics."""

    def __init__(self):
        self._setup_jieba()

    def _setup_jieba(self):
        """Register the domain-specific vocabulary with jieba's dictionary."""
        for term in Constants.CUSTOM_WORDS:
            jieba.add_word(term)

    def filter_noise(self, danmu_list: List[str]) -> List[str]:
        """Strip whitespace and drop danmaku classified as noise."""
        kept = [
            item.strip() for item in danmu_list
            if not self._is_noise_danmu(item.strip())
        ]
        print(f"过滤后剩余 {len(kept)} 条有效弹幕")
        return kept

    def _is_noise_danmu(self, danmu: str) -> bool:
        """True when the danmaku is too short, a known filler, numeric, or repetitive."""
        if len(danmu) <= 1:
            return True  # empty or a single character
        if any(noise in danmu for noise in Constants.NOISE_WORDS):
            return True  # contains a filler phrase
        if danmu.isdigit():
            return True  # pure number
        # Fewer than three distinct characters, e.g. "哈哈哈哈"-style repetition.
        return len(set(danmu)) <= 2

    def segment_and_count_words(self, danmu_list: List[str]) -> Tuple[Counter, List[str]]:
        """Tokenise all danmaku and count the words that survive filtering."""
        tokens = jieba.cut(' '.join(danmu_list))
        kept_words = [
            tok for tok in tokens
            if len(tok) > 1
            and tok not in Constants.STOP_WORDS
            and not re.match(r'^\d+$', tok)
        ]
        return Counter(kept_words), kept_words
class ApplicationAnalyzer:
    """Scores application domains by summing keyword frequencies."""

    @staticmethod
    def extract_applications(word_freq: Counter, top_n: int = Constants.TOP_N_APPLICATIONS) -> Tuple[List[Tuple[str, int]], Counter]:
        """Aggregate keyword counts per domain; return the top_n plus all scores."""
        scores = Counter()
        for domain, keywords in Constants.APPLICATION_KEYWORDS.items():
            hits = sum(word_freq.get(keyword, 0) for keyword in keywords)
            if hits:
                # Domains with zero hits are intentionally omitted from scores.
                scores[domain] = hits
        return scores.most_common(top_n), scores
class ViewAnalyzer:
    """Summarises user opinions along the configured viewpoint dimensions."""

    @staticmethod
    def analyze_views(word_freq: Counter, danmu_list: List[str]) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
        """Count viewpoint keyword mentions and collect example danmaku per topic."""
        views_analysis = {}
        for category, keywords in Constants.VIEW_KEYWORDS.items():
            views_analysis[category] = sum(word_freq.get(word, 0) for word in keywords)

        def matching(topic):
            # Every danmaku that mentions at least one keyword of the topic.
            words = Constants.VIEW_KEYWORDS[topic]
            return [item for item in danmu_list if any(word in item for word in words)]

        specific_views = {
            '成本相关弹幕': matching('应用成本'),
            '安全问题弹幕': matching('安全隐私'),
            '就业影响弹幕': matching('就业影响'),
            '技术讨论弹幕': matching('技术关注'),
        }
        return views_analysis, specific_views
class Visualizer:
    """Renders the word cloud and bar charts for the analysis results."""

    def __init__(self):
        self._setup_matplotlib()

    def _setup_matplotlib(self):
        """Configure matplotlib so CJK labels render correctly."""
        plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
        plt.rcParams['axes.unicode_minus'] = False

    def generate_wordcloud(self, words_list: List[str], filename: str = 'llm_wordcloud.png') -> Optional[WordCloud]:
        """Build, display and save a word cloud; returns None on any failure."""
        text = ' '.join(words_list)
        font_path = self._find_available_font(text)
        try:
            wordcloud_params = {
                'width': 1200,
                'height': 800,
                'background_color': 'white',
                'max_words': Constants.MAX_WORDS_CLOUD,
                'colormap': 'viridis',
                'relative_scaling': 0.5,
                'collocations': False  # avoid duplicated bigrams in the cloud
            }
            if font_path:
                wordcloud_params['font_path'] = font_path
            wordcloud = WordCloud(**wordcloud_params).generate(text)
            plt.figure(figsize=(15, 10))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('大语言模型应用弹幕词云分析', fontsize=20, pad=20)
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            return wordcloud
        except Exception as e:
            print(f"生成词云失败: {e}")
            return None

    def _find_available_font(self, text: str) -> Optional[str]:
        """Probe candidate fonts by rendering; return the first usable path.

        NOTE(review): generating a throwaway cloud per candidate is slow, but
        it is the most reliable availability check WordCloud offers.
        """
        for font_path in Constants.FONT_PATHS:
            try:
                WordCloud(font_path=font_path).generate(text)
                print(f"使用字体: {font_path}")
                return font_path
            except Exception:
                # BUGFIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit; narrowed to Exception.
                continue
        return None

    def plot_top_applications(self, top_applications: List[Tuple[str, int]]):
        """Draw a labelled bar chart of the top application domains."""
        if not top_applications:
            print("没有找到应用领域数据")
            return
        apps, counts = zip(*top_applications)
        plt.figure(figsize=(12, 8))
        colors = Constants.COLORS[:len(apps)]
        bars = plt.bar(apps, counts, color=colors)
        # Put the count above each bar.
        for bar, count in zip(bars, counts):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                     f'{count}', ha='center', va='bottom', fontsize=12, fontweight='bold')
        plt.title('大语言模型应用领域TOP8分布', fontsize=16, pad=20)
        plt.xlabel('应用领域', fontsize=14)
        plt.ylabel('出现频次', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

    def plot_user_views(self, views_analysis: Dict[str, int]):
        """Draw a labelled bar chart of viewpoint mention counts (non-zero only)."""
        filtered_views = {k: v for k, v in views_analysis.items() if v > 0}
        if not filtered_views:
            print("所有观点维度提及次数都为0")
            return
        categories, counts = zip(*filtered_views.items())
        plt.figure(figsize=(12, 8))
        colors = ['#FF9999', '#66B2FF', '#99FF99', '#FFD700', '#FFB6C1', '#87CEEB', '#98FB98']
        bars = plt.bar(categories, counts, color=colors[:len(categories)])
        # Put the count above each bar.
        for bar, count in zip(bars, counts):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                     f'{count}', ha='center', va='bottom', fontsize=12, fontweight='bold')
        plt.title('用户对大语言模型的观点分布', fontsize=16, pad=20)
        plt.xlabel('观点维度', fontsize=14)
        plt.ylabel('提及次数', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()
class DataExporter:
    """Persists analysis results to an Excel workbook, with CSV fallback."""

    @staticmethod
    def save_to_excel(analysis_result: AnalysisResult, video_info: VideoInfo, danmu_data: List[str],
                      filename: str = 'llm_analysis.xlsx') -> bool:
        """Write every result table into one workbook.

        Falls back to CSV files when the Excel writer fails (e.g. openpyxl
        missing).  Returns True when either path succeeds.
        """
        try:
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                # Video metadata
                video_df = pd.DataFrame([video_info.__dict__])
                video_df.to_excel(writer, sheet_name='视频信息', index=False)
                # Word frequencies (top 50)
                word_df = pd.DataFrame(analysis_result.word_freq.most_common(50), columns=['词语', '频次'])
                word_df.to_excel(writer, sheet_name='词频统计', index=False)
                # Application-domain scores with percentages
                app_df = DataExporter._prepare_application_data(analysis_result.application_scores)
                app_df.to_excel(writer, sheet_name='应用领域统计', index=False)
                # Top-8 applications
                top8_df = pd.DataFrame(analysis_result.top_applications, columns=['应用领域', '出现次数'])
                top8_df.to_excel(writer, sheet_name='TOP8应用案例', index=False)
                # Viewpoint statistics
                views_df = DataExporter._prepare_views_data(analysis_result.views_analysis)
                views_df.to_excel(writer, sheet_name='用户观点分析', index=False)
                # Example danmaku per viewpoint
                DataExporter._save_view_examples(writer, analysis_result.specific_views)
                # Raw danmaku
                danmu_df = pd.DataFrame(danmu_data, columns=['弹幕内容'])
                danmu_df.to_excel(writer, sheet_name='原始弹幕数据', index=False)
            # BUGFIX: this message previously printed the literal text
            # "(unknown)" instead of the actual output path.
            print(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            print(f"保存Excel文件失败: {e}")
            return DataExporter._save_to_csv(analysis_result, video_info)

    @staticmethod
    def _prepare_application_data(application_scores: Counter) -> pd.DataFrame:
        """Return domain scores as a DataFrame with a percentage column."""
        app_df = pd.DataFrame(application_scores.most_common(), columns=['应用领域', '出现次数'])
        if not app_df.empty and app_df['出现次数'].sum() > 0:
            app_df['百分比'] = (app_df['出现次数'] / app_df['出现次数'].sum() * 100).round(2)
        return app_df

    @staticmethod
    def _prepare_views_data(views_analysis: Dict[str, int]) -> pd.DataFrame:
        """Return viewpoint counts as a DataFrame with a percentage column."""
        views_df = pd.DataFrame(list(views_analysis.items()), columns=['观点维度', '提及次数'])
        if not views_df.empty and views_df['提及次数'].sum() > 0:
            views_df['百分比'] = (views_df['提及次数'] / views_df['提及次数'].sum() * 100).round(2)
        return views_df

    @staticmethod
    def _save_view_examples(writer, specific_views: Dict[str, List[str]]):
        """Write up to 10 example danmaku per viewpoint into its own sheet."""
        for view_type, examples in specific_views.items():
            if examples:
                example_df = pd.DataFrame(examples[:10], columns=[f'{view_type}示例'])
                # Sheet names are truncated to keep them short and valid.
                example_df.to_excel(writer, sheet_name=f'{view_type[:5]}示例', index=False)

    @staticmethod
    def _save_to_csv(analysis_result: AnalysisResult, video_info: VideoInfo) -> bool:
        """Fallback: dump the two key tables to CSV (Excel-friendly BOM encoding)."""
        try:
            word_df = pd.DataFrame(analysis_result.word_freq.most_common(50), columns=['词语', '频次'])
            word_df.to_csv('llm_word_freq.csv', index=False, encoding='utf-8-sig')
            app_df = pd.DataFrame(analysis_result.top_applications, columns=['应用领域', '出现次数'])
            app_df.to_csv('llm_applications.csv', index=False, encoding='utf-8-sig')
            print("数据已保存到CSV文件")
            return True
        except Exception as e:
            print(f"保存CSV文件也失败: {e}")
            return False
class BilibiliVideoAnalyzer:
    """Orchestrates the whole pipeline: fetch video metadata and danmaku from
    the Bilibili web API, clean and analyse the text, then render reports.

    All public methods are wrapped with the performance monitor at
    construction time (see _apply_performance_monitoring).
    """
    def __init__(self):
        self.headers = Constants.HEADERS
        self.danmu_data = []  # raw danmaku of the most recently analysed video
        self.performance_monitor = PerformanceMonitor()
        self.validator = DataValidator()
        self.text_processor = TextProcessor()
        self.application_analyzer = ApplicationAnalyzer()
        self.view_analyzer = ViewAnalyzer()
        self.visualizer = Visualizer()
        self.exporter = DataExporter()
        # Instrument all public methods with timing/memory logging.
        self._apply_performance_monitoring()
    def _apply_performance_monitoring(self):
        """Wrap every public callable attribute with the performance monitor.

        NOTE(review): dir(self) would also match callable non-method
        attributes; the collaborator objects assigned above are not callable,
        so in practice only bound methods get wrapped — verify if new
        callable attributes are ever added.
        """
        for method_name in dir(self):
            if not method_name.startswith('_') and callable(getattr(self, method_name)):
                original_method = getattr(self, method_name)
                setattr(self, method_name, self.performance_monitor.monitor(original_method))
    def get_video_info(self, bvid: str) -> Optional[VideoInfo]:
        """Fetch video metadata (including CID) for *bvid*; None on any failure."""
        url = "https://api.bilibili.com/x/web-interface/view"
        params = {'bvid': bvid}
        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=Constants.TIMEOUT)
            if response.status_code == 200:
                data = response.json()
                video_info = self.validator.validate_video_info(data)
                if video_info:
                    self._print_video_info(video_info)
                    return video_info
                else:
                    print(f"API返回错误: {data.get('message', '未知错误')}")
            else:
                print(f"HTTP请求失败状态码: {response.status_code}")
        except Exception as e:
            print(f"获取视频信息失败: {e}")
        return None
    def _print_video_info(self, video_info: VideoInfo):
        """Echo the fetched metadata to the console."""
        print(f"视频标题: {video_info.title}")
        print(f"视频作者: {video_info.owner}")
        print(f"播放量: {video_info.view}")
        print(f"弹幕数: {video_info.danmaku_count}")
        print(f"视频CID: {video_info.cid}")
    def get_danmu_data(self, cid: int) -> List[str]:
        """Fetch the danmaku stream for *cid*; returns [] on any failure."""
        url = "https://api.bilibili.com/x/v1/dm/list.so"
        params = {'oid': cid}
        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=Constants.TIMEOUT)
            if response.status_code == 200:
                # Strategy 1: proper XML parsing.
                danmu_list = DanmuParser.parse_with_xml(response.content)
                if danmu_list:
                    print(f"使用内置XML解析器获取 {len(danmu_list)} 条弹幕")
                    return self.validator.validate_danmu_data(danmu_list)
                # Strategy 2: regex fallback for payloads ElementTree rejects.
                danmu_list = DanmuParser.parse_with_regex(response.content)
                if danmu_list:
                    print(f"使用正则表达式获取 {len(danmu_list)} 条弹幕")
                    return self.validator.validate_danmu_data(danmu_list)
                print("两种解析方法都未能获取弹幕数据")
            else:
                print(f"获取弹幕HTTP请求失败状态码: {response.status_code}")
        except Exception as e:
            print(f"获取弹幕失败: {e}")
        return []
    def analyze_video(self, bvid: str) -> Optional[AnalysisResult]:
        """Run the full pipeline for one video; None when any stage yields nothing."""
        # Fetch video metadata (needed for the CID).
        video_info = self.get_video_info(bvid)
        if not video_info:
            return None
        # Fetch the danmaku stream.
        danmu_data = self.get_danmu_data(video_info.cid)
        if not danmu_data:
            return None
        self.danmu_data = danmu_data
        # Drop noise danmaku.
        filtered_danmu = self.text_processor.filter_noise(danmu_data)
        if not filtered_danmu:
            return None
        # Tokenise and count word frequencies.
        word_freq, all_words = self.text_processor.segment_and_count_words(filtered_danmu)
        self._print_top_words(word_freq)
        # Score application domains.
        top_applications, application_scores = self.application_analyzer.extract_applications(word_freq)
        # Analyse user viewpoints.
        views_analysis, specific_views = self.view_analyzer.analyze_views(word_freq, filtered_danmu)
        return AnalysisResult(
            word_freq=word_freq,
            top_applications=top_applications,
            application_scores=application_scores,
            views_analysis=views_analysis,
            specific_views=specific_views
        )
    def _print_top_words(self, word_freq: Counter):
        """Print the TOP_N_WORDS most frequent tokens."""
        print(f"\n{Constants.TOP_N_WORDS}个高频词:")
        for word, count in word_freq.most_common(Constants.TOP_N_WORDS):
            print(f" {word}: {count}")
    def generate_report(self, analysis_result: AnalysisResult, video_info: VideoInfo):
        """Render charts, the word cloud, and the Excel export, then conclusions."""
        # Top application domains.
        if analysis_result.top_applications:
            print(f"\nTOP {Constants.TOP_N_APPLICATIONS} 大语言模型应用领域:")
            for i, (app, count) in enumerate(analysis_result.top_applications, 1):
                print(f"{i}. {app}: {count}")
            self.visualizer.plot_top_applications(analysis_result.top_applications)
        else:
            print("未识别到明显的LLM应用领域")
        # Viewpoint distribution.
        print("\n用户观点分析:")
        for category, count in analysis_result.views_analysis.items():
            print(f" {category}: {count}")
        self.visualizer.plot_user_views(analysis_result.views_analysis)
        # Word cloud from the most frequent tokens.
        print("\n生成词云图...")
        self.visualizer.generate_wordcloud([
            word for word, _ in analysis_result.word_freq.most_common(Constants.MAX_WORDS_CLOUD)
        ])
        # Persist everything to Excel (CSV fallback inside the exporter).
        print("\n保存数据到Excel...")
        success = self.exporter.save_to_excel(
            analysis_result, video_info, self.danmu_data
        )
        if success:
            print("数据分析完成!")
        else:
            print("数据分析完成,但数据保存失败")
        # Narrative conclusions.
        self.analyze_conclusions(analysis_result, video_info.title)
    def analyze_conclusions(self, analysis_result: AnalysisResult, video_title: str):
        """Print a structured textual summary of the analysis."""
        print("\n" + "="*60)
        print("大语言模型应用分析结论")
        print("="*60)
        print(f"\n分析视频: {video_title}")
        # 1. Application-domain distribution.
        if analysis_result.top_applications:
            total_mentions = sum(count for _, count in analysis_result.top_applications)
            print(f"\n1. 主要应用领域分布:")
            for app, count in analysis_result.top_applications:
                percentage = (count / total_mentions) * 100 if total_mentions > 0 else 0
                print(f" - {app}: {count}次 ({percentage:.1f}%)")
        # 2. Viewpoint distribution.
        print(f"\n2. 用户观点综合分析:")
        total_views = sum(analysis_result.views_analysis.values())
        if total_views > 0:
            for category, count in analysis_result.views_analysis.items():
                percentage = (count / total_views) * 100
                print(f" - {category}: {count}次 ({percentage:.1f}%)")
        # 3-6. Drill-downs, sentiment, trends, findings.
        self._analyze_detailed_views(analysis_result, total_views)
        self._analyze_sentiment(analysis_result)
        self._analyze_trends(analysis_result, total_views)
        self._summarize_findings(analysis_result, total_views)
    def _analyze_detailed_views(self, analysis_result: AnalysisResult, total_views: int):
        """Print attention ratios and one example danmaku for key topics."""
        print(f"\n3. 具体观点深入分析:")
        if total_views > 0:
            # Cost concerns.
            cost_ratio = analysis_result.views_analysis['应用成本'] / total_views * 100
            print(f" - 应用成本关注度: {cost_ratio:.1f}%")
            if analysis_result.specific_views['成本相关弹幕']:
                print(f" 代表性观点: {analysis_result.specific_views['成本相关弹幕'][0][:50]}...")
            # Security/privacy concerns.
            security_ratio = analysis_result.views_analysis['安全隐私'] / total_views * 100
            print(f" - 安全隐私关注度: {security_ratio:.1f}%")
            if analysis_result.specific_views['安全问题弹幕']:
                print(f" 代表性观点: {analysis_result.specific_views['安全问题弹幕'][0][:50]}...")
            # Employment-impact concerns.
            employment_ratio = analysis_result.views_analysis['就业影响'] / total_views * 100
            print(f" - 就业影响关注度: {employment_ratio:.1f}%")
            if analysis_result.specific_views['就业影响弹幕']:
                print(f" 代表性观点: {analysis_result.specific_views['就业影响弹幕'][0][:50]}...")
    def _analyze_sentiment(self, analysis_result: AnalysisResult):
        """Classify overall sentiment from positive vs negative mention counts."""
        positive = analysis_result.views_analysis['正面评价']
        negative = analysis_result.views_analysis['负面评价']
        if (positive + negative) > 0:
            positive_ratio = positive / (positive + negative) * 100
            print(f"\n4. 总体评价倾向:")
            print(f" - 正面评价占比: {positive_ratio:.1f}%")
            # Thresholds 40/60 split sentiment into three buckets.
            if positive_ratio > 60:
                print(" - 用户态度: 总体积极乐观")
            elif positive_ratio < 40:
                print(" - 用户态度: 存在较多担忧")
            else:
                print(" - 用户态度: 理性看待,既有期待也有担忧")
    def _analyze_trends(self, analysis_result: AnalysisResult, total_views: int):
        """Report how much attention future development receives."""
        if total_views > 0:
            future_ratio = analysis_result.views_analysis['发展前景'] / total_views * 100
            print(f"\n5. 技术发展趋势:")
            print(f" - 未来发展关注度: {future_ratio:.1f}%")
            # 15% is the (arbitrary) bar for "high attention".
            if future_ratio > 15:
                print(" - 用户对LLM未来发展保持高度关注")
    def _summarize_findings(self, analysis_result: AnalysisResult, total_views: int):
        """Print headline comparisons between the viewpoint dimensions."""
        print(f"\n6. 主要发现总结:")
        # Cost vs security attention.
        if analysis_result.views_analysis['应用成本'] > analysis_result.views_analysis['安全隐私']:
            print(" - 用户更关注应用成本而非安全问题")
        else:
            print(" - 用户对安全隐私问题的关注超过成本问题")
        # Employment impact mentioned at all?
        if analysis_result.views_analysis['就业影响'] > 0:
            print(" - 就业替代效应已引起用户关注")
        # Overall attitude.
        positive = analysis_result.views_analysis['正面评价']
        negative = analysis_result.views_analysis['负面评价']
        if positive > negative:
            print(" - 总体上用户对LLM技术持积极态度")
        else:
            print(" - 用户对LLM技术存在较多担忧")
        # Share of technical discussion.
        if total_views > 0:
            tech_ratio = analysis_result.views_analysis['技术关注'] / total_views * 100
            print(f" - 技术细节讨论占比: {tech_ratio:.1f}%")
def extract_bvid_from_url(url: str) -> Optional[str]:
    """Return the first BV identifier (BV + 10 alphanumerics) in *url*, or None."""
    found = re.search(r'BV[0-9A-Za-z]{10}', url)
    return None if found is None else found.group()
def main():
    """Entry point: resolve the BV id, run the analysis, and emit all reports."""
    video_url = "https://www.bilibili.com/video/BV1kg4y1T7PA/?spm_id_from=333.337.search-card.all.click&vd_source=15df046f7c6c0dbb574611c9d3e4d5ef/"
    bvid = extract_bvid_from_url(video_url)
    if bvid is None:
        print("无法从URL中提取BV号")
        return
    print(f"提取的BV号: {bvid}")
    analyzer = BilibiliVideoAnalyzer()
    analysis_result = analyzer.analyze_video(bvid)
    if analysis_result:
        # Metadata is fetched a second time just for the report; the copy used
        # during analysis is internal to analyze_video.
        video_info = analyzer.get_video_info(bvid)
        if video_info:
            analyzer.generate_report(analysis_result, video_info)
        # Timing/memory summary of the instrumented calls.
        analyzer.performance_monitor.generate_report()


if __name__ == "__main__":
    main()