|
|
|
|
@ -0,0 +1,816 @@
|
|
|
|
|
import requests
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import re
|
|
|
|
|
import jieba
|
|
|
|
|
from collections import Counter
|
|
|
|
|
from wordcloud import WordCloud
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
import time
|
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
import warnings
|
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
|
import json
|
|
|
|
|
# NOTE(review): this silences every warning for the whole process, including
# deprecation notices raised by the libraries imported above. Narrow the
# filter (category/module) if those notices should stay visible.
warnings.filterwarnings('ignore')

# Chart titles and labels in this file are Chinese, so list sans-serif fonts
# that ship CJK glyphs first; 'DejaVu Sans' is the last-resort fallback.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
# Draw negative numbers with an ASCII hyphen instead of the Unicode minus
# sign, which the fonts above may not contain.
plt.rcParams['axes.unicode_minus'] = False
|
|
|
|
|
|
|
|
|
|
class TechMediaAnalyzer:
    """Analyze tech-media coverage of large language models (LLMs).

    The article set is a hard-coded sample (see ``crawl_tech_news``). The
    other methods score trend categories from the articles' keywords, derive
    rule-based predictions and plot both with matplotlib.
    """

    def __init__(self):
        # Browser-like request headers. No method of this class issues an
        # HTTP request yet; they are kept for a future real crawler.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }

    def crawl_tech_news(self):
        """Return the sample list of tech-media articles about LLMs.

        No network access happens here: the data is a fixed placeholder that
        a real implementation would replace with results from media APIs or
        scraped pages.

        Returns:
            list[dict]: one dict per article with the keys ``source``,
            ``title``, ``content``, ``date`` (``YYYY-MM-DD`` string) and
            ``trend_keywords`` (list of str).
        """
        print("开始爬取主流科技媒体观点...")

        # Placeholder data standing in for a real crawl.
        tech_articles = [
            {
                'source': '36氪',
                'title': '大语言模型正在重构软件产业格局',
                'content': '随着ChatGPT等大语言模型的普及,软件开发、内容创作、客户服务等领域正在经历深刻变革。专家预测,未来两年内,超过30%的企业将部署大语言模型应用。',
                'date': '2024-01-15',
                'trend_keywords': ['软件开发', '自动化', '效率提升', '产业变革'],
            },
            {
                'source': '虎嗅',
                'title': 'LLM应用成本下降,中小企业迎来机遇',
                'content': '大语言模型的API调用成本持续下降,使得中小企业也能够负担得起先进的AI能力。这将在教育、医疗、金融等领域催生大量创新应用。',
                'date': '2024-01-10',
                'trend_keywords': ['成本下降', '中小企业', 'API经济', '创新应用'],
            },
            {
                'source': '钛媒体',
                'title': '多模态大模型将成为下一代AI竞争焦点',
                'content': '从纯文本到图像、音频、视频的多模态理解能力,大语言模型正在向更全面的AI助手演进。这将开启人机交互的新时代。',
                'date': '2024-01-08',
                'trend_keywords': ['多模态', '人机交互', 'AI助手', '技术演进'],
            },
            {
                'source': '机器之心',
                'title': '大模型在科学发现中的潜力开始显现',
                'content': '研究人员开始利用大语言模型加速科学发现过程,在药物研发、材料科学、天文物理等领域取得初步成果。',
                'date': '2024-01-05',
                'trend_keywords': ['科学发现', '药物研发', '材料科学', '研究加速'],
            },
            {
                'source': '量子位',
                'title': '边缘计算与大模型结合成为新趋势',
                'content': '为了降低延迟和保护隐私,大模型正在向边缘设备迁移。手机、物联网设备上的本地AI能力将大幅提升。',
                'date': '2024-01-03',
                'trend_keywords': ['边缘计算', '隐私保护', '本地AI', '物联网'],
            },
            {
                'source': 'InfoQ',
                'title': '大语言模型推动编程范式变革',
                'content': 'AI编程助手正在改变软件开发的工作方式,从代码生成到调试优化,大模型在软件开发生命周期中发挥越来越重要的作用。',
                'date': '2023-12-28',
                'trend_keywords': ['编程范式', 'AI编程', '软件开发', '效率革命'],
            },
            {
                'source': 'CSDN',
                'title': '开源大模型生态快速发展',
                'content': '随着Llama、ChatGLM等开源模型的发布,大语言模型的技术门槛大幅降低,开发者社区涌现大量创新应用。',
                'date': '2023-12-25',
                'trend_keywords': ['开源生态', '技术民主化', '开发者社区', '创新爆发'],
            },
            {
                'source': '极客公园',
                'title': 'AI安全与对齐成为关注焦点',
                'content': '随着大模型能力增强,AI安全、价值观对齐、可控生成等技术挑战日益突出,相关研究投入快速增长。',
                'date': '2023-12-20',
                'trend_keywords': ['AI安全', '价值观对齐', '可控AI', '伦理治理'],
            },
        ]

        print(f"爬取到 {len(tech_articles)} 篇科技媒体报道")
        return tech_articles

    def analyze_trends(self, articles):
        """Score trend categories from the articles' keywords.

        Args:
            articles: iterable of article dicts; only ``trend_keywords`` is
                read, and an article without that key contributes nothing.

        Returns:
            tuple: ``(trend_scores, predictions, keyword_freq)`` where
            ``trend_scores`` maps each category name to the summed frequency
            of its keywords, ``predictions`` is the result of
            ``generate_predictions`` and ``keyword_freq`` is a ``Counter`` of
            every keyword seen.
        """
        print("\n分析大语言模型应用发展趋势...")

        # Frequency of every trend keyword across all articles.
        keyword_freq = Counter(
            keyword
            for article in articles
            for keyword in article.get('trend_keywords', [])
        )

        # Category -> keywords whose frequencies add up to the category score.
        trends_analysis = {
            '技术演进': ['多模态', '开源生态', '边缘计算', '模型优化', '算法改进'],
            '应用场景': ['软件开发', '科学发现', '教育医疗', '金融服务', '内容创作'],
            '产业影响': ['成本下降', '中小企业', '效率提升', '产业变革', '就业影响'],
            '社会影响': ['AI安全', '隐私保护', '伦理治理', '技术民主化', '人机协作'],
        }

        trend_scores = {
            category: sum(keyword_freq.get(keyword, 0) for keyword in keywords)
            for category, keywords in trends_analysis.items()
        }

        predictions = self.generate_predictions(articles, keyword_freq)
        return trend_scores, predictions, keyword_freq

    def generate_predictions(self, articles, keyword_freq):
        """Build rule-based trend predictions from keyword frequencies.

        Args:
            articles: accepted for interface compatibility; not read.
            keyword_freq: mapping of keyword -> occurrence count.

        Returns:
            list[dict]: prediction dicts with the keys ``trend``,
            ``prediction``, ``timeframe`` and ``confidence``. When no rule
            fires, two generic fallback predictions are returned, so the list
            is never empty.
        """
        # (keyword, count that must be exceeded, prediction emitted).
        # NOTE(review): in the built-in sample articles each of these
        # keywords occurs exactly once, so none of the rules fires and the
        # fallback below is what gets returned. Confirm the thresholds.
        rules = [
            ('多模态', 2, {
                'trend': '多模态融合',
                'prediction': '大语言模型将深度融合视觉、语音等多模态能力,成为真正的通用AI助手',
                'timeframe': '1-2年',
                'confidence': '高',
            }),
            ('成本下降', 1, {
                'trend': '应用普及',
                'prediction': '随着成本下降和技术成熟,大语言模型应用将从大企业向中小企业快速普及',
                'timeframe': '6-18个月',
                'confidence': '高',
            }),
            ('边缘计算', 1, {
                'trend': '边缘部署',
                'prediction': '大模型将更多部署在边缘设备,实现更低延迟和更好隐私保护的本地AI应用',
                'timeframe': '1-2年',
                'confidence': '中',
            }),
            ('AI安全', 1, {
                'trend': '安全治理',
                'prediction': 'AI安全、价值观对齐和伦理治理将成为技术发展和应用部署的关键考量',
                'timeframe': '持续关注',
                'confidence': '高',
            }),
            ('开源生态', 1, {
                'trend': '生态繁荣',
                'prediction': '开源大模型生态将加速创新,催生大量垂直领域和特定场景的定制化应用',
                'timeframe': '6-12个月',
                'confidence': '高',
            }),
        ]

        predictions = [
            dict(prediction)
            for keyword, threshold, prediction in rules
            if keyword_freq.get(keyword, 0) > threshold
        ]
        if predictions:
            return predictions

        # Guarantee at least some predictions for the downstream report.
        return [
            {
                'trend': '技术融合',
                'prediction': '大语言模型将与其他AI技术深度融合,创造新的应用范式',
                'timeframe': '1-2年',
                'confidence': '中',
            },
            {
                'trend': '行业渗透',
                'prediction': '大语言模型将加速向传统行业渗透,推动数字化转型',
                'timeframe': '6-18个月',
                'confidence': '高',
            },
        ]

    def plot_trend_analysis(self, trend_scores, predictions):
        """Show a bar chart of category scores, then the prediction timeline.

        Args:
            trend_scores: mapping of category name -> score. When empty the
                bar chart is skipped instead of failing on tuple unpacking.
            predictions: list of prediction dicts, forwarded to
                ``plot_prediction_timeline``.
        """
        if trend_scores:
            categories, scores = zip(*trend_scores.items())

            plt.figure(figsize=(12, 8))
            colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
            bars = plt.bar(categories, scores, color=colors)

            # Print each score just above its bar.
            for bar, score in zip(bars, scores):
                plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
                         f'{score}', ha='center', va='bottom', fontsize=12, fontweight='bold')

            plt.title('大语言模型发展趋势领域分布', fontsize=16, pad=20)
            plt.xlabel('趋势领域', fontsize=14)
            plt.ylabel('关注度得分', fontsize=14)
            plt.grid(axis='y', alpha=0.3)
            plt.tight_layout()
            plt.show()
        else:
            print("没有趋势得分数据可展示")

        self.plot_prediction_timeline(predictions)

    def plot_prediction_timeline(self, predictions):
        """Show predictions as points on a coarse time axis.

        Each prediction is drawn on its own row; the x position comes from
        its ``timeframe`` and the colour from its ``confidence``.

        Args:
            predictions: list of dicts with the keys ``trend``, ``timeframe``
                and ``confidence``. Nothing is drawn when the list is empty.
        """
        if not predictions:
            print("没有预测数据可展示")
            return

        _, ax = plt.subplots(figsize=(14, 8))

        # Timeframe text -> x position; unknown timeframes fall back to 1.
        timeframe_map = {
            '6-12个月': 1,
            '6-18个月': 1.5,
            '1-2年': 2,
            '持续关注': 3,
        }

        # Confidence level -> marker colour; unknown levels are drawn grey.
        confidence_colors = {
            '高': '#2E8B57',  # green
            '中': '#FFA500',  # orange
            '低': '#FF4500',  # red
        }

        labelled_levels = set()
        for row, pred in enumerate(predictions):
            x_pos = timeframe_map.get(pred['timeframe'], 1)
            level = pred['confidence']
            color = confidence_colors.get(level, '#808080')

            # Label the first point of every confidence level so the legend
            # lists each level exactly once (labelling only the very first
            # point would hide all other levels from the legend).
            label = None
            if level not in labelled_levels:
                labelled_levels.add(level)
                label = f"{level}置信度"

            ax.scatter(x_pos, row, color=color, s=200, alpha=0.7, label=label)
            ax.text(x_pos + 0.1, row,
                    f"{pred['trend']}\n({pred['timeframe']})",
                    va='center', fontsize=10)

        ax.set_yticks(list(range(len(predictions))))
        ax.set_yticklabels([pred['trend'] for pred in predictions])
        ax.set_xlabel('时间范围', fontsize=12)
        ax.set_title('大语言模型应用发展预测时间线', fontsize=16, pad=20)
        ax.grid(axis='x', alpha=0.3)
        ax.legend(loc='upper right')

        # Only the integer positions get a tick label.
        ax.set_xticks([1, 2, 3])
        ax.set_xticklabels(['近期(6-12个月)', '中期(1-2年)', '长期(2年以上)'])

        plt.tight_layout()
        plt.show()
|
|
|
|
|
|
|
|
|
|
class BilibiliVideoAnalyzer:
    """Fetch a Bilibili video's danmaku (bullet comments) and analyze them.

    The pipeline driven by ``main`` is: fetch video metadata, fetch danmaku,
    filter noise, segment into words, score LLM application areas and opinion
    dimensions, then plot and export the results.
    """

    def __init__(self):
        # Browser-like headers sent with every request made by this class.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Referer': 'https://www.bilibili.com',
        }
        # Raw danmaku; assigned by the caller and exported by save_to_excel.
        self.danmu_data = []

    def get_video_info(self, bvid):
        """Fetch video metadata, including the CID needed to load danmaku.

        Args:
            bvid: the video's BV id.

        Returns:
            dict | None: a dict with the keys ``title``, ``cid``, ``bvid``,
            ``owner``, ``view`` and ``danmaku_count``; ``None`` when the
            request fails, the API reports an error or the payload lacks an
            expected field (the reason is printed).
        """
        url = "https://api.bilibili.com/x/web-interface/view"
        params = {'bvid': bvid}

        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=10)
            if response.status_code != 200:
                print(f"HTTP请求失败,状态码: {response.status_code}")
                return None

            data = response.json()
            if data.get('code') != 0:
                print(f"API返回错误: {data.get('message')}")
                return None

            video_data = data['data']
            info = {
                'title': video_data['title'],
                'cid': video_data['cid'],
                'bvid': bvid,
                'owner': video_data['owner']['name'],
                'view': video_data['stat']['view'],
                'danmaku_count': video_data['stat']['danmaku'],
            }
        except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
            # Network failure, non-JSON body or an unexpected payload shape.
            print(f"获取视频信息失败: {e}")
            return None

        print(f"视频标题: {info['title']}")
        print(f"视频作者: {info['owner']}")
        print(f"播放量: {info['view']}")
        print(f"弹幕数: {info['danmaku_count']}")
        print(f"视频CID: {info['cid']}")
        return info

    def get_danmu_data(self, cid):
        """Download the danmaku of a video part identified by its CID.

        The response is parsed as XML first; if it is not well-formed, a
        regular expression is used to pull the text out of ``<d>`` elements.

        Args:
            cid: the CID returned by ``get_video_info``.

        Returns:
            list[str]: the non-empty danmaku texts, or ``[]`` on any failure
            (the reason is printed).
        """
        url = "https://api.bilibili.com/x/v1/dm/list.so"
        params = {'oid': cid}

        try:
            response = requests.get(url, params=params, headers=self.headers, timeout=10)
        except requests.RequestException as e:
            print(f"获取弹幕失败: {e}")
            return []

        if response.status_code != 200:
            print(f"获取弹幕HTTP请求失败,状态码: {response.status_code}")
            return []

        # Method 1: the standard-library XML parser.
        try:
            root = ET.fromstring(response.content)
        except ET.ParseError:
            # Method 2: regex fallback for malformed XML. Undecodable bytes
            # are replaced so that one bad byte cannot discard everything.
            # NOTE: XML entities (e.g. &amp;) stay escaped on this path.
            content = response.content.decode('utf-8', errors='replace')
            danmu_list = re.findall(r'<d[^>]*>([^<]+)</d>', content)
            print(f"使用正则表达式获取 {len(danmu_list)} 条弹幕")
            return danmu_list

        # An empty <d/> element has text None; skip it so that callers only
        # ever receive strings.
        danmu_list = [d.text for d in root.findall('d') if d.text]
        print(f"使用内置XML解析器获取 {len(danmu_list)} 条弹幕")
        return danmu_list

    def filter_noise(self, danmu_list):
        """Drop danmaku that carry no analyzable content.

        A danmaku is discarded when, after stripping whitespace, it

        * is empty or a single character,
        * consists of digits only,
        * uses at most two distinct characters (e.g. "啊啊啊啊"), or
        * consists solely of noise tokens and punctuation (e.g. "前排打卡").

        A danmaku that merely *contains* a noise token next to real content
        (e.g. "大模型可以写代码") is kept; matching noise tokens as plain
        substrings would throw such comments away.

        Args:
            danmu_list: iterable of danmaku strings; falsy entries are skipped.

        Returns:
            list[str]: the surviving danmaku, whitespace-stripped, in order.
        """
        noise_words = {
            '666', '哈哈哈', '233', 'awsl', '哈哈哈哈', '妙啊', '好活',
            '点赞', '支持', '顶', '签到', '来了', '第一', '前排',
            '打卡', '报道', '路过', '围观', '沙发', '板凳',
            '笑死', 'hhhh', 'hhh', '啊啊啊', '哇', '哦', '嗯', '呃',
            '不错', '可以', '挺好', '好的', '谢谢', '感谢', '牛逼', '太强了',
        }
        # Longest alternative first so '哈哈哈哈' is removed before '哈哈哈'.
        noise_pattern = re.compile('|'.join(
            re.escape(word) for word in sorted(noise_words, key=len, reverse=True)
        ))

        filtered_danmu = []
        for danmu in danmu_list:
            if not danmu:
                continue
            text = danmu.strip()

            if len(text) <= 1 or text.isdigit():
                continue

            # NOTE(review): this rule also drops short legitimate terms such
            # as "AI" or "LLM" (two distinct characters) — confirm intent.
            if len(set(text)) <= 2:
                continue

            # Whatever remains once the noise tokens are removed must hold
            # at least one letter or digit to count as real content.
            remainder = noise_pattern.sub('', text)
            if not any(char.isalnum() for char in remainder):
                continue

            filtered_danmu.append(text)

        print(f"过滤后剩余 {len(filtered_danmu)} 条有效弹幕")
        return filtered_danmu

    def segment_and_count_words(self, danmu_list):
        """Segment danmaku with jieba and count word frequencies.

        Domain terms are registered with jieba first so that they are kept
        as single tokens. Stop words, single-character tokens and purely
        numeric tokens are discarded.

        Args:
            danmu_list: iterable of danmaku strings.

        Returns:
            tuple: ``(word_freq, filtered_words)`` — a ``Counter`` of the
            kept words and the kept words themselves in text order.
        """
        custom_words = [
            '大语言模型', 'LLM', 'GPT', 'ChatGPT', '文心一言', '通义千问',
            '智谱', 'AI模型', '智能客服', '代码生成', '深度学习', '神经网络',
            '人工智能', '自然语言', '机器学习', 'AI技术', '模型训练', '应用成本',
            '数据安全', '隐私保护', '就业影响', '技术门槛', '内容创作', '智能助手',
            'AIGC', '多模态', '算法优化', '训练数据', '模型部署', 'API调用',
        ]
        for word in custom_words:
            jieba.add_word(word)

        stop_words = {
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个',
            '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好',
            '自己', '这个', '那个', '就是', '可以', '怎么', '什么', '这样', '这种', '这些',
            '还有', '一下', '一点', '一种', '一些', '那种',
            '那样', '这么', '那么', '为啥', '为什么', '怎么样', '如何',
        }

        all_text = ' '.join(danmu_list)
        numeric = re.compile(r'\d+')

        filtered_words = []
        for token in jieba.cut(all_text):
            word = token.strip()  # jieba also yields whitespace-only tokens
            if len(word) <= 1 or word in stop_words or numeric.fullmatch(word):
                continue
            filtered_words.append(word)

        word_freq = Counter(filtered_words)
        return word_freq, filtered_words

    def extract_llm_applications(self, word_freq, top_n=8):
        """Score LLM application areas by the frequency of their keywords.

        Args:
            word_freq: mapping of word -> count (e.g. from
                ``segment_and_count_words``).
            top_n: how many of the best-scoring areas to return.

        Returns:
            tuple: ``(top_applications, application_scores)`` — a list of
            ``(area, score)`` pairs sorted by descending score, and a
            ``Counter`` holding every area whose score is positive.
        """
        # Application area -> keywords counted towards it.
        application_keywords = {
            '智能客服': ['客服', '客户服务', '问答', '咨询', '服务机器人', '智能问答', '在线客服'],
            '代码编程': ['编程', '代码', '程序员', '开发', 'Copilot', '代码生成', '编程助手', '软件开发', '程序'],
            '内容创作': ['写作', '创作', '文案', '文章', '内容生成', '写作文', '创作助手', '文案生成', '内容'],
            'AI翻译': ['翻译', '多语言', '语言翻译', '翻译工具', '跨语言', '机器翻译', '翻译软件'],
            '教育学习': ['教育', '学习', '教学', '辅导', '个性化学习', '学习助手', '教育AI', '在线教育', '老师'],
            '创意设计': ['创意', '设计', '艺术', '绘画', '音乐', '创意生成', '设计助手', '艺术创作', '美术'],
            '数据分析': ['数据', '分析', '报表', '报告生成', '数据处理', '数据分析', '数据挖掘', '统计'],
            '医疗健康': ['医疗', '诊断', '健康', '病历', '医学', '医疗AI', '健康咨询', '智能诊断', '医生'],
            '金融服务': ['金融', '风控', '投资', '银行', '保险', '金融分析', '风险控制', '量化交易', '理财'],
            '智能助手': ['助手', '语音助手', '个人助理', '智能助理', 'AI助手', '虚拟助手', '助理'],
            '游戏娱乐': ['游戏', 'NPC', '对话', '娱乐', '游戏AI', '角色对话', '游戏开发', '玩家'],
            '科研学术': ['科研', '学术', '论文', '文献', '研究', '学术助手', '科学计算', '科学家'],
        }

        application_scores = Counter()
        for app_name, keywords in application_keywords.items():
            score = sum(word_freq.get(keyword, 0) for keyword in keywords)
            if score > 0:  # areas never mentioned are left out entirely
                application_scores[app_name] = score

        top_applications = application_scores.most_common(top_n)
        return top_applications, application_scores

    def analyze_user_views(self, word_freq, danmu_list):
        """Summarize viewer opinions along several dimensions.

        Args:
            word_freq: mapping of word -> count.
            danmu_list: the danmaku strings the counts were derived from.

        Returns:
            tuple: ``(views_analysis, specific_views)`` — a dict mapping each
            opinion dimension to the summed frequency of its keywords, and a
            dict mapping four example categories to the danmaku that contain
            at least one keyword of that category.
        """
        # Each keyword appears once per dimension: a duplicate entry would be
        # counted twice in the sums below.
        dimension_keywords = {
            '应用成本': ['成本', '价格', '昂贵', '便宜', '免费', '收费', '性价比', '投入', '预算', '费用', '花钱', '价值'],
            '正面评价': ['好用', '实用', '方便', '强大', '厉害', '优秀', '精准', '准确', '惊喜', '进步', '提升', '效率', '创新', '革命'],
            '负面评价': ['不行', '不好', '错误', '问题', '困难', '复杂', '昂贵', '糟糕', '缺陷', '不足', '局限', '风险', '偏差'],
            '安全隐私': ['安全', '隐私', '泄露', '保护', '风险', '威胁', '危险', '伦理', '道德', '监管', '规范'],
            '就业影响': ['失业', '工作', '岗位', '就业', '替代', '取代', '职业', '裁员', '淘汰', '人力'],
            '发展前景': ['未来', '发展', '趋势', '前景', '潜力', '机会', '创新', '变革', '革命', '突破'],
            '技术关注': ['技术', '算法', '模型', '训练', '参数', '架构', '优化', '调参', '算力'],
        }

        views_analysis = {
            dimension: sum(word_freq.get(word, 0) for word in keywords)
            for dimension, keywords in dimension_keywords.items()
        }

        # Example sheet name -> dimension whose keywords select the examples.
        example_sources = {
            '成本相关弹幕': '应用成本',
            '安全问题弹幕': '安全隐私',
            '就业影响弹幕': '就业影响',
            '技术讨论弹幕': '技术关注',
        }
        specific_views = {
            label: [
                danmu for danmu in danmu_list
                if any(word in danmu for word in dimension_keywords[dimension])
            ]
            for label, dimension in example_sources.items()
        }

        return views_analysis, specific_views

    def generate_wordcloud(self, words_list, filename='llm_wordcloud.png'):
        """Render, save and show a word cloud.

        Several font files are tried in turn; the first one that renders is
        used, otherwise the wordcloud package's default font is used.

        Args:
            words_list: iterable of words; joined with spaces as input text.
            filename: path of the PNG file to write.

        Returns:
            The generated ``WordCloud`` object, or ``None`` when there is no
            text to draw or rendering fails (the reason is printed).
        """
        text = ' '.join(words_list)
        if not text.strip():
            # With empty text every render attempt would fail anyway.
            print("生成词云失败: 没有可用的词语")
            return None

        font_paths = [
            'simhei.ttf',
            'msyh.ttc',
            'simsun.ttc',
            'Arial Unicode.ttf',  # macOS
        ]
        options = dict(
            width=1200,
            height=800,
            background_color='white',
            max_words=200,
            colormap='viridis',
            relative_scaling=0.5,
            collocations=False,  # avoid repeated words
        )

        try:
            wordcloud = None
            for fp in font_paths:
                try:
                    wordcloud = WordCloud(font_path=fp, **options).generate(text)
                except Exception:
                    # Font missing or unusable here: try the next candidate.
                    continue
                print(f"使用字体: {fp}")
                break

            if wordcloud is None:
                # No candidate font worked; fall back to the default font.
                wordcloud = WordCloud(**options).generate(text)

            plt.figure(figsize=(15, 10))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('大语言模型应用弹幕词云分析', fontsize=20, pad=20)
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.show()
            return wordcloud
        except Exception as e:
            print(f"生成词云失败: {e}")
            return None

    def save_to_excel(self, word_freq, top_applications, application_scores, video_info, views_analysis, specific_views, tech_data, filename='llm_analysis.xlsx'):
        """Export all analysis results to an Excel workbook.

        If writing the workbook fails for any reason, the word frequencies
        and top applications are written to two CSV files instead.

        Args:
            word_freq: ``Counter`` of words; the 50 most common are exported.
            top_applications: list of ``(area, score)`` pairs.
            application_scores: ``Counter`` of area -> score.
            video_info: dict of video metadata (one-row sheet).
            views_analysis: dict of opinion dimension -> mention count.
            specific_views: dict of category -> list of example danmaku; at
                most 10 examples per category are exported.
            tech_data: dict with the keys ``articles``, ``trend_scores`` and
                ``predictions``.
            filename: path of the workbook to write.

        Returns:
            bool: ``True`` if either the workbook or the CSV fallback was
            written, ``False`` if both failed.
        """
        try:
            video_df = pd.DataFrame([video_info])
            word_df = pd.DataFrame(word_freq.most_common(50), columns=['词语', '频次'])
            app_df = pd.DataFrame(application_scores.most_common(), columns=['应用领域', '出现次数'])
            top8_df = pd.DataFrame(top_applications, columns=['应用领域', '出现次数'])
            danmu_df = pd.DataFrame(self.danmu_data, columns=['弹幕内容'])

            # Percentage column; skipped when it would divide by zero.
            if len(app_df) > 0 and app_df['出现次数'].sum() > 0:
                app_df['百分比'] = (app_df['出现次数'] / app_df['出现次数'].sum() * 100).round(2)

            tech_articles_df = pd.DataFrame(tech_data['articles'])
            trend_scores_df = pd.DataFrame(list(tech_data['trend_scores'].items()), columns=['趋势领域', '关注度'])
            predictions_df = pd.DataFrame(tech_data['predictions'])

            views_df = pd.DataFrame(list(views_analysis.items()), columns=['观点维度', '提及次数'])
            if len(views_df) > 0 and views_df['提及次数'].sum() > 0:
                views_df['百分比'] = (views_df['提及次数'] / views_df['提及次数'].sum() * 100).round(2)

            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                video_df.to_excel(writer, sheet_name='视频信息', index=False)
                word_df.to_excel(writer, sheet_name='词频统计', index=False)
                app_df.to_excel(writer, sheet_name='应用领域统计', index=False)
                top8_df.to_excel(writer, sheet_name='TOP8应用案例', index=False)
                views_df.to_excel(writer, sheet_name='用户观点分析', index=False)

                tech_articles_df.to_excel(writer, sheet_name='科技媒体报道', index=False)
                trend_scores_df.to_excel(writer, sheet_name='趋势领域分析', index=False)
                predictions_df.to_excel(writer, sheet_name='发展趋势预测', index=False)

                # One sheet of examples per category that has any; the sheet
                # name uses only the first five characters of the category.
                for view_type, examples in specific_views.items():
                    if examples:
                        example_df = pd.DataFrame(examples[:10], columns=[f'{view_type}示例'])
                        example_df.to_excel(writer, sheet_name=f'{view_type[:5]}示例', index=False)

                danmu_df.to_excel(writer, sheet_name='原始弹幕数据', index=False)

            print(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            print(f"保存Excel文件失败: {e}")

        # Deliberate best effort: keep at least the core tables as CSV.
        try:
            word_df = pd.DataFrame(word_freq.most_common(50), columns=['词语', '频次'])
            word_df.to_csv('llm_word_freq.csv', index=False, encoding='utf-8-sig')
            pd.DataFrame(top_applications, columns=['应用领域', '出现次数']).to_csv('llm_applications.csv', index=False, encoding='utf-8-sig')
            print("数据已保存到CSV文件")
            return True
        except Exception as e2:
            print(f"保存CSV文件也失败: {e2}")
            return False

    def plot_top_applications(self, top_applications):
        """Show a bar chart of the top application areas.

        Args:
            top_applications: list of ``(area, score)`` pairs; nothing is
                drawn when it is empty.
        """
        if not top_applications:
            print("没有找到应用领域数据")
            return

        apps, counts = zip(*top_applications)

        plt.figure(figsize=(12, 8))
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD', '#98D8C8', '#F7DC6F']
        bars = plt.bar(apps, counts, color=colors[:len(apps)])

        # Print each count just above its bar.
        for bar, count in zip(bars, counts):
            plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
                     f'{count}', ha='center', va='bottom', fontsize=12, fontweight='bold')

        plt.title('大语言模型应用领域TOP8分布', fontsize=16, pad=20)
        plt.xlabel('应用领域', fontsize=14)
        plt.ylabel('出现频次', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

    def plot_user_views(self, views_analysis):
        """Show a bar chart of the opinion dimensions that were mentioned.

        Args:
            views_analysis: dict of opinion dimension -> mention count;
                dimensions with a count of zero are left out, and nothing is
                drawn when no dimension remains.
        """
        mentioned = {name: count for name, count in (views_analysis or {}).items() if count > 0}
        if not mentioned:
            print("没有用户观点数据可展示")
            return

        categories, counts = zip(*mentioned.items())

        plt.figure(figsize=(12, 8))
        colors = ['#FF9999', '#66B2FF', '#99FF99', '#FFD700', '#FFB6C1', '#87CEEB', '#98FB98']
        bars = plt.bar(categories, counts, color=colors[:len(categories)])

        # Print each count just above its bar.
        for bar, count in zip(bars, counts):
            plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
                     f'{count}', ha='center', va='bottom', fontsize=12, fontweight='bold')

        plt.title('用户对大语言模型的观点分布', fontsize=16, pad=20)
        plt.xlabel('观点维度', fontsize=14)
        plt.ylabel('提及次数', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

    def analyze_conclusions(self, word_freq, top_applications, views_analysis, specific_views, video_title, tech_data):
        """Print a textual summary of the whole analysis.

        Args:
            word_freq: accepted for interface compatibility; not read.
            top_applications: list of ``(area, score)`` pairs.
            views_analysis: dict of opinion dimension -> mention count.
            specific_views: accepted for interface compatibility; not read.
            video_title: title of the analyzed video.
            tech_data: dict with the keys ``trend_scores`` and ``predictions``.
        """
        print("\n" + "=" * 60)
        print("大语言模型应用分析结论")
        print("=" * 60)

        print(f"\n分析视频: {video_title}")

        # 1. Application areas with their share of all top-area mentions.
        if top_applications:
            total_mentions = sum(count for _, count in top_applications)
            print("\n1. 主要应用领域分布:")
            for app, count in top_applications:
                percentage = (count / total_mentions) * 100 if total_mentions > 0 else 0
                print(f"   - {app}: {count}次 ({percentage:.1f}%)")

        # 2. Opinion dimensions; details are skipped when nothing was mentioned.
        print("\n2. 用户观点综合分析:")
        total_views = sum(views_analysis.values())
        if total_views > 0:
            for category, count in views_analysis.items():
                percentage = (count / total_views) * 100
                print(f"   - {category}: {count}次 ({percentage:.1f}%)")

        # 3. Tech-media trend scores.
        trend_scores = tech_data['trend_scores']
        print("\n3. 科技媒体趋势分析:")
        for category, score in trend_scores.items():
            print(f"   - {category}: {score}分")

        # 4. Predictions.
        print("\n4. 发展趋势预测:")
        for pred in tech_data['predictions']:
            print(f"   - {pred['trend']}: {pred['prediction']} ({pred['timeframe']}, {pred['confidence']}置信度)")

        # 5. Recommendations derived from the figures above.
        print("\n5. 综合建议:")
        if views_analysis.get('应用成本', 0) > 0:
            print("   - 关注成本优化方案,推动技术普惠")
        if views_analysis.get('安全隐私', 0) > 0:
            print("   - 加强安全防护和隐私保护措施")
        if trend_scores.get('技术演进', 0) > trend_scores.get('应用场景', 0):
            print("   - 技术仍在快速演进期,建议保持技术敏感性")
        else:
            print("   - 应用场景拓展成为重点,建议关注垂直领域机会")
|
|
|
|
|
|
|
|
|
|
def extract_bvid_from_url(url):
    """Extract the BV id from a Bilibili video URL.

    Args:
        url: the URL (or any text) to search.

    Returns:
        str | None: the first substring made of ``BV`` followed by ten ASCII
        letters or digits, or ``None`` when there is none or ``url`` is not
        a string.
    """
    if not isinstance(url, str):
        return None
    match = re.search(r'BV[0-9A-Za-z]{10}', url)
    return match.group() if match else None
|
|
|
|
|
|
|
|
|
|
def main(url=None):
    """Run the full danmaku and tech-media analysis for one Bilibili video.

    Args:
        url: URL of the video to analyze. When omitted, the built-in sample
            video is used.

    The function returns early (after printing the reason) when the BV id
    cannot be extracted, the video metadata or danmaku cannot be fetched, or
    no danmaku survive noise filtering.
    """
    if url is None:
        url = "https://www.bilibili.com/video/BV1fs4y1d7ex/?spm_id_from=333.337.search-card.all.click&vd_source=15df046f7c6c0dbb574611c9d3e4d5ef/"

    bvid = extract_bvid_from_url(url)
    if not bvid:
        print("无法从URL中提取BV号")
        return

    print(f"提取的BV号: {bvid}")

    analyzer = BilibiliVideoAnalyzer()
    tech_analyzer = TechMediaAnalyzer()

    # 1. Video metadata, including the CID.
    print("\n获取视频信息...")
    video_info = analyzer.get_video_info(bvid)
    if not video_info:
        print("无法获取视频信息,程序结束")
        return

    # 2. Danmaku.
    print("\n获取弹幕数据...")
    danmu_data = analyzer.get_danmu_data(video_info['cid'])
    if not danmu_data:
        print("无法获取弹幕数据,程序结束")
        return

    # Kept on the analyzer because save_to_excel exports the raw danmaku.
    analyzer.danmu_data = danmu_data

    # 3. Noise filtering.
    print("\n过滤噪声弹幕...")
    filtered_danmu = analyzer.filter_noise(danmu_data)
    if not filtered_danmu:
        print("过滤后无有效弹幕,程序结束")
        return

    # 4. Word segmentation and frequencies.
    print("\n进行分词和词频统计...")
    word_freq, all_words = analyzer.segment_and_count_words(filtered_danmu)

    print("\n前20个高频词:")
    for word, count in word_freq.most_common(20):
        print(f"  {word}: {count}")

    # 5. LLM application areas.
    print("\n提取LLM应用案例...")
    top_applications, application_scores = analyzer.extract_llm_applications(word_freq, 8)

    # 6. Viewer opinions.
    print("\n分析用户观点...")
    views_analysis, specific_views = analyzer.analyze_user_views(word_freq, filtered_danmu)

    # 7. Tech-media trends and their charts.
    print("\n爬取和分析科技媒体观点...")
    tech_articles = tech_analyzer.crawl_tech_news()
    trend_scores, predictions, keyword_freq = tech_analyzer.analyze_trends(tech_articles)
    tech_analyzer.plot_trend_analysis(trend_scores, predictions)

    tech_data = {
        'articles': tech_articles,
        'trend_scores': trend_scores,
        'predictions': predictions,
        'keyword_freq': keyword_freq,
    }

    # 8. Top application areas.
    if top_applications:
        print("\nTOP 8 大语言模型应用领域:")
        for i, (app, count) in enumerate(top_applications, 1):
            print(f"{i}. {app}: {count}次")
        analyzer.plot_top_applications(top_applications)
    else:
        print("未识别到明显的LLM应用领域")

    # 9. Opinion dimensions.
    print("\n用户观点分析:")
    for category, count in views_analysis.items():
        print(f"  {category}: {count}次")
    analyzer.plot_user_views(views_analysis)

    # 10. Word cloud.
    print("\n生成词云图...")
    analyzer.generate_wordcloud(all_words)

    # 11. Export.
    print("\n保存数据到Excel...")
    success = analyzer.save_to_excel(word_freq, top_applications, application_scores, video_info, views_analysis, specific_views, tech_data)
    if success:
        print("数据分析完成!")
    else:
        print("数据分析完成,但数据保存失败")

    # 12. Printed conclusions.
    analyzer.analyze_conclusions(word_freq, top_applications, views_analysis, specific_views, video_info['title'], tech_data)
|
|
|
|
|
|
|
|
|
|
# Run the analysis only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|