|
|
import matplotlib.pyplot as plt
|
|
|
import matplotlib.font_manager as fm
|
|
|
from wordcloud import WordCloud
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
from collections import Counter
|
|
|
import os
|
|
|
|
|
|
class Visualizer:
    """Render the word-cloud, bar-chart and pie-chart figures of the analysis.

    Creating an instance sets matplotlib's global ``rcParams`` so CJK text
    can be rendered, and stores in ``font_path`` the first known CJK font
    file found on disk (or ``None``) for use by the word cloud.
    """

    # Candidate CJK font files, probed in order.
    _FONT_CANDIDATES = (
        'C:/Windows/Fonts/simhei.ttf',  # Windows
        '/System/Library/Fonts/PingFang.ttc',  # macOS
        '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf',  # Linux
    )

    # Keyword lists for the simple substring-based sentiment heuristic.
    _POSITIVE_WORDS = ('好', '强', '棒', '厉害', '方便', '高效', '智能', '强大', '优秀', '推荐')
    _NEGATIVE_WORDS = ('差', '弱', '问题', '担心', '风险', '贵', '难', '复杂', '取代', '改进')

    def __init__(self):
        # Configure fonts able to display Chinese labels.
        plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
        # Keep the minus sign renderable with those fonts.
        plt.rcParams['axes.unicode_minus'] = False
        self.font_path = self.find_chinese_font()

    def find_chinese_font(self):
        """Return the first existing candidate font path, or ``None``.

        ``None`` means no candidate file exists, in which case the word
        cloud is built without an explicit ``font_path``.
        """
        # os.path.exists reports False instead of raising on unreachable
        # paths, so no exception handling is needed here.
        for font_path in self._FONT_CANDIDATES:
            if os.path.exists(font_path):
                return font_path
        return None

    @staticmethod
    def _ensure_parent_dir(save_path):
        """Create the directory that will contain ``save_path`` if needed."""
        parent = os.path.dirname(save_path)
        if parent:  # a bare file name has no directory to create
            os.makedirs(parent, exist_ok=True)

    @classmethod
    def _count_sentiments(cls, danmus):
        """Tally danmu texts into positive / negative / neutral buckets.

        Args:
            danmus: A sequence or ``pandas.Series`` of danmu values. Missing
                values (``None`` / NaN) are ignored; any other non-string
                value is converted with ``str``.

        Returns:
            A dict with the keys ``'积极'``, ``'消极'`` and ``'中性'`` mapped
            to the number of texts in each bucket. A text is positive when
            it contains more positive keywords than negative ones, negative
            in the opposite case, and neutral on a tie.
        """
        # Empty spreadsheet cells arrive as NaN (a float); dropping them and
        # casting the rest to str keeps the ``in`` tests below from raising.
        texts = pd.Series(danmus, dtype=object).dropna().astype(str)

        counts = {'积极': 0, '消极': 0, '中性': 0}
        for text in texts:
            positive_hits = sum(word in text for word in cls._POSITIVE_WORDS)
            negative_hits = sum(word in text for word in cls._NEGATIVE_WORDS)
            if positive_hits > negative_hits:
                counts['积极'] += 1
            elif negative_hits > positive_hits:
                counts['消极'] += 1
            else:
                counts['中性'] += 1
        return counts

    def create_wordcloud(self, word_freq_df: pd.DataFrame, save_path: str):
        """Build a word cloud from word frequencies and save it as an image.

        Args:
            word_freq_df: Frame with a ``'词语'`` (word) column and a
                ``'频次'`` (frequency) column.
            save_path: Destination image path; its directory is created
                when missing.
        """
        word_freq = dict(zip(word_freq_df['词语'], word_freq_df['频次']))

        wc_config = {
            'width': 1200,
            'height': 800,
            'background_color': 'white',
            'colormap': 'viridis',
            'max_words': 100,
            'relative_scaling': 0.5,
        }
        # Only pass a font when one was found; otherwise WordCloud falls
        # back to its own default.
        if self.font_path:
            wc_config['font_path'] = self.font_path

        wordcloud = WordCloud(**wc_config).generate_from_frequencies(word_freq)

        # Create the directory the caller actually asked for.
        self._ensure_parent_dir(save_path)

        fig = plt.figure(figsize=(15, 10))
        try:
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('大语言模型应用弹幕词云分析', fontsize=20, pad=20)
            plt.tight_layout()
            plt.savefig(save_path, dpi=300, bbox_inches='tight',
                        facecolor='white', edgecolor='none')
            plt.show()
        finally:
            # Release the figure so repeated calls do not accumulate them.
            plt.close(fig)

        print(f"词云图已保存到: {save_path}")

    def plot_applications_bar(self, top_apps_df: pd.DataFrame, save_path: str):
        """Draw a horizontal bar chart of application areas and save it.

        Args:
            top_apps_df: Frame with an ``'应用领域'`` (application area)
                column and an ``'出现次数'`` (occurrence count) column.
            save_path: Destination image path; its directory is created
                when missing.
        """
        self._ensure_parent_dir(save_path)

        fig = plt.figure(figsize=(12, 8))
        try:
            # One colour per bar, spread evenly over the Set3 colormap.
            colors = plt.cm.Set3(np.linspace(0, 1, len(top_apps_df)))
            bars = plt.barh(top_apps_df['应用领域'], top_apps_df['出现次数'],
                            color=colors, edgecolor='black', alpha=0.8)

            # Write each bar's value just past its right end.
            for bar in bars:
                width = bar.get_width()
                plt.text(width + 0.1, bar.get_y() + bar.get_height() / 2,
                         f'{int(width)}', ha='left', va='center', fontsize=12)

            plt.xlabel('出现次数', fontsize=14)
            plt.title('大语言模型应用领域分布(Top 8)', fontsize=16, pad=20)
            plt.grid(axis='x', alpha=0.3)
            plt.tight_layout()

            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            plt.show()
        finally:
            plt.close(fig)

        print(f"应用领域分布图已保存到: {save_path}")

    def plot_sentiment_analysis(self, processed_df: pd.DataFrame, save_path: str):
        """Classify danmu by keyword sentiment and save a pie chart.

        Args:
            processed_df: Frame with an ``'original_danmu'`` column holding
                the raw danmu text.
            save_path: Destination image path; its directory is created
                when missing.
        """
        sentiment_counts = self._count_sentiments(processed_df['original_danmu'])

        # With no usable danmu every share would be 0/0, so there is
        # nothing meaningful to draw.
        if not any(sentiment_counts.values()):
            print("没有可用于情感分析的弹幕数据, 已跳过情感分析图")
            return

        self._ensure_parent_dir(save_path)

        fig = plt.figure(figsize=(10, 8))
        try:
            colors = ['#ff9999', '#66b3ff', '#99ff99']
            # Materialise the dict views as lists before handing them over.
            plt.pie(list(sentiment_counts.values()),
                    labels=list(sentiment_counts.keys()),
                    autopct='%1.1f%%', colors=colors, startangle=90,
                    explode=(0.1, 0, 0))  # pull out the positive slice
            plt.title('弹幕情感倾向分布', fontsize=16)
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            plt.show()
        finally:
            plt.close(fig)

        print(f"情感分析图已保存到: {save_path}")

    def create_comprehensive_visualization(self, processed_df: pd.DataFrame,
                                           top_apps_df: pd.DataFrame,
                                           word_freq_df: pd.DataFrame):
        """Generate all three charts under the ``visualization`` directory.

        Args:
            processed_df: Danmu frame passed to ``plot_sentiment_analysis``.
            top_apps_df: Ranking frame passed to ``plot_applications_bar``.
            word_freq_df: Frequency frame passed to ``create_wordcloud``.
        """
        # Make sure the output directory exists.
        os.makedirs('visualization', exist_ok=True)

        # 1. Word cloud
        self.create_wordcloud(word_freq_df, 'visualization/wordcloud.png')

        # 2. Application-area distribution
        self.plot_applications_bar(top_apps_df, 'visualization/applications_distribution.png')

        # 3. Sentiment distribution
        self.plot_sentiment_analysis(processed_df, 'visualization/sentiment_analysis.png')
|
|
|
|
|
|
def main():
    """Load the processed workbook and generate every chart.

    Reads the three sheets of ``data/processed/llm_analysis.xlsx`` in one
    pass and hands them to ``Visualizer``. Failures are reported on stdout
    rather than raised; the hint to run ``data_processor.py`` is printed
    only when loading the data fails.
    """
    data_file = 'data/processed/llm_analysis.xlsx'
    danmu_sheet, apps_sheet, freq_sheet = '弹幕数据', '应用领域排名', '词频统计'

    # Load the processed data. A list of sheet names makes pandas parse the
    # workbook once and return a dict keyed by sheet name.
    try:
        sheets = pd.read_excel(
            data_file, sheet_name=[danmu_sheet, apps_sheet, freq_sheet])
    except Exception as e:  # script boundary: report instead of crashing
        print(f"可视化过程中出现错误: {e}")
        print("请先运行 data_processor.py 生成数据")
        return

    visualizer = Visualizer()

    # Build the charts. Errors here are not caused by missing input data,
    # so they are reported without the data_processor hint.
    try:
        visualizer.create_comprehensive_visualization(
            sheets[danmu_sheet], sheets[apps_sheet], sheets[freq_sheet])
    except Exception as e:  # script boundary: report instead of crashing
        print(f"可视化过程中出现错误: {e}")
        return

    print("所有可视化图表生成完成!")
|
|
|
|
|
|
# Run the visualization pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()