"""
visualization.py - Module for generating a word-cloud image from danmaku
(bullet comments).
"""

import re
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from wordcloud import WordCloud

def read_danmakus(filename):
    """
    Read the danmaku text from an Excel file and return it as a list.

    :param filename: Excel file name
    :return: list of danmaku strings
    """
    try:
        data_frame = pd.read_excel(filename)  # Read the Excel file
        # '弹幕内容' is the danmaku-content column; drop empty cells and
        # coerce values to str so later string operations don't fail on NaN
        return data_frame['弹幕内容'].dropna().astype(str).tolist()
    except FileNotFoundError:
        print(f"File {filename} not found.")
        return []
    except pd.errors.EmptyDataError:
        print(f"File {filename} is empty.")
        return []
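
# Note: for .xlsx input, pandas.read_excel delegates to an engine such as
# openpyxl; if it raises ImportError, installing openpyxl
# (pip install openpyxl) is the usual fix.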


def filter_relevant_danmakus(danmakus, keywords):
    """
    Select the danmakus related to the given keywords.

    :param danmakus: list of danmaku strings
    :param keywords: list of keywords
    :return: list of relevant danmaku strings
    """
    relevant_danmakus = []  # Collected relevant danmakus
    for danmaku in danmakus:
        # Keep the danmaku if it contains at least one of the keywords
        if any(keyword in danmaku for keyword in keywords):
            relevant_danmakus.append(danmaku)
    return relevant_danmakus
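
# Example with hypothetical data:
#   filter_relevant_danmakus(["AI判罚真准", "前排围观"], ["AI"])
#   -> ["AI判罚真准"]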


def preprocess_danmakus(danmakus):
    """
    Preprocess the danmaku text: strip punctuation and extra whitespace.

    :param danmakus: list of danmaku strings
    :return: list of cleaned danmaku strings
    """
    processed_danmakus = []  # Cleaned danmakus
    for danmaku in danmakus:
        # Strip punctuation: \w keeps letters, digits, underscores and CJK
        # characters, \s keeps whitespace
        danmaku = re.sub(r'[^\w\s]', '', danmaku)
        # Collapse runs of whitespace into single spaces and trim the ends
        danmaku = re.sub(r'\s+', ' ', danmaku).strip()
        processed_danmakus.append(danmaku)  # Keep the cleaned danmaku
    return processed_danmakus
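
# Example with hypothetical data: "AI，太强了！！！" becomes "AI太强了"
# (punctuation stripped, whitespace normalized).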


def extract_words(danmakus):
    """
    Tokenize the danmaku text with the Jieba segmentation library.

    :param danmakus: list of danmaku strings
    :return: list of tokens
    """
    words = []  # Collected tokens
    for danmaku in danmakus:
        seg_list = jieba.cut(danmaku)  # Segment with Jieba
        words.extend(seg_list)  # Append the tokens
    return words
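
# Note: jieba.lcut(danmaku) is an equivalent shortcut that returns a list
# directly; jieba.cut returns a lazy generator.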


def remove_stopwords(words, stopwords):
    """
    Drop stopwords and single-character tokens.

    :param words: list of tokens
    :param stopwords: set of stopwords
    :return: filtered list of tokens
    """
    # Filter out stopwords and single-character tokens
    return [word for word in words if word not in stopwords and len(word) > 1]
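
# Trade-off: len(word) > 1 also drops meaningful single-character words
# (e.g. "好"); loosen this check if single characters matter for the cloud.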


def calculate_word_frequency(words):
    """
    Count how often each token occurs.

    :param words: list of tokens
    :return: Counter mapping token to frequency
    """
    word_freq = Counter(words)  # Tally the tokens with Counter
    return word_freq
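
# Tip: word_freq.most_common(10) lists the ten most frequent tokens, which
# is handy for sanity-checking the result before plotting.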


def generate_wordcloud(word_freq):
    """
    Generate the word cloud and save it as a PNG file.

    :param word_freq: Counter of token frequencies
    """
    if not word_freq:  # Nothing to draw
        print("No words available to generate a word cloud.")
        return

    try:
        # Load the China-map image used as the cloud's shape mask
        mask = np.array(Image.open("china_map.png"))
        font_path = 'C:/Windows/Fonts/msyh.ttc'  # Microsoft YaHei, needed for CJK glyphs
        wordcloud = WordCloud(
            font_path=font_path,
            mask=mask,
            width=800,
            height=400,
            background_color='white',
        ).generate_from_frequencies(word_freq)  # Build the cloud from the frequency map

        plt.figure(figsize=(10, 5))  # Figure size
        plt.imshow(wordcloud, interpolation='bilinear')  # Render the cloud
        plt.axis('off')  # Hide the axes
        plt.title('2024巴黎奥运会应用AI技术的词云图', fontproperties='SimHei')  # Chinese figure title
        # Save the word cloud as a PNG file
        plt.savefig("wordcloud.png", bbox_inches='tight', dpi=300)
        plt.show()  # Display the figure
    except FileNotFoundError:
        print("Mask image 'china_map.png' not found.")
    except ValueError as value_error:
        print(f"Data conversion error: {value_error}. Please check the data format.")


def main():
    """
    Entry point: read, filter, and preprocess the danmakus, tokenize them,
    and generate the word cloud.
    """
    # Read all danmakus
    input_filename = "all_danmakus.xlsx"  # Input file name
    danmakus = read_danmakus(input_filename)

    if not danmakus:
        print("No danmaku content was read.")
        return
    # Select danmakus related to "AI technology at the Paris 2024 Olympics";
    # the keywords mean: AI, intelligence, technology, application, data,
    # innovation, algorithm, digital, vision
    keywords = ["AI", "智能", "科技", "应用", "数据", "创新", "算法", "数字", "视觉"]
    relevant_danmakus = filter_relevant_danmakus(danmakus, keywords)
    # Preprocess the danmakus
    processed_danmakus = preprocess_danmakus(relevant_danmakus)
    # Tokenize
    words = extract_words(processed_danmakus)
    # A small built-in set of common Chinese function words; a fuller
    # stopword list could be loaded from a file
    stopwords = set(['的', '是', '在', '有', '和', '这', '了', '与'])
    # Drop the stopwords
    filtered_words = remove_stopwords(words, stopwords)
    # Count the word frequencies
    word_freq = calculate_word_frequency(filtered_words)
    # Generate and save the word cloud
    generate_wordcloud(word_freq)


if __name__ == "__main__":
    main()  # Run the entry point