ADD file via upload

11 months ago · eb0a8193c5
parent b0e370dd8c
commit eb0a8193c5
1 changed files with 157 additions and 0 deletions
--- a/visualization.py
+++ b/visualization.py
@ -0,0 +1,157 @@
+"""
+visualization.py - 生成弹幕词云图的模块
+"""
+
+import re
+from collections import Counter
+import pandas as pd
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+import jieba
+import numpy as np
+from PIL import Image
+
+def read_danmakus(filename):
+    """
+    Read danmaku (bullet-comment) text from an Excel file.
+
+    Only the '弹幕内容' column is used. Empty cells are dropped and every
+    remaining value is converted to ``str``, so callers always receive a
+    list of strings even when a cell was parsed as a number.
+
+    :param filename: path of the Excel file to read
+    :return: list of danmaku strings; an empty list when the file is
+             missing, empty, or has no '弹幕内容' column
+    """
+    # Keep the try body limited to the call that can actually raise.
+    try:
+        data_frame = pd.read_excel(filename)
+    except FileNotFoundError:
+        print(f"文件 {filename} 未找到。")
+        return []
+    except pd.errors.EmptyDataError:
+        print(f"文件 {filename} 是空的。")
+        return []
+
+    # A sheet without the expected column used to raise an uncaught KeyError.
+    if '弹幕内容' not in data_frame.columns:
+        print(f"文件 {filename} 中缺少 '弹幕内容' 列。")
+        return []
+
+    # dropna() removes empty cells (NaN); astype(str) normalises cells that
+    # were parsed as numbers so later substring/regex operations work.
+    return data_frame['弹幕内容'].dropna().astype(str).tolist()
+
+def filter_relevant_danmakus(danmakus, keywords):
+    """
+    Keep only the danmakus that contain at least one of the keywords.
+
+    Matching is a plain, case-sensitive substring test. Entries that are
+    not strings (for example NaN from an empty spreadsheet cell, ``None``,
+    or numbers) are skipped instead of raising ``TypeError``.
+
+    :param danmakus: iterable of danmaku texts
+    :param keywords: iterable of keyword strings
+    :return: list of matching danmakus, in their original order
+    """
+    # Materialise once so a generator of keywords can be reused per danmaku.
+    keywords = list(keywords)
+    return [
+        danmaku
+        for danmaku in danmakus
+        if isinstance(danmaku, str)
+        and any(keyword in danmaku for keyword in keywords)
+    ]
+
+def preprocess_danmakus(danmakus):
+    """
+    Clean each danmaku by stripping symbols and normalising whitespace.
+
+    Characters that are neither word characters nor whitespace are removed,
+    runs of whitespace collapse to one space, and leading/trailing
+    whitespace is trimmed. A danmaku made only of symbols becomes ''.
+
+    :param danmakus: list of danmaku strings
+    :return: list of cleaned strings, same length and order as the input
+    """
+    # Compile both patterns once instead of on every iteration.
+    drop_symbols = re.compile(r'[^\w\s]').sub
+    squeeze_spaces = re.compile(r'\s+').sub
+    return [
+        squeeze_spaces(' ', drop_symbols('', text)).strip()
+        for text in danmakus
+    ]
+
+def extract_words(danmakus):
+    """
+    Tokenize every danmaku with Jieba and flatten the result.
+
+    :param danmakus: list of danmaku strings
+    :return: one flat list holding the tokens of all danmakus, in order
+    """
+    # jieba.cut yields the tokens of one text; flatten across all texts.
+    return [token for text in danmakus for token in jieba.cut(text)]
+
+def remove_stopwords(words, stopwords):
+    """
+    Drop stopwords and tokens that are at most one character long.
+
+    :param words: list of tokens
+    :param stopwords: collection of stopwords (membership-tested)
+    :return: list of the remaining tokens, in their original order
+    """
+    kept = []
+    for token in words:
+        # Same evaluation order as before: stopword test, then length test.
+        if token in stopwords or len(token) <= 1:
+            continue
+        kept.append(token)
+    return kept
+
+def calculate_word_frequency(words):
+    """
+    Count how often each token occurs.
+
+    :param words: iterable of tokens
+    :return: ``collections.Counter`` mapping each token to its count
+    """
+    frequencies = Counter()
+    frequencies.update(words)  # tally every token in one pass
+    return frequencies
+
+def generate_wordcloud(word_freq, mask_path="china_map.png",
+                       font_path='C:/Windows/Fonts/msyh.ttc',
+                       output_path="wordcloud.png"):
+    """
+    Render a word cloud from word frequencies, save it as PNG and show it.
+
+    Nothing is drawn when ``word_freq`` is empty, when the mask image is
+    missing, or when the font file cannot be loaded; a message is printed
+    in each case.
+
+    :param word_freq: mapping of word -> frequency
+    :param mask_path: image whose shape the word cloud is drawn into
+    :param font_path: font file used to draw the words
+    :param output_path: where the rendered PNG is written
+    """
+    if not word_freq:
+        print("没有词语生成词云图。")
+        return
+
+    # Load the mask on its own so a missing font is not reported as a
+    # missing map file; the context manager closes the image file handle.
+    try:
+        with Image.open(mask_path) as mask_image:
+            mask = np.array(mask_image)
+    except FileNotFoundError:
+        print(f"地图形状文件 '{mask_path}' 未找到。")
+        return
+
+    try:
+        # NOTE(review): the wordcloud docs state width/height are ignored
+        # when a mask is supplied; kept to match the existing call.
+        wordcloud = WordCloud(
+            font_path=font_path,
+            mask=mask,
+            width=800,
+            height=400,
+            background_color='white',
+        ).generate_from_frequencies(word_freq)
+    except OSError as font_error:
+        # Pillow raises OSError when the font file cannot be opened/read.
+        print(f"字体文件 '{font_path}' 无法加载: {font_error}")
+        return
+    except ValueError as value_error:
+        print(f"数据转换错误: {value_error}. 请检查数据格式。")
+        return
+
+    try:
+        plt.figure(figsize=(10, 5))
+        plt.imshow(wordcloud, interpolation='bilinear')
+        plt.axis('off')  # a word cloud has no meaningful axes
+        plt.title('2024巴黎奥运会应用AI技术的词云图', fontproperties='SimHei')
+        # Save before show(): some backends clear the figure once shown.
+        plt.savefig(output_path, bbox_inches='tight', dpi=300)
+        plt.show()
+    except ValueError as value_error:
+        print(f"数据转换错误: {value_error}. 请检查数据格式。")
+
+def main():
+    """
+    Run the full pipeline: read the danmakus, keep the keyword-related
+    ones, clean and tokenize them, drop stopwords, count the words and
+    render the word cloud.
+    """
+    source_file = "all_danmakus.xlsx"
+    all_danmakus = read_danmakus(source_file)
+    if not all_danmakus:
+        print("没有读取到任何弹幕内容。")
+        return
+
+    # Keywords used to select danmakus about AI technology at the
+    # 2024 Paris Olympics.
+    topic_keywords = ["AI", "智能", "科技", "应用", "数据", "创新", "算法", "数字", "视觉"]
+    # Common function words that should not appear in the word cloud.
+    noise_words = {'的', '是', '在', '有', '和', '这', '了', '与'}
+
+    on_topic = filter_relevant_danmakus(all_danmakus, topic_keywords)
+    cleaned = preprocess_danmakus(on_topic)
+    tokens = extract_words(cleaned)
+    meaningful_tokens = remove_stopwords(tokens, noise_words)
+    generate_wordcloud(calculate_word_frequency(meaningful_tokens))
+
+# Run the pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()  # entry point