ADD file via upload

pks3rafu8 6 months ago
parent 5f8dac9503
commit b036130211

@@ -0,0 +1,132 @@
import pandas as pd
import jieba  # jieba Chinese word segmentation
import wordcloud  # word cloud module
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import imageio.v2 as imageio  # reads the local mask image that shapes the word cloud (v2 API avoids the imread deprecation warning)
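
# Assumed setup (the commit does not pin dependencies): the third-party packages
# above can be installed with pip; pandas.read_excel additionally needs the
# openpyxl engine to read .xlsx files:
#   pip install pandas jieba wordcloud matplotlib imageio openpyxl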
def break_down_words(file_path, stopwords=frozenset(), output_txt='./Chiikawa/ch_stop.txt', save_output=False):
    """
    Generate the set of valid words from an Excel file and optionally save it to a TXT file.
    Returns the set of valid words.
    """
    # 1. Read the danmaku (bullet-comment) data from the Excel file
    df = pd.read_excel(file_path)
    # 2. Join all sentences into one string
    text = ' '.join(df['包含关键词的弹幕'].astype(str))
    # 3. Segment the text into individual words
    text_list = jieba.lcut(text)
    # 4. Drop stopwords and keep only words of at least two characters
    filtered_words = set(word for word in text_list if word not in stopwords and len(word) >= 2)
    # 5. Optionally save the valid words to a TXT file
    if save_output:
        with open(output_txt, 'w', encoding='utf-8') as f:
            for word in filtered_words:
                f.write(f"{word}\n")
        print(f"Valid words saved to {output_txt}")
    return filtered_words  # return the set of valid words for later use
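
# A minimal usage sketch (hypothetical data; assumes the Excel file has the
# '包含关键词的弹幕' column read above):
#   words = break_down_words('./Chiikawa/sentences.xlsx', stopwords={'可以'},
#                            save_output=False)
#   print(sorted(words)[:20])  # inspect a sample of the valid words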
# Load stopwords from a TXT file
def load_stopwords_from_txt(file_path):
    """
    Load stopwords from a TXT file, one word per line.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # strip newlines and surrounding whitespace, keeping only non-empty lines
        stopwords = {line.strip() for line in f if line.strip()}
    return stopwords
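
# The stopword file is assumed to be plain UTF-8 text with one word per line
# and blank lines ignored, e.g. (hypothetical entries):
#   的
#   了
#   哈哈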
# Build a weighted string from the ranked keyword counts
def generate_weighted_text_from_counts(file_path, stopwords):
    """
    file_path: path to the Excel file holding the ranked keywords
    stopwords: collection of stopwords
    Returns the generated weighted text.
    """
    df = pd.read_excel(file_path)
    total_count = df.shape[0]  # total number of rows
    weighted_words = []
    # Assumes a default RangeIndex, so rows nearer the top get larger weights
    for index, row in df.iterrows():
        word = row['关键词']
        count = total_count - index  # weight by rank: the first row weighs the most
        if count > 2:  # equivalent to the original count / 2 > 1 check
            count //= 2  # damp large weights by halving
        if word not in stopwords and len(word) >= 2:  # skip stopwords and single characters
            weighted_words.extend([word] * count)  # repeat each word according to its weight
    return ' '.join(weighted_words)
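
# A worked example of the rank weighting (hypothetical 4-row sheet): with
# total_count = 4, row 0 gets count 4 - 0 = 4, halved to 2 because 4 > 2;
# row 1 gets 3 -> 1 (3 // 2); rows 2 and 3 keep 2 and 1 unchanged. Note that
# row 2 then outweighs row 1, a quirk of the halving rule above.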
# Generate and display the word cloud from the valid-word set and the weighted text
def generate_wordcloud(filtered_words, weighted_text, output_path='./Chiikawa/ch_词云.png', mask_img='./Chiikawa/537.png'):
    """
    filtered_words: set of valid words
    weighted_text: the weighted text
    """
    # Join the valid-word set into a space-separated string
    filtered_text_str = ' '.join(filtered_words)
    # Combine the weighted text with the valid-word string
    combined_text = weighted_text + ' ' + filtered_text_str
    img = imageio.imread(mask_img)
    # Define custom colors (shades of yellow)
    custom_colors = ['#ffff00', '#ffea00', '#ffd700', '#f5de57', '#fffacd']
    cmap = mcolors.ListedColormap(custom_colors)
    # Configure the word cloud
    wc = wordcloud.WordCloud(
        width=800,
        height=800,
        background_color='white',
        font_path='msyh.ttc',  # font file path (Microsoft YaHei, required to render Chinese text)
        max_font_size=200,
        min_font_size=10,
        colormap=cmap,
        contour_color='yellow',  # outline color
        contour_width=1,  # outline width
        mask=img,
        scale=4
    )
    # Generate the word cloud
    wc.generate(combined_text)
    # Save the word cloud image to a file
    wc.to_file(output_path)
    # Display the word cloud
    plt.figure(figsize=(10, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()
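
# Note on the mask (standard wordcloud behavior): pure white (#ffffff) areas of
# the mask image are treated as masked out, so words are drawn only inside the
# non-white shape. A usage sketch with the default paths from above:
#   generate_wordcloud({'吉伊卡哇'}, '吉伊卡哇 吉伊卡哇',
#                      output_path='./Chiikawa/ch_词云.png',
#                      mask_img='./Chiikawa/537.png')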
def main():
    break_down_words('./Chiikawa/sentences.xlsx', save_output=True)

def main_word_cloud():
    stopwords = load_stopwords_from_txt('./Chiikawa/ch_stop.txt')
    # Process the AI-related sentences to generate the valid words
    filtered_words = break_down_words('./Chiikawa/sentences.xlsx', stopwords)
    print('Valid words generated')
    # Read the keyword counts from ch_keyword_counts.xlsx and build the weighted text
    weighted_text = generate_weighted_text_from_counts('./Chiikawa/ch_keyword_counts.xlsx', stopwords)
    print('Weighted text generated')
    # Generate the word cloud
    generate_wordcloud(filtered_words, weighted_text)

if __name__ == '__main__':
    # main()
    main_word_cloud()