import pandas as pd
import jieba  # Chinese word segmentation
import wordcloud  # word-cloud rendering
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import imageio  # reads the local image used as the word-cloud mask
import time  # NOTE(review): currently unused; kept in case other code relies on it


def break_down_words(file_path, stopwords=(), output_txt='./Chiikawa/ch_stop.txt', output_turly=False):
    """Extract effective words from the danmaku column of an Excel file.

    Args:
        file_path: path to an Excel file containing a '包含关键词的弹幕' column.
        stopwords: container of words to exclude. Default is an empty tuple;
            a set is recommended for O(1) membership tests. (The previous
            default of '' made ``word not in stopwords`` a *substring* test
            whenever a plain string was passed — a latent bug.)
        output_txt: path of the TXT file the words are written to.
        output_turly: when True, also save the words to ``output_txt``.
            (Misspelling of "truly" kept for backward compatibility —
            existing callers pass this keyword by name.)

    Returns:
        set[str]: segmented words of length >= 2 that are not stopwords.
    """
    # 1. Read the danmaku (bullet-comment) data from the Excel file.
    df = pd.read_excel(file_path)
    # 2. Join every sentence into one string.
    text = ' '.join(df['包含关键词的弹幕'].astype(str))
    # 3. Segment the text into individual words.
    text_list = jieba.lcut(text)
    # 4. Drop stopwords and keep only words with at least 2 characters.
    filtered_words = {
        word for word in text_list
        if word not in stopwords and len(word) >= 2
    }
    # 5. Optionally persist the words, one per line.
    if output_turly:
        with open(output_txt, 'w', encoding='utf-8') as f:
            f.writelines(f"{word}\n" for word in filtered_words)
        print(f"有效词已保存到 {output_txt}")
    return filtered_words  # returned for downstream use


def load_stopwords_from_txt(file_path):
    """Load stopwords (one per line) from a TXT file; return them as a set."""
    with open(file_path, 'r', encoding='utf-8') as f:
        # Strip whitespace and skip blank lines.
        return {line.strip() for line in f if line.strip()}


def generate_weighted_text_from_counts(file_path, stopwords):
    """Build a frequency-weighted text from a keyword-count Excel file.

    Rows are assumed to be sorted by descending frequency: the first row
    gets the largest weight (the total row count), each following row one
    less, and any weight above 2 is halved to flatten the distribution.

    Args:
        file_path: Excel file with a '关键词' column.
        stopwords: words to exclude.

    Returns:
        str: space-separated text in which each keyword is repeated
        proportionally to its weight.
    """
    df = pd.read_excel(file_path)
    total_count = df.shape[0]  # total number of rows
    weighted_words = []
    for index, row in df.iterrows():
        word = row['关键词']
        count = total_count - index  # earlier rows receive larger weights
        if count > 2:  # equivalent to the original `count/2 > 1` float test
            count //= 2  # halve large weights so top words do not dominate
        if word not in stopwords and len(word) >= 2:  # exclude stopwords
            weighted_words.extend([word] * count)  # repeat word `count` times
    return ' '.join(weighted_words)


def generate_wordcloud(filtered_words, weighted_text,
                       output_path='./Chiikawa/ch_词云.png',
                       mask_img='./Chiikawa/537.png'):
    """Render, save, and display a word cloud.

    Args:
        filtered_words: set of effective words.
        weighted_text: frequency-weighted text (words repeated by weight).
        output_path: where the resulting PNG is saved.
        mask_img: image whose opaque region shapes the cloud.
    """
    # Combine the weighted text with the plain effective-word list.
    combined_text = weighted_text + ' ' + ' '.join(filtered_words)
    img = imageio.imread(mask_img)
    # Custom palette: shades of yellow.
    custom_colors = ['#ffff00', '#ffea00', '#ffd700', '#f5de57', '#fffacd']
    cmap = mcolors.ListedColormap(custom_colors)
    # Configure the word cloud.
    wc = wordcloud.WordCloud(
        width=800,
        height=800,
        background_color='white',
        font_path='msyh.ttc',  # font file that supports Chinese glyphs
        max_font_size=200,
        min_font_size=10,
        colormap=cmap,
        contour_color='yellow',  # outline color
        contour_width=1,  # outline width
        mask=img,
        scale=4,
    )
    wc.generate(combined_text)
    wc.to_file(output_path)  # save the rendered cloud
    # Display the word cloud.
    plt.figure(figsize=(10, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()


def main():
    """Generate the effective-word list and save it to TXT only."""
    break_down_words('./Chiikawa/sentences.xlsx', output_turly=True)


def main_word_could():
    """Full pipeline: stopwords -> effective words -> weighted text -> cloud."""
    stopwords = load_stopwords_from_txt('./Chiikawa/ch_stop.txt')
    # Segment the sentences into effective words.
    filtered_words = break_down_words('./Chiikawa/sentences.xlsx', stopwords)
    print('词语已生成')
    # Read keyword counts and build the weighted text.
    weighted_text = generate_weighted_text_from_counts('./Chiikawa/ch_keyword_counts.xlsx', stopwords)
    print('加权文本已生成')
    # Render the word cloud.
    generate_wordcloud(filtered_words, weighted_text)


if __name__ == '__main__':
    # main()
    main_word_could()