"""Generate a word cloud from danmaku (bullet-comment) keyword data.

Pipeline: read sentences from an Excel file, segment them with jieba,
filter stopwords, build a frequency-weighted text from a ranked keyword
file, and render/save the combined word cloud.
"""
import pandas as pd
import jieba  # Chinese word segmentation
import wordcloud  # word-cloud rendering module
import matplotlib.pyplot as plt
import imageio  # for loading a local mask image to shape the cloud (currently unused)
import time


def break_down_words(file_path, stopwords=frozenset(), output_txt='stop.txt', output_turly=False):
    """Extract valid words from the danmaku column of an Excel file.

    Args:
        file_path: path to the Excel file; must contain a
            '包含关键词的弹幕' column of sentences.
        stopwords: collection of words to exclude. Default changed from
            '' to frozenset(): with '' the `in` test was a substring
            check that only passed by accident; an empty set is the
            correct type with identical results.
        output_txt: path of the TXT file written when saving is enabled.
        output_turly: when True, also save the valid words to
            *output_txt*. (Parameter name kept for backward compatibility.)

    Returns:
        set[str]: words of length >= 2 that are not stopwords.
    """
    # 1. Read the danmaku data from the Excel file.
    df = pd.read_excel(file_path)

    # 2. Join every sentence into one string.
    text = ' '.join(df['包含关键词的弹幕'].astype(str))

    # 3. Segment the combined text into individual words.
    text_list = jieba.lcut(text)

    # 4. Drop stopwords and keep only words of at least two characters.
    filtered_words = {word for word in text_list
                      if word not in stopwords and len(word) >= 2}

    # 5. Optionally persist the valid words, one per line.
    if output_turly:
        with open(output_txt, 'w', encoding='utf-8') as f:
            for word in filtered_words:
                f.write(f"{word}\n")
        # Bug fix: only report a save when a file was actually written
        # (previously printed unconditionally).
        print(f"有效词已保存到 {output_txt}")

    return filtered_words  # valid-word set for downstream use


def load_stopwords_from_txt(file_path):
    """Load stopwords from a TXT file, one word per line.

    Blank lines and surrounding whitespace are ignored.

    Returns:
        set[str]: the stopword set.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}


def generate_weighted_text_from_counts(file_path, stopwords):
    """Build a frequency-weighted text from a ranked keyword Excel file.

    Rows are assumed sorted by importance: the word in row *i* of *n*
    rows is repeated roughly (n - i) / 2 times (at least once), so
    earlier rows dominate the word cloud.

    Args:
        file_path: Excel file with a '关键词' column, one keyword per row.
        stopwords: collection of words to exclude.

    Returns:
        str: space-separated text with each keyword repeated by weight.
    """
    df = pd.read_excel(file_path)
    total_count = df.shape[0]  # number of keyword rows
    weighted_words = []

    for index, row in df.iterrows():
        word = row['关键词']
        count = total_count - index  # rank-based weight: earlier rows weigh more
        # Halve larger weights so top keywords do not drown out the rest.
        if count / 2 > 1:
            count = int(count / 2)
        if word not in stopwords and len(word) >= 2:  # skip stopwords / one-char words
            weighted_words.extend([word] * count)  # repeat word by its weight

    return ' '.join(weighted_words)


def generate_wordcloud(filtered_words, weighted_text, output_path='词云.png'):
    """Render, save, and display a word cloud image.

    Args:
        filtered_words: iterable of valid words (each contributes once).
        weighted_text: pre-weighted text (words repeated by frequency).
        output_path: file path for the saved PNG image.
    """
    # Combine the weighted text with the plain valid-word list so that
    # every valid word appears at least once.
    filtered_text_str = ' '.join(filtered_words)
    combined_text = weighted_text + ' ' + filtered_text_str

    # Configure the word-cloud renderer.
    wc = wordcloud.WordCloud(
        width=600,
        height=300,
        background_color='white',
        font_path='msyh.ttc',  # font file with Chinese glyph support
        max_font_size=200,
        min_font_size=10,
        colormap='cool',
        scale=4,  # up-scale for a sharper saved image
    )

    # Generate and persist the image.
    wc.generate(combined_text)
    wc.to_file(output_path)

    # Display the image without axes.
    plt.figure(figsize=(10, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()


def main_word_could():
    """Full pipeline: load stopwords, extract words, weight, render."""
    stopwords = load_stopwords_from_txt('stop.txt')

    # Extract valid words from the AI-related sentences.
    filtered_words = break_down_words('ai_sentences.xlsx', stopwords)
    print('词语已生成')

    # Build the weighted text from the keyword-count file.
    weighted_text = generate_weighted_text_from_counts('keyword_counts.xlsx', stopwords)
    print('加权文本已生成')

    # Render the word cloud.
    generate_wordcloud(filtered_words, weighted_text)


def main_word_could2():
    """Same pipeline as main_word_could, timing each stage."""
    start_time = time.time()
    stopwords = load_stopwords_from_txt('stop.txt')
    end_time = time.time()
    print(f"load_stopwords_from_txt函数耗时: {end_time - start_time:.4f} 秒")

    start_time = time.time()
    filtered_words = break_down_words('ai_sentences.xlsx', stopwords)
    end_time = time.time()
    print(f"break_down_words函数耗时: {end_time - start_time:.4f} 秒")

    start_time = time.time()
    weighted_text = generate_weighted_text_from_counts('keyword_counts.xlsx', stopwords)
    end_time = time.time()
    print(f"generate_weighted_text_from_counts函数耗时: {end_time - start_time:.4f} 秒")

    start_time = time.time()
    generate_wordcloud(filtered_words, weighted_text)
    end_time = time.time()
    print(f"generate_wordcloud函数耗时: {end_time - start_time:.4f} 秒")


# Script entry point moved after all definitions (original placed it
# before main_word_could2; behavior is unchanged since only
# main_word_could is invoked).
if __name__ == '__main__':
    main_word_could()