"""get_wordcloud_pic.py: draw a mask-shaped word cloud from danmu comments.

The script reads the ``danmu`` column of ``danmu_data.xlsx``, segments the
text with jieba, drops stop words and single-character tokens, renders a
word cloud inside the shape given by a mask image, saves it to
``filtered_wordcloud.png`` and displays it with matplotlib.
"""
from typing import Iterable, List, Set

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from wordcloud import WordCloud


# 1. Read the stop-word list.
def load_stopwords(file_path: str) -> Set[str]:
    """Return the set of stop words stored one per line in *file_path*.

    Each line is stripped of surrounding whitespace. The file is decoded
    with ``utf-8-sig`` so that a leading UTF-8 byte-order mark, if present,
    is discarded instead of being glued onto the first stop word; files
    without a BOM are read exactly as plain UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return {line.strip() for line in f}


# 2. Filter out stop words.
def remove_stopwords(words_list: Iterable[str], stopwords: Set[str]) -> List[str]:
    """Return the words that are not stop words and are longer than one character.

    *words_list* may be any iterable of strings (including a generator);
    the original order of the kept words is preserved.
    """
    return [word for word in words_list if word not in stopwords and len(word) > 1]


# 3. Read the Excel file and extract the danmu text.
file_path = "danmu_data.xlsx"
df = pd.read_excel(file_path)
# Drop missing cells before casting: astype(str) would otherwise turn them
# into the literal word "nan", which would then show up in the cloud.
comments = df['danmu'].dropna().astype(str)
text = ' '.join(comments)

# 4. Segment the text with jieba.
words = jieba.cut(text, cut_all=False)

# 5. Load the stop-word list.
stopwords_file = "D://edge//stop.txt"  # replace with the actual path
stopwords = load_stopwords(stopwords_file)

# 6. Remove the stop words.
filtered_words = remove_stopwords(words, stopwords)

# 7. Join the remaining words back into one space-separated string.
words_list = ' '.join(filtered_words)

# 8. Load the shape image and generate the word cloud.
mask = np.array(Image.open("D://edge//kk.png"))

# NOTE(review): the wordcloud documentation states that width/height are
# ignored when a mask is supplied (the mask's shape is used) - confirm.
wordcloud = WordCloud(
    font_path='simhei.ttf',  # the font must be able to render Chinese text
    background_color='white',
    mask=mask,
    contour_width=1,
    contour_color='black',
    width=800,
    height=600,
).generate(words_list)

# 9. Save the word cloud image.
# Saved before displaying: plt.show() may block until the window is closed,
# so writing the file first keeps the output even if the viewer is interrupted.
wordcloud.to_file("filtered_wordcloud.png")

# 10. Display the word cloud.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()