|
|
@ -5,36 +5,36 @@ import matplotlib.pyplot as plt
|
|
|
|
from PIL import Image
|
|
|
|
from PIL import Image
|
|
|
|
import numpy as np
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
# Load the stop-word list
def load_stopwords(file_path):
    """Read a stop-word file and return its entries as a set.

    Args:
        file_path: Path to a UTF-8 text file holding one stop word per line.

    Returns:
        A set containing every non-empty line of the file with surrounding
        whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also strips a leading BOM,
    # which would otherwise stay glued to the first stop word and keep it
    # from ever matching.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines carry no stop word; keep the empty string out of the set.
        return {word for word in stripped_lines if word}
|
|
|
|
|
|
|
|
|
|
|
|
# Filter out stop words
def remove_stopwords(words_list, stopwords):
    """Return the words that are worth keeping for the word cloud.

    A word is kept when it is longer than one character and is not
    contained in ``stopwords``.

    Args:
        words_list: Iterable of word strings; it is consumed once, so a
            generator is accepted.
        stopwords: Container of words to exclude (membership is tested
            with ``in``).

    Returns:
        A list of the kept words in their original order.
    """
    # Single-character tokens are discarded together with the stop words.
    return [
        word
        for word in words_list
        if len(word) > 1 and word not in stopwords
    ]
|
|
|
|
|
|
|
|
|
|
|
|
# Read the Excel file and extract the danmu (bullet-comment) text
file_path = "danmu_data.xlsx"
df = pd.read_excel(file_path)
# Drop empty cells before converting: astype(str) would otherwise turn a
# missing value into the literal token "nan", which is longer than one
# character and would therefore survive filtering and appear in the cloud.
comments = df['danmu'].dropna().astype(str)
text = ' '.join(comments)

# Tokenize with jieba (cut_all=False disables full mode)
words = jieba.cut(text, cut_all=False)

# Load the stop-word list
stopwords_file = "D://edge//stop.txt"  # replace with the actual path
stopwords = load_stopwords(stopwords_file)

# Remove stop words
filtered_words = remove_stopwords(words, stopwords)

# Join the remaining words back into one space-separated string
words_list = ' '.join(filtered_words)

# Load the shape image and generate the word cloud
mask = np.array(Image.open("D://edge//kk.png"))
|
|
|
|
|
|
|
|
|
|
|
|
wordcloud = WordCloud(
|
|
|
|
wordcloud = WordCloud(
|
|
|
@ -47,10 +47,10 @@ wordcloud = WordCloud(
|
|
|
|
height=600
|
|
|
|
height=600
|
|
|
|
).generate(words_list)
|
|
|
|
).generate(words_list)
|
|
|
|
|
|
|
|
|
|
|
|
# Save the word cloud image
# Written before displaying: plt.show() can block until the figure window is
# closed, and saving first guarantees the PNG exists even if the script is
# interrupted while the window is open.
wordcloud.to_file("filtered_wordcloud.png")

# Display the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
|
|
|
|