You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

57 lines
1.4 KiB

import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
# Load the stopword list.
def load_stopwords(file_path):
    """Read a stopword file and return its entries as a set.

    The file is expected to hold one stopword per line. Surrounding
    whitespace is stripped from every line and empty lines are skipped.

    Args:
        file_path: Path of the UTF-8 encoded stopword file.

    Returns:
        A set containing every non-empty, stripped line of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM
    # (written by some Windows editors) so the first stopword is not
    # silently prefixed with "\ufeff" and therefore never matched.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        stopwords = {entry for entry in stripped_lines if entry}
    return stopwords
# Filter out stopwords.
def remove_stopwords(words_list, stopwords):
    """Return the tokens worth keeping for the word cloud.

    A token is kept when it is not a stopword and still has more than one
    character after surrounding whitespace is stripped. Kept tokens are
    returned unchanged and in their original order.

    Args:
        words_list: Any iterable of string tokens (a list or a generator).
        stopwords: Container of words to discard; a set gives O(1) lookups.

    Returns:
        A list of the tokens that passed the filter.
    """
    return [
        word
        for word in words_list
        # Measuring the stripped length also rejects whitespace-only tokens
        # such as "\r\n", which would pass a plain len(word) > 1 check.
        if word not in stopwords and len(word.strip()) > 1
    ]
# Read the Excel file and extract the danmu (bullet-comment) text.
file_path = "danmu_data.xlsx"
df = pd.read_excel(file_path)
# Drop empty cells before converting: astype(str) would otherwise turn each
# NaN into the literal string "nan", which would then show up as a word.
comments = df['danmu'].dropna().astype(str)
text = ' '.join(comments)

# Segment the text with jieba (cut_all=False selects the default, non-full
# mode). The result is consumed exactly once, by remove_stopwords below.
words = jieba.cut(text, cut_all=False)

# Load the stopword list.
stopwords_file = "D://edge//stop.txt"  # replace with the actual path
stopwords = load_stopwords(stopwords_file)

# Remove stopwords (and single-character tokens).
filtered_words = remove_stopwords(words, stopwords)

# Re-join the surviving tokens into one space-separated string, which is the
# input format WordCloud.generate expects.
words_list = ' '.join(filtered_words)

# Load the shape image and build the word cloud.
mask = np.array(Image.open("D://edge//kk.png"))
wordcloud = WordCloud(
    font_path='simhei.ttf',  # a font with CJK glyphs is required for Chinese
    background_color='white',
    mask=mask,
    contour_width=1,
    contour_color='black',
    # NOTE: per the wordcloud documentation, width/height are ignored when a
    # mask is supplied; the canvas takes the mask's shape instead.
    width=800,
    height=600
).generate(words_list)

# Save the image first so the file is written even if opening the display
# window fails or the process is interrupted while the window is open.
wordcloud.to_file("filtered_wordcloud.png")

# Show the word cloud.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()