You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

126 lines
4.1 KiB

#第一种普通词云图,下面第二种奖杯词云图,暂时注释掉了
import os
from collections import Counter

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Directory whose .txt files are merged into one corpus. A raw string keeps the
# backslashes literal; the unprefixed form relied on invalid escape sequences
# (e.g. "\前"), which newer Python versions warn about.
directory = r'E:\前端\软件工程\弹幕收集按序'

# Merge the contents of every .txt file found directly in the directory.
# Pieces are collected in a list and joined once instead of growing a string
# with repeated "+=".
chunks = []
for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        chunks.append(file.read())
text = "".join(chunks)

# Segment the merged text into words with jieba.
words = jieba.cut(text)

# Stop-word set. The original literal listed dozens of "" entries plus "所以"
# twice, which collapses to exactly these two values.
# NOTE(review): the empty entries look like single-character stop words that
# were lost; the commented-out second variant further down lists candidates
# ("我", "你", "的", ...). Confirm with the author before restoring them.
stop_words = {"", "所以"}

# Drop blank/whitespace-only tokens and stop words.
filtered_words = [
    word for word in words
    if word.strip() and word not in stop_words
]

# Word-frequency table. Counter keeps first-seen insertion order, and the
# result is converted back to a plain dict so word_freq keeps its original type.
word_freq = dict(Counter(filtered_words))

# Build the word cloud from the frequency table.
# font_path names the font file used for rendering (presumably chosen so the
# Chinese text displays correctly -- TODO confirm the font is available).
wordcloud = WordCloud(
    font_path='simsun.ttc',
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(word_freq)

# Display the word cloud without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
#第二种奖杯词云图
# from wordcloud import WordCloud, STOPWORDS
# import matplotlib.pyplot as plt
# import numpy as np
# import jieba.posseg as pseg
# from collections import Counter
# import PIL.Image as Image
# from matplotlib import colors
# import os
# directory = 'E:\\前端\\软件工程\\弹幕收集按序'
# #合并所有文件内容
# text = ""
# for filename in os.listdir(directory):
# if filename.endswith('.txt'):
# with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
# text += file.read()
# words = pseg.cut(text)
# #按指定长度和词性提取词
# report_words = []
# for word, flag in words:
# if (len(word) >= 2) and ('n' in flag): #提取的是名词
# report_words.append(word)
# #统计高频词汇
# result = Counter(report_words).most_common(300)
# #建立词汇字典
# content = dict(result)
# #输出词频统计结果
# for i in range(50):
# word,flag=result[i]
# print("{0:<10}{1:>5}".format(word,flag))
# #设置停用词
# stopwords = set(STOPWORDS)
# stopwords.update(["我", "你", "他", "她", "它", "是", "的", "了", "在", "吗", "啊", "吧",
# "也", "有", "这", "那", "从", "为", "上", "下", "和", "与", "就", "不",
# "中", "还", "要", "会", "能", "对", "着", "个", "把", "所以", "但", "也",
# "所以", "从", "如", "她", "他", "它", "还", "也", "吗", "啊", "哦", "", "", "", "。"])
# #设置png掩膜
# background = Image.open("E:\前端\奖杯4.png").convert('RGB')
# mask = np.array(background)
# font_path = r"C:\Windows\Fonts\STLITI.TTF"
# max_font_size =100
# min_font_size =10
# #建立颜色数组,可随意更改显示颜色
# color_list = ['#FF274B']
# #调用颜色数组
# colormap = colors.ListedColormap(color_list)
# #生成词云
# wordcloud = WordCloud(scale=4, #输出清晰度
# font_path=font_path, # path to the font file
# colormap=colormap, #字体颜色
# width=1600, #输出图片宽度
# height=900, #输出图片高度
# background_color='white', #图片背景颜色
# stopwords=stopwords, #停用词
# mask=mask, #掩膜
# max_font_size=max_font_size, #最大字体大小
# min_font_size=min_font_size) #最小字体大小
# wordcloud.generate_from_frequencies(content)
# #使用 matplotlib 显示词云
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()
# #保存词云图
# wordcloud.to_file("wordcloud.png")