You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

126 lines
4.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# Variant 1: plain word cloud. Variant 2 (trophy-shaped word cloud) follows below and is commented out for now.
import os
from collections import Counter

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Folder that holds the collected danmaku (bullet-comment) .txt files.
# Raw string so the Windows backslashes can never be read as escape
# sequences; the resulting value is identical to the previous literal.
directory = r'E:\前端\软件工程\弹幕收集按序'

# Words that carry no meaning on their own and are left out of the cloud.
# NOTE(review): in the previous revision every entry except "所以" was an
# empty string, so the filter removed almost nothing. The entries below were
# restored from the stop-word list of the commented-out trophy variant later
# in this file, whose "所以" entries sit at the same positions -- confirm this
# is the intended list. Entries that are empty in both lists are omitted;
# blank tokens are dropped by count_words() regardless.
stop_words = {
    "我", "你", "他", "她", "它", "是", "的", "了", "在", "吗", "啊", "吧",
    "也", "有", "这", "那", "从", "为", "上", "下", "和", "与", "就", "不",
    "中", "还", "要", "会", "能", "对", "着", "个", "把", "所以", "但",
    "如", "哦", "。",
}


def read_corpus(folder):
    """Return the text of every ``.txt`` file in *folder*, newline-joined.

    Files are read as UTF-8 in sorted filename order so repeated runs see
    the same text. A newline is inserted between files so that the last
    word of one file cannot fuse with the first word of the next during
    segmentation.

    Raises:
        OSError: if *folder* or one of its files cannot be read.
    """
    parts = []
    for filename in sorted(os.listdir(folder)):
        if filename.endswith('.txt'):
            with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
                parts.append(file.read())
    return "\n".join(parts)


def count_words(words, excluded=None):
    """Count how often each token occurs in *words*.

    Whitespace-only tokens and tokens contained in *excluded* are skipped.

    Args:
        words: iterable of string tokens (e.g. the output of ``jieba.cut``).
        excluded: container of tokens to ignore; defaults to the
            module-level ``stop_words``.

    Returns:
        A ``collections.Counter`` mapping token -> number of occurrences.
    """
    if excluded is None:
        excluded = stop_words
    return Counter(
        word for word in words if word.strip() and word not in excluded
    )


def main():
    """Build the word cloud from the files in ``directory`` and display it."""
    text = read_corpus(directory)
    word_freq = count_words(jieba.cut(text))
    if not word_freq:
        # WordCloud raises an opaque ValueError on an empty frequency table;
        # stop with an actionable message instead.
        raise SystemExit(
            f"No usable words found in the .txt files under {directory!r}"
        )

    # Generate the word cloud (simsun.ttc supplies the Chinese glyphs).
    cloud = WordCloud(
        font_path='simsun.ttc',
        width=800,
        height=400,
        background_color='white',
    ).generate_from_frequencies(word_freq)

    # Display the word cloud.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


if __name__ == "__main__":
    main()
# Variant 2: trophy-shaped word cloud (uses a PNG mask; currently commented out)
# from wordcloud import WordCloud, STOPWORDS
# import matplotlib.pyplot as plt
# import numpy as np
# import jieba.posseg as pseg
# from collections import Counter
# import PIL.Image as Image
# from matplotlib import colors
# import os
# directory = 'E:\\前端\\软件工程\\弹幕收集按序'
# #合并所有文件内容
# text = ""
# for filename in os.listdir(directory):
# if filename.endswith('.txt'):
# with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
# text += file.read()
# words = pseg.cut(text)
# #按指定长度和词性提取词
# report_words = []
# for word, flag in words:
# if (len(word) >= 2) and ('n' in flag): #提取的是名词
# report_words.append(word)
# #统计高频词汇
# result = Counter(report_words).most_common(300)
# #建立词汇字典
# content = dict(result)
# #输出词频统计结果
# for i in range(50):
# word,flag=result[i]
# print("{0:<10}{1:>5}".format(word,flag))
# #设置停用词
# stopwords = set(STOPWORDS)
# stopwords.update(["我", "你", "他", "她", "它", "是", "的", "了", "在", "吗", "啊", "吧",
# "也", "有", "这", "那", "从", "为", "上", "下", "和", "与", "就", "不",
# "中", "还", "要", "会", "能", "对", "着", "个", "把", "所以", "但", "也",
# "所以", "从", "如", "她", "他", "它", "还", "也", "吗", "啊", "哦", "", "", "", "。"])
# #设置png掩膜
# background = Image.open("E:\前端\奖杯4.png").convert('RGB')
# mask = np.array(background)
# font_path = r"C:\Windows\Fonts\STLITI.TTF"
# max_font_size =100
# min_font_size =10
# #建立颜色数组,可随意更改显示颜色
# color_list = ['#FF274B']
# #调用颜色数组
# colormap = colors.ListedColormap(color_list)
# #生成词云
# wordcloud = WordCloud(scale=4, #输出清晰度
# font_path=font_path, # font file path (not an output path)
# colormap=colormap, #字体颜色
# width=1600, #输出图片宽度
# height=900, #输出图片高度
# background_color='white', #图片背景颜色
# stopwords=stopwords, #停用词
# mask=mask, #掩膜
# max_font_size=max_font_size, #最大字体大小
# min_font_size=min_font_size) #最小字体大小
# wordcloud.generate_from_frequencies(content)
# #使用 matplotlib 显示词云
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()
# #保存词云图
# wordcloud.to_file("wordcloud.png")