|
|
|
|
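# Variant 1: plain word cloud (active code below). Variant 2, the trophy-shaped word cloud, follows further down and is temporarily commented out.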
|
|
|
|
|
import os
from collections import Counter

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
|
|
|
|
|
|
|
|
|
|
# Folder that holds the collected danmaku .txt files.
# Written as a raw string so every backslash is taken literally: in a normal
# string the separator before "软" was a single backslash, i.e. an
# unrecognised escape sequence that newer Python versions warn about.
# The resulting path value is unchanged.
directory = r'E:\前端\软件工程\莎莎和陈梦\弹幕收集'
|
|
|
|
|
|
|
|
|
|
# Merge the contents of every .txt file in `directory` into one string.
# The pieces are collected in a list and joined once instead of growing a
# string with repeated `+=`. They are joined with a newline so that the last
# line of one file can never fuse with the first line of the next, which
# would let the segmenter produce a token spanning two files.
file_contents = []
for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        file_contents.append(file.read())
text = "\n".join(file_contents)
|
|
|
|
|
|
|
|
|
|
# Segment the merged text into tokens with jieba; the tokens are consumed by
# the stop-word filter further down.
# NOTE(review): jieba.cut is documented to return a generator, so `words` can
# presumably be iterated only once -- confirm before reusing it elsewhere.
words = jieba.cut(text)
|
|
|
|
|
|
|
|
|
|
# Stop-word set: tokens that should never appear in the word cloud.
# Each entry is listed once; the groupings are only a reading aid.
stop_words = {
    # pronouns and demonstratives
    "我", "你", "他", "她", "它", "这", "那",
    # particles and interjections
    "的", "了", "吗", "啊", "吧", "哦", "着", "哈哈哈",
    # common verbs and auxiliaries
    "是", "有", "要", "会", "能",
    # adverbs
    "也", "就", "不", "还",
    # prepositions and conjunctions
    "在", "从", "为", "对", "把", "和", "与", "但", "如", "所以",
    # position and measure words
    "上", "下", "中", "个",
    # punctuation
    "?", "!", ",", "。",
}
|
|
|
|
|
|
|
|
|
|
# Filter the segmented tokens before counting them.
def _is_meaningful(token):
    """Return True for a non-blank token of two or more characters that is not a stop word."""
    return bool(token.strip()) and len(token) >= 2 and token not in stop_words


filtered_words = [token for token in words if _is_meaningful(token)]
|
|
|
|
|
|
|
|
|
|
# Word-frequency count: map every remaining word to its number of occurrences.
# Counter is a dict subclass, so it can be passed anywhere the plain
# {word: count} dict was expected, and it keeps first-seen insertion order.
word_freq = Counter(filtered_words)
|
|
|
|
|
|
|
|
|
|
# Build the word cloud: configure the renderer first, then lay the words out
# from the frequency mapping (generate_from_frequencies works in place on the
# WordCloud object, the same two-step pattern the disabled variant uses).
# NOTE(review): 'simsun.ttc' is presumably resolved from the system font
# directory and is what lets the Chinese words render -- confirm on machines
# other than the author's.
wordcloud = WordCloud(
    font_path='simsun.ttc',
    width=800,
    height=400,
    background_color='white',
)
wordcloud.generate_from_frequencies(word_freq)
|
|
|
|
|
|
|
|
|
|
# Display the word cloud in a 10 x 5 inch figure with the axes hidden.
_, cloud_axes = plt.subplots(figsize=(10, 5))
cloud_axes.imshow(wordcloud, interpolation='bilinear')
cloud_axes.axis("off")
plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Variant 2: trophy-shaped word cloud (disabled -- every line of it below is commented out)
|
|
|
|
|
|
|
|
|
|
# from wordcloud import WordCloud, STOPWORDS
|
|
|
|
|
# import matplotlib.pyplot as plt
|
|
|
|
|
# import numpy as np
|
|
|
|
|
# import jieba.posseg as pseg
|
|
|
|
|
# from collections import Counter
|
|
|
|
|
# import PIL.Image as Image
|
|
|
|
|
# from matplotlib import colors
|
|
|
|
|
# import os
|
|
|
|
|
|
|
|
|
|
# directory = 'E:\\前端\\软件工程\\弹幕收集按序'
|
|
|
|
|
|
|
|
|
|
# #合并所有文件内容
|
|
|
|
|
# text = ""
|
|
|
|
|
# for filename in os.listdir(directory):
|
|
|
|
|
# if filename.endswith('.txt'):
|
|
|
|
|
# with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
|
|
|
|
|
# text += file.read()
|
|
|
|
|
|
|
|
|
|
# words = pseg.cut(text)
|
|
|
|
|
|
|
|
|
|
# #按指定长度和词性提取词
|
|
|
|
|
# report_words = []
|
|
|
|
|
# for word, flag in words:
|
|
|
|
|
# if (len(word) >= 2) and ('n' in flag): #提取的是名词
|
|
|
|
|
# report_words.append(word)
|
|
|
|
|
|
|
|
|
|
# #统计高频词汇
|
|
|
|
|
# result = Counter(report_words).most_common(300)
|
|
|
|
|
|
|
|
|
|
# #建立词汇字典
|
|
|
|
|
# content = dict(result)
|
|
|
|
|
# #输出词频统计结果
|
|
|
|
|
# for i in range(50):
|
|
|
|
|
# word,flag=result[i]
|
|
|
|
|
# print("{0:<10}{1:>5}".format(word,flag))
|
|
|
|
|
|
|
|
|
|
# #设置停用词
|
|
|
|
|
# stopwords = set(STOPWORDS)
|
|
|
|
|
# stopwords.update(["我", "你", "他", "她", "它", "是", "的", "了", "在", "吗", "啊", "吧",
|
|
|
|
|
# "也", "有", "这", "那", "从", "为", "上", "下", "和", "与", "就", "不",
|
|
|
|
|
# "中", "还", "要", "会", "能", "对", "着", "个", "把", "所以", "但", "也",
|
|
|
|
|
# "所以", "从", "如", "她", "他", "它", "还", "也", "吗", "啊", "哦", "?", "!", ",", "。"])
|
|
|
|
|
|
|
|
|
|
# #设置png掩膜
|
|
|
|
|
# background = Image.open("E:\前端\奖杯4.png").convert('RGB')
|
|
|
|
|
# mask = np.array(background)
|
|
|
|
|
|
|
|
|
|
# font_path = r"C:\Windows\Fonts\STLITI.TTF"
|
|
|
|
|
|
|
|
|
|
# max_font_size =100
|
|
|
|
|
# min_font_size =10
|
|
|
|
|
|
|
|
|
|
# #建立颜色数组,可随意更改显示颜色
|
|
|
|
|
# color_list = ['#FF274B']
|
|
|
|
|
# #调用颜色数组
|
|
|
|
|
# colormap = colors.ListedColormap(color_list)
|
|
|
|
|
|
|
|
|
|
# #生成词云
|
|
|
|
|
# wordcloud = WordCloud(scale=4, #输出清晰度
|
|
|
|
|
# font_path=font_path, #path of the font file used to draw the words (not an output path)
|
|
|
|
|
# colormap=colormap, #字体颜色
|
|
|
|
|
# width=1600, #输出图片宽度
|
|
|
|
|
# height=900, #输出图片高度
|
|
|
|
|
# background_color='white', #图片背景颜色
|
|
|
|
|
# stopwords=stopwords, #停用词
|
|
|
|
|
# mask=mask, #掩膜
|
|
|
|
|
# max_font_size=max_font_size, #最大字体大小
|
|
|
|
|
# min_font_size=min_font_size) #最小字体大小
|
|
|
|
|
# wordcloud.generate_from_frequencies(content)
|
|
|
|
|
|
|
|
|
|
# #使用 matplotlib 显示词云
|
|
|
|
|
# plt.imshow(wordcloud, interpolation='bilinear')
|
|
|
|
|
# plt.axis('off')
|
|
|
|
|
# plt.show()
|
|
|
|
|
# #保存词云图
|
|
|
|
|
# wordcloud.to_file("wordcloud.png")
|