Add 词云统计

master
hnu202109070127 4 years ago
parent 80a1e21e6e
commit 2eb624d2bd

@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 2 16:35:01 2022
@author: xe
"""
from os import path
from PIL import Image
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# 获得原始停止词列表
from wordcloud import STOPWORDS
import jieba
import random
# --- Word-frequency statistics -------------------------------------------
# Read the lyrics file; the context manager closes the handle (the original
# used a bare open() and leaked it).
# NOTE(review): hard-coded Windows path — confirm it exists on the target box.
with open('C:/Users/xe/Desktop/计算机/歌词.txt', encoding='utf8') as f:
    text_z = f.read()
# Segment the Chinese text with jieba.  Joining on spaces and re-splitting
# discards whitespace-only tokens emitted by the segmenter.
text_new = " ".join(jieba.cut(text_z)).split()
# Tally how many times each token occurs.
count_ = {}
for text in text_new:
    count_[text] = count_.get(text, 0) + 1
# --- Stop-word removal -----------------------------------------------------
# Load the stop-word list, one word per line.  The context manager closes
# the file (the original left the handle open), and the set comprehension
# replaces the manual readlines()/add loop.
with open('C:/Users/xe/Desktop/计算机/hit_stopwords.txt', encoding='utf-8') as f:
    zh_tc = {line.strip('\n') for line in f}
# Remove grammatical filler (articles, pronouns, conjunctions, ...) from the
# tally; pop(word, 0) is a no-op when the word is absent.
for word in zh_tc:
    count_.pop(word, 0)
# --- Top-10 report ----------------------------------------------------------
# Sort (word, count) pairs by count, descending.
items = sorted(count_.items(), key=lambda kv: kv[1], reverse=True)
infos, counts = [], []
# Slice instead of indexing items[i] for i in range(10): the original raised
# IndexError whenever the text had fewer than 10 distinct words.
for word, count in items[:10]:
    infos.append(word)
    counts.append(count)
    # Left-align the word in 10 columns, right-align the count in 5.
    print('{0:<10}{1:>5}'.format(word, count))
# Copy the tally into a plain dict in the shape WordCloud accepts as
# frequency input, printing each (word, count) pair along the way.
count_dct = {}
for key in count_:
    print(key, count_[key])
    count_dct[key] = count_[key]
print(count_dct)
# --- Plotting ---------------------------------------------------------------
# Transparent background: mode=RGBA, background_color=None (used below).
# Load the image whose non-white region constrains the word-cloud shape.
mask = np.array(Image.open("C:/Users/xe/Desktop/图3.png"))
# Per-word color function passed to WordCloud.
def random_color(word, font_size, position, orientation, font_path, random_state):
    """Return a random red-hued HSL color string for one word.

    WordCloud calls this once per word; *word*, *font_size*, *position*,
    *orientation* and *font_path* describe the word being drawn and are
    not needed for the color choice.  *random_state* is the random.Random
    instance WordCloud supplies; the original ignored it and drew from the
    global random module, which made seeded word clouds non-reproducible.
    """
    # Prefer WordCloud's generator so colors follow its random_state seed.
    rng = random_state if random_state is not None else random
    # Hue 0 (red); saturation and lightness jittered within 60-80 percent.
    return 'hsl(0, %d%%, %d%%)' % (rng.randint(60, 80), rng.randint(60, 80))
# Build the word cloud from the raw text.  font_path must point at a CJK
# font or Chinese glyphs render as mojibake; mode='RGBA' together with
# background_color=None yields a transparent background, and `mask`
# constrains the cloud to the image shape loaded above.
# NOTE(review): the font file name ends in '.otf.otf' — looks like a doubled
# extension; confirm it matches the actual file on disk before changing it.
wc = WordCloud(color_func=random_color,
               font_path='C:/Users/xe/Desktop/计算机/SourceHanSerifK-Light.otf.otf',
               mode='RGBA', background_color=None,
               mask=mask).generate(text_z)
# Figure size in inches.
width, height = 24, 14
# One sized figure is enough: the original called plt.figure() twice and
# left an empty default-size figure behind.
plt.figure(figsize=(width, height))
# Render the cloud bitmap; bilinear interpolation smooths the scaling.
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
Loading…
Cancel
Save