Add 词云统计

master
hnu202109070127 4 years ago
parent 80a1e21e6e
commit 2eb624d2bd

@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 2 16:35:01 2022
@author: xe
"""
from os import path
from PIL import Image
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# 获得原始停止词列表
from wordcloud import STOPWORDS
import jieba
import random
# --- Word-frequency statistics -------------------------------------------
# Read the lyrics file; the context manager closes the handle (the original
# used a bare open() and leaked it).
# NOTE(review): hard-coded Windows path — confirm it exists on the target box.
with open('C:/Users/xe/Desktop/计算机/歌词.txt', encoding='utf8') as f:
    text_z = f.read()
# Segment the Chinese text with jieba.  Joining on spaces and re-splitting
# discards whitespace-only tokens emitted by the segmenter.
text_new = " ".join(jieba.cut(text_z)).split()
# Tally how many times each token occurs.
count_ = {}
for text in text_new:
    count_[text] = count_.get(text, 0) + 1
# --- Stop-word removal -----------------------------------------------------
# Load the stop-word list, one word per line.  The context manager closes
# the file (the original left the handle open), and the set comprehension
# replaces the manual readlines()/add loop.
with open('C:/Users/xe/Desktop/计算机/hit_stopwords.txt', encoding='utf-8') as f:
    zh_tc = {line.strip('\n') for line in f}
# Remove grammatical filler (articles, pronouns, conjunctions, ...) from the
# tally; pop(word, 0) is a no-op when the word is absent.
for word in zh_tc:
    count_.pop(word, 0)
# --- Top-10 report ----------------------------------------------------------
# Sort (word, count) pairs by count, descending.
items = sorted(count_.items(), key=lambda kv: kv[1], reverse=True)
infos, counts = [], []
# Slice instead of indexing items[i] for i in range(10): the original raised
# IndexError whenever the text had fewer than 10 distinct words.
for word, count in items[:10]:
    infos.append(word)
    counts.append(count)
    # Left-align the word in 10 columns, right-align the count in 5.
    print('{0:<10}{1:>5}'.format(word, count))
# Copy the tally into a plain dict in the shape WordCloud accepts as
# frequency input, printing each (word, count) pair along the way.
count_dct = {}
for key in count_:
    print(key, count_[key])
    count_dct[key] = count_[key]
print(count_dct)
# --- Plotting ---------------------------------------------------------------
# Transparent background: mode=RGBA, background_color=None (used below).
# Load the image whose non-white region constrains the word-cloud shape.
mask = np.array(Image.open("C:/Users/xe/Desktop/图3.png"))
# Per-word color function passed to WordCloud.
def random_color(word, font_size, position, orientation, font_path, random_state):
    """Return a random red-hued HSL color string for one word.

    WordCloud calls this once per word; *word*, *font_size*, *position*,
    *orientation* and *font_path* describe the word being drawn and are
    not needed for the color choice.  *random_state* is the random.Random
    instance WordCloud supplies; the original ignored it and drew from the
    global random module, which made seeded word clouds non-reproducible.
    """
    # Prefer WordCloud's generator so colors follow its random_state seed.
    rng = random_state if random_state is not None else random
    # Hue 0 (red); saturation and lightness jittered within 60-80 percent.
    return 'hsl(0, %d%%, %d%%)' % (rng.randint(60, 80), rng.randint(60, 80))
# Build the word cloud from the raw text.  font_path must point at a CJK
# font or Chinese glyphs render as mojibake; mode='RGBA' together with
# background_color=None yields a transparent background, and `mask`
# constrains the cloud to the image shape loaded above.
# NOTE(review): the font file name ends in '.otf.otf' — looks like a doubled
# extension; confirm it matches the actual file on disk before changing it.
wc = WordCloud(color_func=random_color,
               font_path='C:/Users/xe/Desktop/计算机/SourceHanSerifK-Light.otf.otf',
               mode='RGBA', background_color=None,
               mask=mask).generate(text_z)
# Figure size in inches.
width, height = 24, 14
# One sized figure is enough: the original called plt.figure() twice and
# left an empty default-size figure behind.
plt.figure(figsize=(width, height))
# Render the cloud bitmap; bilinear interpolation smooths the scaling.
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
Loading…
Cancel
Save