|
|
|
|
@ -0,0 +1,92 @@
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
"""
|
|
|
|
|
Created on Thu Jun 2 16:35:01 2022
|
|
|
|
|
|
|
|
|
|
@author: xe
|
|
|
|
|
"""
|
|
|
|
|
from os import path
|
|
|
|
|
from PIL import Image
|
|
|
|
|
import numpy as np
|
|
|
|
|
from wordcloud import WordCloud
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
# 获得原始停止词列表
|
|
|
|
|
from wordcloud import STOPWORDS
|
|
|
|
|
import jieba
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
|
|
# Load the complete lyrics text. The context manager closes the file handle
# as soon as the contents have been read instead of leaving it open.
with open('C:/Users/xe/Desktop/计算机/歌词.txt', encoding='utf8') as lyrics_file:
    text_z = lyrics_file.read()
|
|
|
|
|
|
|
|
|
|
# Segment the lyrics with jieba and collect the tokens in a list. Each piece
# jieba yields is split on whitespace, so whitespace-only pieces disappear and
# only non-blank tokens remain.
text_new = [
    piece
    for segment in jieba.cut(text_z)
    for piece in segment.split()
]
|
|
|
|
|
|
|
|
|
|
# Tally how many times each token occurs in text_new.
count_ = {}
for token in text_new:
    if token in count_:
        count_[token] += 1
    else:
        # First occurrence of this token.
        count_[token] = 1
|
|
|
|
|
|
|
|
|
|
# Load the stop-word list: one entry per line, with the trailing newline
# removed. The context manager guarantees the file is closed after reading.
with open('C:/Users/xe/Desktop/计算机/hit_stopwords.txt', encoding='utf-8') as stopword_file:
    zh_tc = {line.strip('\n') for line in stopword_file}
|
|
|
|
|
|
|
|
|
|
# Exclude most articles, pronouns, conjunctions and similar grammatical words
# by removing every stop word from the frequency table. The intersection is
# built as a separate set first, so deleting from count_ in the loop is safe;
# stop words that never occurred are simply not part of it.
for stopword in zh_tc.intersection(count_):
    del count_[stopword]
|
|
|
|
|
# Turn the frequency table into a list of (word, count) pairs ordered from the
# most frequent to the least frequent word.
items = sorted(count_.items(), key=lambda pair: pair[1], reverse=True)
|
|
|
|
|
|
|
|
|
|
# Collect and print the most frequent words (at most ten). Slicing instead of
# indexing items[0..9] avoids an IndexError when fewer than ten distinct words
# are left after stop-word removal.
infos, counts = [], []
for word, count in items[:10]:
    infos.append(word)
    counts.append(count)
    # Word left-aligned in 10 columns, count right-aligned in 5 columns.
    print('{0:<10}{1:>5}'.format(word, count))
|
|
|
|
|
|
|
|
|
|
# Prepare the segmented-word frequencies as a plain dict, the input form the
# word cloud can consume. Every word/count pair is printed first, then the
# complete mapping is printed once.
for token, occurrences in count_.items():
    print(token, occurrences)
count_dct = dict(count_)

print(count_dct)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Start plotting
|
|
|
|
|
|
|
|
|
|
# Transparent background: mode='RGBA', background_color=None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the image used as the word-cloud mask and convert it to a NumPy array.
mask_path = "C:/Users/xe/Desktop/图3.png"
mask = np.array(Image.open(mask_path))
|
|
|
|
|
|
|
|
|
|
# Colour callback for the word cloud: a random light red for every word.
def random_color(word, font_size, position, orientation, font_path, random_state):
    """Return a random light-red colour as an ``hsl(...)`` string.

    The hue is fixed at 0; saturation and lightness are each drawn uniformly
    from the integers 60..80 (both ends included).

    Args:
        word: Unused; part of the colour-callback signature.
        font_size: Unused; part of the colour-callback signature.
        position: Unused; part of the colour-callback signature.
        orientation: Unused; part of the colour-callback signature.
        font_path: Unused; part of the colour-callback signature.
        random_state: Object exposing ``randint(a, b)`` (for example a
            ``random.Random`` instance). When it is ``None`` the module-level
            ``random`` generator is used instead.

    Returns:
        A string of the form ``'hsl(0, S%, L%)'``.
    """
    # Draw from the generator handed in by the caller so that a seeded
    # generator reproduces the same colours; fall back to the global one.
    rng = random if random_state is None else random_state
    saturation = rng.randint(60, 80)
    lightness = rng.randint(60, 80)
    return 'hsl(0, %d%%, %d%%)' % (saturation, lightness)
|
|
|
|
|
|
|
|
|
|
# Build the word cloud from count_dct, the jieba-segmented frequencies with the
# stop words already removed. Passing the raw text to generate() would ignore
# that table and leave tokenisation to WordCloud's own splitter instead.
# font_path must point to a font with Chinese glyphs, otherwise the Chinese
# text is rendered garbled. mode='RGBA' together with background_color=None
# gives a transparent background.
wc = WordCloud(
    color_func=random_color,
    font_path='C:/Users/xe/Desktop/计算机/SourceHanSerifK-Light.otf.otf',
    mode='RGBA',
    background_color=None,
    mask=mask,
).generate_from_frequencies(count_dct)
|
|
|
|
|
|
|
|
|
|
# Canvas size passed to figsize.
width, height = 24, 14
# Create exactly one canvas of the requested size; an additional bare
# plt.figure() call would only leave an empty extra figure behind.
plt.figure(figsize=(width, height))
# Draw the word cloud and hide the axes.
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
# Without show() nothing is displayed when the file runs as a plain script.
plt.show()
|