You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

85 lines
2.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# coding=gbk
from __future__ import print_function
import jieba.analyse
import wordCloud.Convert
from PIL import Image
import numpy as np
from matplotlib import pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
def clean_using_stopword(text, stopwords_path):
    """Segment ``text`` with jieba and drop stop words and short tokens.

    Args:
        text: Raw text to segment.
        stopwords_path: Path to a UTF-8 file holding one stop word per line.

    Returns:
        The surviving tokens concatenated into a single string with no
        separator between them.
    """
    # Load the stop words once into a set so each membership test is O(1);
    # stripping every line lets entries with stray surrounding whitespace
    # still match the stripped token below.
    with open(stopwords_path, 'r', encoding='UTF-8') as f_stop:
        stopwords = {line.strip() for line in f_stop}

    kept = []
    # Iterate the segmenter output directly instead of joining the tokens
    # with "/" and splitting them apart again.
    for token in jieba.cut(text, cut_all=False):
        word = token.strip()
        # Drop stop words and anything one character or shorter.
        if len(word) > 1 and word not in stopwords:
            kept.append(token)
    return ''.join(kept)
def preprocessing(text_path, stopwords_path):
    """Read a UTF-8 text file and return its content with stop words removed.

    Args:
        text_path: Path to the UTF-8 text file to read.
        stopwords_path: Path to the stop-word file, forwarded to
            ``clean_using_stopword``.

    Returns:
        The string produced by ``clean_using_stopword`` for the file content.
    """
    with open(text_path, 'r', encoding='UTF-8') as f:
        content = f.read()
    # The file is already closed here; the former trailing ``return content``
    # could never execute and has been removed.
    return clean_using_stopword(content, stopwords_path)
def extract_keywords(max_words, text_path, stopwords_path):
    """Extract up to ``max_words`` weighted keywords from a text file.

    The weights are used later to size the words in the word cloud.

    Args:
        max_words: Maximum number of keywords requested from jieba.
        text_path: Path to the UTF-8 text file to analyse.
        stopwords_path: Path to the stop-word file used during preprocessing.

    Returns:
        A dict mapping each keyword to the weight reported by
        ``jieba.analyse.extract_tags``.
    """
    # NOTE(review): the original defined a part-of-speech filter ('nr',) here
    # but never passed it to extract_tags; the unused local was dropped and
    # the extraction itself is unchanged.
    cleaned_text = preprocessing(text_path, stopwords_path)
    tags = jieba.analyse.extract_tags(cleaned_text, max_words, withWeight=True)

    keywords = {}
    for word, weight in tags:
        # Echo every keyword with its weight as it is collected.
        print("%s---%f" % (word, weight))
        keywords[word] = weight
    return keywords
def draw_wordcloud(bg_image_path, font_path, text_path, stopwords_path,
                   background_color, max_words, save_path, filepath,
                   savepath, namedict_path):
    """Render a word cloud from a text file and save it as an image.

    Args:
        bg_image_path: Image read with ``plt.imread`` and used as the mask.
            It is read after the conversion step below.
        font_path: Font file handed to ``WordCloud``.
        text_path: UTF-8 text file the keywords are extracted from.
        stopwords_path: Stop-word file used while cleaning the text.
        background_color: Background colour of the rendered cloud.
        max_words: Maximum number of words extracted and drawn.
        save_path: Destination file of the rendered word cloud.
        filepath: Source image; passed to ``wordCloud.Convert.convert`` and
            also used as the colour reference for recolouring.
        savepath: Second argument of ``wordCloud.Convert.convert``
            (presumably the converted image's output path; confirm against
            that module).
        namedict_path: User dictionary loaded into jieba before segmentation.
    """
    wordCloud.Convert.convert(filepath, savepath)
    jieba.load_userdict(namedict_path)

    # Mask image that shapes the cloud.
    back_coloring = plt.imread(bg_image_path)

    # Word cloud settings.
    wc = WordCloud(
        font_path=font_path,                # font used for the words
        background_color=background_color,  # canvas background colour
        max_words=max_words,                # maximum number of words drawn
        mask=back_coloring,                 # mask image
    )

    # Build the cloud from the keyword -> weight mapping.
    wc.generate_from_frequencies(
        extract_keywords(max_words, text_path, stopwords_path))

    # Load the colour reference inside a context manager so the underlying
    # file handle is released as soon as the pixels are copied into the array.
    with Image.open(filepath) as color_source:
        mask = np.array(color_source)
    image_colors = ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)

    # Write the result once; the original wrote the same file twice.
    wc.to_file(save_path)
if __name__ == '__main__':
    # Image inputs: the source picture, the converted copy, and the mask
    # (which points at the converted copy).
    filepath = 'pic/image1.png'
    savepath = 'pic/convert.jpg'
    bg_image_path = "pic/convert.jpg"

    # Text resources: corpus, font, stop words and the jieba user dictionary.
    text_path = 'text/jsjs.txt'
    font_path = 'text/msyh.ttf'
    stopwords_path = 'text/stopword.txt'
    namedict_path = "text/namedict.txt"

    # Rendering options and output location.
    background_color = "white"  # background colour
    max_words = 2000            # maximum number of words shown in the cloud
    save_path = "out/wordcloud.jpg"

    draw_wordcloud(
        bg_image_path=bg_image_path,
        font_path=font_path,
        text_path=text_path,
        stopwords_path=stopwords_path,
        background_color=background_color,
        max_words=max_words,
        save_path=save_path,
        filepath=filepath,
        savepath=savepath,
        namedict_path=namedict_path,
    )