102201510/barrage/wordCloud.py

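'''
Generate a word-cloud image.

Parameters:

filePath    - path of the text file that supplies the words for the cloud
maskImgPath - path of the mask image; set it to an empty string when no mask is used
saveImgPath - path where the generated image is saved
width       - horizontal size of the generated image
height      - vertical size of the generated image
save        - whether to save the generated image
'''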

import jieba
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the barrage text into words that are easy to process
def ReadAndCutWords(filePath):
    """Read a UTF-8 text file and segment its content with jieba.

    Args:
        filePath: Path of the text file to read.

    Returns:
        One string in which the tokens produced by ``jieba.cut`` are
        joined by single spaces.
    """
    with open(filePath, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    # cut_all=False is passed through unchanged to jieba.cut.
    tokens = jieba.cut(raw_text, cut_all=False)
    return ' '.join(tokens)

# Use TF-IDF to assign a weight to every word
def ChangeToFreq(word_list):
    """Map each token of a space-separated text to its TF-IDF weight.

    The whole text is fed to ``TfidfVectorizer`` as a one-document corpus,
    and the resulting matrix is summed over its rows to get one weight per
    vocabulary entry.

    NOTE: ``TfidfVectorizer`` is used with its default settings, which
    lower-case the input and only keep tokens of two or more word
    characters, so single-character words do not appear in the result.

    Args:
        word_list: Space-separated tokens, as returned by ReadAndCutWords.

    Returns:
        dict mapping each vocabulary term to its summed TF-IDF weight.
    """
    tfidf = TfidfVectorizer()
    # A single-element list: the entire text is the only document.
    weights = tfidf.fit_transform([word_list]).toarray().sum(axis=0)
    vocabulary = tfidf.get_feature_names_out()
    return dict(zip(vocabulary, weights))

# Build the word cloud from the word weights
def CreateWordCloud(word_freq, width, height, maskImgPath, saveImgPath,
                    save=False, fontPath='simhei.ttf'):
    """Render a word cloud, display it and optionally save it to disk.

    Args:
        word_freq: Mapping of word -> weight, passed to
            ``WordCloud.generate_from_frequencies``.
        width: Canvas width passed to ``WordCloud``. Per the wordcloud
            documentation it is ignored when a mask is supplied (the mask's
            shape is used instead).
        height: Canvas height passed to ``WordCloud``; same caveat as
            ``width``.
        maskImgPath: Path of the mask image. An empty string (or ``None``)
            means "no mask".
        saveImgPath: Destination path of the image; only used when ``save``
            is true.
        save: When true, write the rendered cloud to ``saveImgPath`` after
            it has been shown.
        fontPath: Font file handed to ``WordCloud``. Defaults to
            ``'simhei.ttf'`` (the previously hard-coded value) so existing
            callers are unaffected; pass another font on systems where
            SimHei is not installed.
    """
    mask = None
    if maskImgPath:
        # Load the pixels inside a context manager so the image file is
        # closed as soon as the array copy has been made.
        with Image.open(maskImgPath) as mask_img:
            mask = np.array(mask_img)

    wordcloud = WordCloud(font_path=fontPath,
                          mask=mask,
                          width=width,
                          height=height,
                          background_color='white').generate_from_frequencies(word_freq)

    if mask is not None:
        # Colour the words after the pixels of the mask image.
        image_colors = ImageColorGenerator(mask)
        wordcloud.recolor(color_func=image_colors)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    if save:
        wordcloud.to_file(saveImgPath)

def main():
    """Build, show and save the barrage word cloud using fixed settings."""
    # Segment the collected barrage text, then weight the resulting tokens.
    segmented_text = ReadAndCutWords('./docs/allBarrage.txt')
    weights = ChangeToFreq(segmented_text)
    CreateWordCloud(
        weights,
        width=1920,
        height=1440,
        maskImgPath='./docs/maskImg.png',
        saveImgPath='./docs/wordCloud.png',
        save=True,
    )


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()