"""Generate a word cloud from the full danmu (bullet-comment) dataset,
with jieba keyword extraction and laughter normalization."""
import re

import numpy as np
import pandas as pd
import wordcloud
from jieba import analyse
from matplotlib.image import imread


def blue_color_func(word=None, font_size=None, position=None, orientation=None,
                    random_state=None, **kwargs):
    """Blue-hue color function for the word cloud.

    wordcloud calls ``color_func`` with keyword arguments (``word``,
    ``font_size``, ``position``, ``orientation``, ``random_state``,
    ``font_path``), so the parameters must carry exactly these names —
    the previous underscore-named positional signature raised a
    TypeError when the cloud was generated.

    Uses the ``random_state`` supplied by wordcloud (a seeded
    ``random.Random``) when given, so colors are reproducible; falls back
    to ``np.random`` otherwise.
    """
    rng = random_state if random_state is not None else np.random
    # random.Random.randint includes 90 while np.random.randint excludes it;
    # the one-unit lightness difference is irrelevant for styling.
    return f"hsl(210, 100%, {rng.randint(50, 90)}%)"


def normalize_hahaha(text):
    """Collapse any run of three or more '哈' into exactly '哈哈哈'."""
    return re.sub(r'哈{3,}', '哈哈哈', text)


def process_keywords(dm_list):
    """Return the top-100 jieba keywords of the joined danmu lines as one
    space-separated string.

    The previous version mapped a one-element list through a
    ThreadPoolExecutor, which added thread overhead without any actual
    parallelism; a direct call to ``extract_tags`` is equivalent.
    """
    dm_string = ' '.join(dm_list)
    keywords = analyse.extract_tags(dm_string, topK=100, withWeight=False,
                                    allowPOS=())
    return ' '.join(keywords)


def wordcloud_generation(danmu_data, stopwords, output_path):
    """Build the word-cloud image and save it to *output_path*.

    Parameters
    ----------
    danmu_data : pandas.DataFrame
        Must contain a 'danmu' column of comment strings.
    stopwords : set[str]
        Words excluded from the cloud.
    output_path : str
        Destination image path.
    """
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dm_list = [normalize_hahaha(text) for text in dm_list]
    dmreal_string = process_keywords(dm_list)
    # NOTE(review): mask image and font paths are hard-coded; confirm they
    # exist in the deployment environment.
    img = imread("/output/OIP.jpg")
    wc = wordcloud.WordCloud(
        stopwords=stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file(output_path)


def main():
    """Load the danmu spreadsheet and generate the word cloud."""
    dm = pd.read_excel('E:/Crawler/output/All_Danmu.xlsx', sheet_name='Sheet1')
    stopwords = {'我', '你', '他', '这', '个', '是', '的', '了', '啊', '吗',
                 '吧', '就', '都', '不', '也', '哈哈哈'}
    wordcloud_generation(dm, stopwords, '/output/alldanmu_dwordcloud.png')


if __name__ == '__main__':
    main()