# (Web-viewer residue, not program text.) You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 56 lines
# 1.9 KiB

"""
Generate a word cloud from the full set of danmaku (bullet-comment) data, with keyword extraction and text normalization.
"""
import re
import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
from jieba import analyse
from concurrent.futures import ThreadPoolExecutor
def blue_color_func(word=None, font_size=None, position=None, orientation=None,
                    random_state=None, **kwargs):  # pylint: disable=unused-argument
    """Return a random blue ``hsl(...)`` colour string for one word-cloud word.

    This is meant to be passed as ``color_func`` to ``wordcloud.WordCloud``.
    The library supplies most arguments *by keyword* (``font_size=``,
    ``position=``, ``orientation=``, ``random_state=``, ``font_path=``), so the
    parameters must carry those names; anonymous required positionals would
    make the call fail with ``TypeError``. Every parameter has a default and
    extra keywords are absorbed by ``**kwargs``, so purely positional calls
    keep working too.

    Args:
        word: The word being drawn (ignored).
        font_size: Font size chosen for the word (ignored).
        position: Placement of the word (ignored).
        orientation: Orientation of the word (ignored).
        random_state: Random source offered by the caller (ignored; NumPy's
            global generator is used instead).
        **kwargs: Any further keywords such as ``font_path`` (ignored).

    Returns:
        A string ``"hsl(210, 100%, L%)"`` with integer lightness ``L`` drawn
        uniformly from 50..89.
    """
    # randint's upper bound is exclusive, so lightness never reaches 90.
    lightness = np.random.randint(50, 90)
    return f"hsl(210, 100%, {lightness}%)"
def normalize_hahaha(text):
    """Collapse every run of three or more "哈" into exactly "哈哈哈".

    Runs shorter than three characters and all other text are left untouched.

    Args:
        text: The comment string to normalise.

    Returns:
        ``text`` with each run of 3+ consecutive "哈" replaced by "哈哈哈".
    """
    # The quantifier needs an explicit atom: a bare `{3,}` is rejected by
    # `re` with "nothing to repeat".
    return re.sub(r'哈{3,}', '哈哈哈', text)
def process_keywords(dm_list):
    """Extract the top keywords from the comments and return them space-joined.

    All comments are concatenated into a single document, which is handed to
    jieba's ``analyse.extract_tags`` (at most 100 tags, no weights, no
    part-of-speech filter).

    Args:
        dm_list: Iterable of comment strings.

    Returns:
        The extracted keywords joined by single spaces.
    """
    dm_string = ' '.join(dm_list)
    # Only one document is analysed, so a thread pool mapped over a
    # one-element list gives no parallelism; call the extractor directly.
    keywords = analyse.extract_tags(dm_string, topK=100, withWeight=False, allowPOS=())
    return ' '.join(keywords)
def wordcloud_generation(danmu_data, stopwords, output_path, *,
                         mask_path="/output/OIP.jpg", font_path='msyhl.ttc'):
    """Build a word cloud from the ``danmu`` column and save it as an image.

    Steps: drop missing comments, cast to ``str``, normalise laughter runs via
    ``normalize_hahaha``, extract keywords via ``process_keywords``, then
    render the keywords with ``wordcloud.WordCloud`` and write the picture.

    Args:
        danmu_data: Table-like object indexable by ``'danmu'`` whose result
            supports ``dropna().astype(str).tolist()`` (main() passes a pandas
            DataFrame).
        stopwords: Collection of words the word cloud should exclude.
        output_path: Destination path for the rendered image.
        mask_path: Image file whose shape is used as the drawing mask, or
            ``None`` to render without a mask. Defaults to the path that was
            previously hard-coded.
        font_path: Font file used to draw the words. Defaults to the font
            that was previously hard-coded.
    """
    comments = danmu_data['danmu'].dropna().astype(str).tolist()
    comments = [normalize_hahaha(text) for text in comments]
    keyword_text = process_keywords(comments)

    mask = imread(mask_path) if mask_path is not None else None
    cloud = wordcloud.WordCloud(
        stopwords=stopwords,
        # NOTE(review): per the wordcloud docs, width/height are ignored when
        # a mask is supplied (the mask's shape wins) - confirm.
        width=1920,
        height=1200,
        background_color='white',
        font_path=font_path,
        mask=mask,
        max_words=100,
        color_func=blue_color_func,
    )
    cloud.generate(keyword_text)
    cloud.to_file(output_path)
def main():
    """Load the danmaku spreadsheet and render the word cloud from it."""
    dm = pd.read_excel('E:/Crawler/output/All_Danmu.xlsx', sheet_name='Sheet1')
    # NOTE(review): the set literal previously listed the empty string fifteen
    # times, which collapses to a single '' entry. Those entries were
    # presumably real stopwords lost in a copy/encoding step - restore them
    # from the original list if it is available.
    stopwords = {'', '哈哈哈'}
    wordcloud_generation(dm, stopwords, '/output/alldanmu_dwordcloud.png')
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()