You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
49 lines
1.5 KiB
49 lines
1.5 KiB
2 months ago
"""
Generate a word-cloud image from danmu (bullet-comment) data.
"""
|
||
|
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
import wordcloud
|
||
|
from matplotlib.image import imread
|
||
|
import jieba
|
||
|
from concurrent.futures import ThreadPoolExecutor
|
||
|
|
||
|
def blue_color_func(_random_state=None, **_kwargs):
    """Return a blue HSL color string with a randomly chosen lightness.

    Intended for use as the ``color_func`` callback of ``wordcloud.WordCloud``.
    Hue (210) and saturation (100%) are fixed; only the lightness varies and is
    drawn from the integers 50..89.

    Args:
        _random_state: First positional argument. Used as the random source
            only when it is a ``random.Random`` instance; any other value is
            ignored. NOTE(review): WordCloud appears to pass the word itself
            as the first positional argument - confirm against the installed
            wordcloud version.
        **_kwargs: Remaining callback arguments. ``random_state`` is consulted
            (and takes precedence) when it is a ``random.Random`` instance, so
            a seeded WordCloud produces reproducible colors; everything else
            is ignored.

    Returns:
        A string of the form ``"hsl(210, 100%, <lightness>%)"``.
    """
    import random  # local import: the file's import header does not provide it

    for candidate in (_kwargs.get("random_state"), _random_state):
        if isinstance(candidate, random.Random):
            # random.Random.randint is inclusive on both ends, so 50..89
            # matches the half-open range of np.random.randint(50, 90).
            lightness = candidate.randint(50, 89)
            break
    else:
        # No usable generator supplied: fall back to NumPy's global state.
        lightness = np.random.randint(50, 90)
    return f"hsl(210, 100%, {lightness}%)"
|
||
|
|
||
|
def process_text(danmu_list, tokenizer=None):
    """Tokenize every danmu string and join all tokens into one string.

    Each text is segmented on a worker thread of a ``ThreadPoolExecutor``;
    tokens are joined with single spaces, first within each text and then
    across texts.

    Args:
        danmu_list: Iterable of strings to segment.
        tokenizer: Optional callable taking one string and returning an
            iterable of string tokens. Defaults to ``jieba.lcut``, which is
            looked up only when no tokenizer is given.

    Returns:
        One space-separated string of all tokens, in input order. An empty
        input yields an empty string.
    """
    if tokenizer is None:
        tokenizer = jieba.lcut

    def _segment(text):
        # Space-join the tokens of a single danmu line.
        return ' '.join(tokenizer(text))

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in the order of the inputs, so the
        # joined string keeps the original ordering of the danmu texts.
        dm_string = ' '.join(executor.map(_segment, danmu_list))
    return dm_string
|
||
|
|
||
|
def wordcloud_generation(danmu_data, stopwords, output_path,
                         mask_path="/output/OIP.jpg", font_path='msyhl.ttc'):
    """Build a word cloud from the ``danmu`` column and save it as an image.

    Args:
        danmu_data: Table-like object indexable by ``'danmu'``; the result
            must support ``dropna().astype(str).tolist()`` (main() passes a
            pandas DataFrame).
        stopwords: Collection of words passed to WordCloud as ``stopwords``.
        output_path: Destination path handed to ``WordCloud.to_file``.
        mask_path: Path of the image read with ``imread`` and used as the
            cloud's mask. Defaults to the previously hard-coded location.
        font_path: Font file handed to WordCloud. Defaults to the previously
            hard-coded ``'msyhl.ttc'``.
    """
    # Drop missing cells and coerce the rest to str before tokenizing.
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dmreal_string = process_text(dm_list)
    mask_image = imread(mask_path)

    # NOTE(review): per the wordcloud docs, width/height are ignored when a
    # mask is supplied; they are kept to match the original call exactly.
    wc = wordcloud.WordCloud(
        stopwords=stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path=font_path,
        mask=mask_image,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file(output_path)
|
||
|
|
||
|
def main():
    """Read the danmu spreadsheet, render the word cloud, and report success."""
    source_xlsx = 'E:/Crawler/output/Top8_Danmu.xlsx'
    target_png = '/output/danmu_dwordcloud.png'

    frame = pd.read_excel(source_xlsx, sheet_name='Sheet1')
    # Single-character words excluded from the cloud.
    ignored_words = {
        '我', '你', '他', '这', '个', '是', '的', '了',
        '啊', '吗', '吧', '就', '都', '不', '也',
    }
    wordcloud_generation(frame, ignored_words, target_png)
    print("词云图生成完成!")
|
||
|
|
||
|
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|