You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

49 lines
1.7 KiB

"""
生成基于全部弹幕数据的词云图,并进行关键词提取和归一化处理
"""
import re
import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
from jieba import analyse
def blue_color_func(_=None, __=None, ___=None, ____=None, _random_state=None, **_kwargs):
    """Return a random blue HSL colour string for one word of the word cloud.

    Hue (210) and saturation (100%) are fixed; only the lightness varies and
    is drawn with ``np.random.randint(50, 90)``.

    Every argument is accepted and ignored. The positional parameters default
    to ``None`` so the function can be invoked the way ``wordcloud`` calls a
    ``color_func``: the word positionally and the remaining values
    (``font_size``, ``position``, ``orientation``, ``random_state``,
    ``font_path``) as keyword arguments, which land in ``**_kwargs``. Without
    those defaults such a call fails with ``TypeError`` because the remaining
    positional parameters are never supplied.

    Returns:
        str: A colour of the form ``"hsl(210, 100%, <lightness>%)"``.
    """
    # randint's upper bound is exclusive, so the lightness lies in 50..89.
    lightness = np.random.randint(50, 90)
    return f"hsl(210, 100%, {lightness}%)"
def normalize_hahaha(text):
    """Collapse every run of three or more "哈" into exactly "哈哈哈".

    Shorter runs (one or two "哈") and all other characters are left
    untouched, so differently long laughs count as the same token.

    Args:
        text (str): A single danmu comment.

    Returns:
        str: ``text`` with each long "哈" run replaced by "哈哈哈".
    """
    # The quantifier needs an atom to repeat; a bare '{3,}' makes re raise
    # "nothing to repeat" instead of matching anything.
    return re.sub(r'哈{3,}', '哈哈哈', text)
def wordcloud_generation(
    danmu_data,
    *,
    stopwords=None,
    mask_path="E:/Crawler/output/OIP.jpg",
    font_path='msyhl.ttc',
    output_path='E:/Crawler/output/alldanmu_dwordcloud.png',
):
    """Build a word cloud from the danmu keywords and save it as an image.

    The comments in the ``'danmu'`` column are normalised with
    ``normalize_hahaha``, joined into one string, and reduced to at most 100
    keywords by ``jieba.analyse.extract_tags``. Keywords found in the stopword
    set are dropped, and the rest are rendered with ``wordcloud.WordCloud``
    using ``blue_color_func`` for colouring.

    Args:
        danmu_data: Table with a ``'danmu'`` column (the caller passes the
            DataFrame returned by ``pd.read_excel``). Missing values are
            dropped and the rest converted to ``str``.
        stopwords: Words to exclude. Defaults to the module-level
            ``my_stopwords`` set, looked up at call time.
        mask_path: Image file used as the word-cloud mask.
        font_path: Font file handed to ``WordCloud``.
        output_path: Destination of the rendered image.

    Raises:
        ValueError: If no keyword is left after stopword filtering.
    """
    if stopwords is None:
        # Resolved at call time because the set is defined further down the
        # module, after this function.
        stopwords = my_stopwords

    comments = danmu_data['danmu'].dropna().astype(str).tolist()
    corpus = ' '.join(normalize_hahaha(comment) for comment in comments)

    keywords = analyse.extract_tags(corpus, topK=100, withWeight=False, allowPOS=())
    kept = [word for word in keywords if word not in stopwords]
    if not kept:
        # WordCloud.generate() rejects empty input with a ValueError as well;
        # fail early with a message that names the actual cause.
        raise ValueError("no keywords left after stopword filtering; nothing to plot")

    mask = imread(mask_path)
    cloud = wordcloud.WordCloud(
        stopwords=stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path=font_path,
        mask=mask,
        max_words=100,
        color_func=blue_color_func,
    ).generate(' '.join(kept))
    cloud.to_file(output_path)
# Stopwords dropped from the extracted keywords and also passed to WordCloud.
# NOTE(review): the empty-string entry presumably stands in for single-character
# stopwords that were lost from this copy of the file -- confirm and restore.
my_stopwords = {'', '不是', '哈哈哈', '呵呵', '嘿嘿', '哎呀'}

# Load the danmu sheet from the Excel export and render the word cloud.
dm = pd.read_excel(
    'E:/Crawler/output/All_Danmu.xlsx',
    sheet_name='Sheet1',
)
wordcloud_generation(dm)