You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
49 lines
1.7 KiB
49 lines
1.7 KiB
"""
|
|
Generate a word cloud from the complete danmaku (bullet-comment) data, with keyword extraction and normalization.
|
|
"""
|
|
|
|
import re
|
|
import pandas as pd
|
|
import numpy as np
|
|
import wordcloud
|
|
from matplotlib.image import imread
|
|
from jieba import analyse
|
|
|
|
def blue_color_func(_=None, __=None, ___=None, ____=None, _random_state=None, **_kwargs):
    """Return a random blue-toned HSL color string for one word-cloud word.

    Intended to be passed as ``color_func`` to ``wordcloud.WordCloud``.
    The hue (210) and saturation (100%) are fixed; only the lightness varies.

    WordCloud supplies everything except the word itself by keyword
    (``font_size=``, ``position=``, ``orientation=``, ``random_state=``,
    ``font_path=``).  The placeholder parameters here are not named after
    those keywords, so they must be optional: without defaults the call
    fails with ``TypeError`` for missing positional arguments.  All
    arguments are accepted and ignored.

    Returns:
        A string of the form ``"hsl(210, 100%, <lightness>%)"``.
    """
    # np.random.randint excludes the upper bound, so lightness is 50..89.
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"
|
|
|
|
def normalize_hahaha(text):
    """Collapse every run of three or more "哈" characters to exactly three."""
    laugh_run = re.compile(r'哈{3,}')
    return laugh_run.sub('哈哈哈', text)
|
|
|
|
def wordcloud_generation(
    danmu_data,
    *,
    stopwords=None,
    mask_path="E:/Crawler/output/OIP.jpg",
    output_path='E:/Crawler/output/alldanmu_dwordcloud.png',
    font_path='msyhl.ttc',
):
    """Build a word cloud from danmaku text and save it as an image.

    The ``'danmu'`` column is cleaned (missing cells dropped, values cast to
    ``str``, laughter runs normalized), joined into one string, and reduced
    to at most 100 keywords with ``jieba.analyse.extract_tags``.  Keywords
    that are stop words are discarded, and the remainder is rendered inside
    the mask image and written to ``output_path``.

    Args:
        danmu_data: Table exposing a ``'danmu'`` column; the script passes
            the DataFrame returned by ``pd.read_excel``.
        stopwords: Words to exclude.  Defaults to the module-level
            ``my_stopwords`` set.
        mask_path: Image file whose shape constrains where words are drawn.
        output_path: Destination file for the rendered word cloud.
        font_path: Font handed to WordCloud for rendering the words.

    Raises:
        ValueError: If no keyword survives stop-word filtering.  WordCloud
            would raise its own ValueError for an empty input anyway; this
            fails earlier, before the mask image is read.
    """
    max_keywords = 100

    if stopwords is None:
        # Fall back to the module-level stop-word set.
        stopwords = my_stopwords

    # Clean the raw comments before keyword extraction.
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dm_string = ' '.join(normalize_hahaha(text) for text in dm_list)

    keywords = analyse.extract_tags(
        dm_string, topK=max_keywords, withWeight=False, allowPOS=()
    )
    keywords = [word for word in keywords if word not in stopwords]
    if not keywords:
        raise ValueError(
            "No keywords left after stop-word filtering; cannot build a word cloud."
        )
    dmreal_string = ' '.join(keywords)

    img = imread(mask_path)

    # NOTE(review): the wordcloud docs state that width/height are ignored
    # when a mask is supplied; they are kept so behavior matches the original.
    wc = wordcloud.WordCloud(
        stopwords=stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path=font_path,
        mask=img,
        max_words=max_keywords,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file(output_path)
|
|
|
|
# Stop words filtered out of the extracted keywords and also handed to WordCloud.
my_stopwords = {
    '我', '你', '他', '这', '个', '是', '的', '了', '啊', '吗',
    '就', '都', '不是', '也', '哈哈哈', '吧', '呀', '哦', '呢', '哇',
    '么', '嘛', '呵呵', '呵', '嘿嘿', '哎呀', '哎', '哼', '呃',
}

# Load the collected danmaku sheet and render the word cloud from it.
dm = pd.read_excel('E:/Crawler/output/All_Danmu.xlsx', sheet_name='Sheet1')
wordcloud_generation(dm)
|