You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

62 lines
1.8 KiB

import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
import jieba
import jieba.analyse as analyse
import re
# Blue palette for the word cloud.
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random blue HSL colour string for one word.

    The signature follows the ``color_func`` callback passed to
    ``wordcloud.WordCloud``. Hue (210) and saturation (100%) are fixed; only
    the lightness varies, drawn from the integers 50..89.

    Args:
        word: Unused; accepted to fit the callback signature.
        font_size: Unused.
        position: Unused.
        orientation: Unused.
        random_state: Optional ``random.Random``-like object. When given, the
            lightness is drawn from it so that a seeded word cloud is
            reproducible; when ``None``, NumPy's global generator is used.
        **kwargs: Any further callback arguments; ignored.

    Returns:
        str: A colour such as ``"hsl(210, 100%, 73%)"``.
    """
    if random_state is None:
        # np.random.randint excludes the upper bound: 50..89.
        lightness = np.random.randint(50, 90)
    else:
        # random.Random.randint includes the upper bound: 50..89.
        lightness = random_state.randint(50, 89)
    return "hsl(210, 100%%, %d%%)" % lightness
# Normalise laughter: long runs of "哈" become the canonical "哈哈哈".
def normalize_hahaha(text):
    """Collapse every run of three or more "哈" into exactly "哈哈哈".

    Runs shorter than three characters and all other text are left
    untouched.

    Args:
        text: The string to normalise.

    Returns:
        str: ``text`` with each run of 3+ "哈" replaced by "哈哈哈".
    """
    # The quantifier needs the literal "哈" in front of it; a bare "{3,}"
    # has nothing to repeat and makes re raise an error.
    return re.sub(r'哈{3,}', '哈哈哈', text)
# Load the danmu data from sheet 'Sheet1' of the Excel workbook.
dm = pd.read_excel(io='All_Danmu.xlsx', sheet_name='Sheet1')
# Extended stop-word list: keywords found in this set are excluded from the
# word cloud.
# NOTE(review): the many empty-string entries look like single-character stop
# words whose text was lost -- presumably; confirm against the original list.
my_stopwords = {
    '', '', '', '', '', '', '', '', '', '', '', '', '',
    '不是',
    '',
    '哈哈哈',
    '', '', '', '', '', '', '',
    '呵呵',
    '',
    '嘿嘿',
    '哎呀',
    '', '', '',
}
# Word-cloud generation.
def wordcloud_generation(dm):
    """Render the top danmu keywords as a blue word cloud and save it.

    The text in ``dm['danmu']`` is normalised with ``normalize_hahaha``,
    ranked with jieba's TF-IDF keyword extractor, filtered through
    ``my_stopwords`` and drawn using ``OIP.jpg`` as the mask. The image is
    written to ``alldanmu_dwordcloud.png``.

    Args:
        dm: Table with a ``'danmu'`` column (presumably a pandas DataFrame;
            the module-level caller passes the result of ``pd.read_excel``).

    Returns:
        None. The result is the image file written to disk.
    """
    max_keywords = 100

    # Drop missing cells, coerce the rest to text and normalise laughter.
    dm_list = [
        normalize_hahaha(text)
        for text in dm['danmu'].dropna().astype(str).tolist()
    ]
    dm_string = ' '.join(dm_list)  # all danmu as one string

    # Extract keywords with TF-IDF. Extra candidates are requested so that
    # removing stop words can still leave ``max_keywords`` entries, and the
    # weights are kept so they can drive the font sizes.
    ranked = analyse.extract_tags(
        dm_string,
        topK=max_keywords + len(my_stopwords),
        withWeight=True,
        allowPOS=(),
    )

    # Keep the highest-ranked keywords that are not stop words.
    frequencies = {}
    for word, weight in ranked:
        if word in my_stopwords:
            continue
        frequencies[word] = weight
        if len(frequencies) == max_keywords:
            break

    img = imread("OIP.jpg")

    # Draw from the weights directly: joining the keywords into a string and
    # re-counting them would give every keyword a count of one and discard
    # the TF-IDF ranking.
    # NOTE(review): per the wordcloud docs, width/height are ignored when a
    # mask is supplied -- confirm the intended output size.
    wc = wordcloud.WordCloud(
        stopwords=my_stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=max_keywords,
        color_func=blue_color_func,
    ).generate_from_frequencies(frequencies)
    wc.to_file('alldanmu_dwordcloud.png')
# Generate and save the word cloud for the loaded danmu data.
wordcloud_generation(dm=dm)