parent
6258d204d1
commit
8d22d0402e
@ -0,0 +1,61 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import wordcloud
|
||||
from matplotlib.image import imread
|
||||
import jieba
|
||||
import jieba.analyse as analyse
|
||||
import re
|
||||
|
||||
|
||||
# 定义蓝色调色板
|
||||
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a blue HSL colour string with a randomly chosen lightness.

    The signature follows the ``color_func`` callback expected by
    ``wordcloud.WordCloud``; ``word``, ``font_size``, ``position``,
    ``orientation`` and any extra keyword arguments are accepted but unused.

    Args:
        word: The word being coloured (unused).
        font_size: Font size of the word (unused).
        position: Position of the word (unused).
        orientation: Orientation of the word (unused).
        random_state: Optional ``random.Random``-compatible generator. When
            given, it is the only source of randomness, so a seeded generator
            yields reproducible colours. When ``None``, NumPy's global
            generator is used.
        **kwargs: Ignored; present so extra callback arguments do not fail.

    Returns:
        A string of the form ``"hsl(210, 100%, L%)"`` where ``L`` is an
        integer in the range 50..89.
    """
    if random_state is None:
        # numpy's randint excludes the upper bound: 50..89.
        lightness = np.random.randint(50, 90)
    else:
        # random.Random.randint includes the upper bound: 50..89.
        lightness = random_state.randint(50, 89)
    return f"hsl(210, 100%, {int(lightness)}%)"
|
||||
|
||||
|
||||
# 归一化“哈哈哈”
|
||||
def normalize_hahaha(text):
    """Collapse each run of three or more laughter characters to exactly three.

    Runs shorter than three characters and all other text are left unchanged.
    """
    long_laugh = re.compile(r'哈{3,}')
    canonical_laugh = '哈哈哈'
    return long_laugh.sub(canonical_laugh, text)
|
||||
|
||||
|
||||
# 将数据导入
|
||||
# Load the 'Sheet1' worksheet of the danmu workbook at import time; the
# resulting table is passed to wordcloud_generation() at the bottom of the file.
dm = pd.read_excel('All_Danmu.xlsx', sheet_name='Sheet1')
|
||||
|
||||
# 扩展停用词列表
|
||||
# Extended stop-word set: common function words and interjections that are
# filtered out of the extracted keywords and also handed to WordCloud.
my_stopwords = {
    '我', '你', '他', '这', '个', '是', '的', '了',
    '啊', '吗', '吧', '就', '都', '不是', '也', '哈哈哈',
    '呀', '哦', '呢', '哇', '么', '嘛', '呵呵', '呵',
    '嘿嘿', '哎呀', '哎', '哼', '呃',
}
|
||||
|
||||
|
||||
# 词云图生成
|
||||
def wordcloud_generation(dm, *, top_k=100, mask_path="OIP.jpg", font_path='msyhl.ttc',
                         output_path='alldanmu_dwordcloud.png'):
    """Render the most significant danmu keywords as a word cloud image.

    The ``danmu`` column is normalised with ``normalize_hahaha``, ranked with
    jieba's TF-IDF keyword extractor, filtered against ``my_stopwords`` and
    drawn with every word sized by its TF-IDF weight.

    Args:
        dm: Table with a ``danmu`` column (the module passes the DataFrame
            returned by ``pd.read_excel``). Missing values are dropped and
            the remaining values are converted to ``str``.
        top_k: Number of keywords requested from the extractor and the
            maximum number of words drawn. Stop words are removed after
            extraction, so fewer words may end up in the image.
        mask_path: Image file used as the word cloud mask.
        font_path: Font file handed to ``WordCloud``.
        output_path: File the rendered image is written to.

    Raises:
        KeyError: If ``dm`` has no ``danmu`` column.
        ValueError: If no keyword remains after stop-word filtering.
    """
    danmu_texts = dm['danmu'].dropna().astype(str).tolist()

    # Collapse long laughter runs before extraction, then build one corpus string.
    danmu_corpus = ' '.join(normalize_hahaha(text) for text in danmu_texts)

    # Keep the TF-IDF weights. Joining bare keywords into a string and calling
    # generate() would give every word a frequency of 1 and let WordCloud's
    # own tokenizer drop single-character words.
    weighted_keywords = analyse.extract_tags(
        danmu_corpus, topK=top_k, withWeight=True, allowPOS=()
    )
    frequencies = {
        word: weight
        for word, weight in weighted_keywords
        if word not in my_stopwords
    }
    if not frequencies:
        raise ValueError('No keywords left to draw after stop-word filtering')

    mask_image = imread(mask_path)

    # NOTE: per the WordCloud documentation, width/height are ignored when a
    # mask is supplied; they are kept to preserve the original configuration.
    cloud = wordcloud.WordCloud(
        stopwords=my_stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path=font_path,
        mask=mask_image,
        max_words=top_k,
        color_func=blue_color_func,
    ).generate_from_frequencies(frequencies)
    cloud.to_file(output_path)
|
||||
|
||||
|
||||
# 调用词云生成
|
||||
# Script entry: build the word cloud from the danmu table loaded at module
# level and write it to disk.
wordcloud_generation(dm)
|
Loading…
Reference in new issue