diff --git a/b_wordcloud.py b/b_wordcloud.py
new file mode 100644
index 0000000..d842216
--- /dev/null
+++ b/b_wordcloud.py
@@ -0,0 +1,61 @@
+import pandas as pd
+import numpy as np
+import wordcloud
+from matplotlib.image import imread
+import jieba
+import jieba.analyse as analyse
+import re
+
+
+# Blue color palette: random lightness in HSL, fixed blue hue
+def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
+    return "hsl(210, 100%%, %d%%)" % np.random.randint(50, 90)
+
+
+# Collapse runs of three or more "哈" into a single "哈哈哈"
+def normalize_hahaha(text):
+    return re.sub(r'哈{3,}', '哈哈哈', text)
+
+
+# Load the danmu data
+dm = pd.read_excel('All_Danmu.xlsx', sheet_name='Sheet1')
+
+# Extended stopword list
+my_stopwords = set(['我', '你', '他', '这', '个', '是', '的', '了', '啊', '吗', '吧', '就', '都', '不是', '也', '哈哈哈',
+                    '呀', '哦', '呢', '哇', '么', '嘛', '呵呵', '呵', '嘿嘿', '哎呀', '哎', '哼', '呃'])
+
+
+# Generate the word cloud image
+def wordcloud_generation(dm):
+    dm_list = dm['danmu'].dropna().astype(str).tolist()
+
+    # Normalize repeated laughter before keyword extraction
+    dm_list = [normalize_hahaha(text) for text in dm_list]
+    dm_string = ' '.join(dm_list)  # all danmu joined into one string
+
+    # Extract keywords with TF-IDF
+    keywords = analyse.extract_tags(dm_string, topK=100, withWeight=False, allowPOS=())
+
+    # Drop stopwords from the extracted keywords
+    keywords = [word for word in keywords if word not in my_stopwords]
+
+    # Join the keywords into one string for WordCloud.generate()
+    dmreal_string = ' '.join(keywords)
+
+    img = imread("OIP.jpg")
+    # Build and render the word cloud
+    wc = wordcloud.WordCloud(
+        stopwords=my_stopwords,
+        width=1920,
+        height=1200,
+        background_color='white',
+        font_path='msyhl.ttc',
+        mask=img,
+        max_words=100,
+        color_func=blue_color_func,
+    ).generate(dmreal_string)
+    wc.to_file('alldanmu_dwordcloud.png')
+
+
+# Run the generation
+wordcloud_generation(dm)
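
Note on the keyword step: extract_tags is called with withWeight=False, so the TF-IDF scores are discarded and WordCloud.generate() sizes words by re-counting the space-joined keyword string, where each keyword appears exactly once. If relative weighting matters, the scores can be kept and passed to generate_from_frequencies() instead. A minimal sketch, not part of the diff; the helper name wordcloud_from_weights and its parameters are illustrative only:

    import jieba.analyse as analyse
    import wordcloud

    # Sketch: size words by their TF-IDF score instead of re-tokenized counts.
    # Assumes the same dm_string, stopword set, and mask image as b_wordcloud.py.
    def wordcloud_from_weights(dm_string, stopwords, mask_img):
        # withWeight=True returns (keyword, score) pairs
        pairs = analyse.extract_tags(dm_string, topK=100, withWeight=True)
        freqs = {word: score for word, score in pairs if word not in stopwords}
        wc = wordcloud.WordCloud(
            width=1920,
            height=1200,
            background_color='white',
            font_path='msyhl.ttc',
            mask=mask_img,
            max_words=100,
        )
        return wc.generate_from_frequencies(freqs)

With this variant the stopwords= and color_func= arguments could be carried over unchanged; they are omitted above only to keep the sketch short.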