feat:生成词云图（所有弹幕版）

10 months ago · 8d22d0402e
parent 6258d204d1
commit 8d22d0402e
1 changed files with 61 additions and 0 deletions
--- a/b_wordcloud.py
+++ b/b_wordcloud.py
@ -0,0 +1,61 @@
+import pandas as pd
+import numpy as np
+import wordcloud
+from matplotlib.image import imread
+import jieba
+import jieba.analyse as analyse
+import re
+
+
+# Blue palette used to color the words of the cloud.
+def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
+    """Pick a random shade of blue for one word of the cloud.
+
+    The function is handed to ``wordcloud.WordCloud`` as its ``color_func``;
+    only ``random_state`` influences the result.
+
+    Args:
+        word: The word being drawn (unused).
+        font_size: Font size chosen for the word (unused).
+        position: Placement of the word (unused).
+        orientation: Orientation of the word (unused).
+        random_state: Optional ``random.Random``-like object exposing
+            ``randint(a, b)`` with an inclusive upper bound. When given, the
+            lightness is drawn from it, so a seeded generator produces
+            reproducible colors. When ``None``, NumPy's global generator is
+            used instead.
+        **kwargs: Extra keyword arguments are accepted and ignored.
+
+    Returns:
+        An HSL color string with hue 210, saturation 100% and a lightness
+        between 50% and 89% inclusive.
+    """
+    if random_state is not None:
+        # random.Random.randint includes its upper bound, hence 89 (not 90)
+        # to keep the same 50..89 range as the NumPy fallback below.
+        lightness = random_state.randint(50, 89)
+    else:
+        # numpy.random.randint excludes the upper bound: values are 50..89.
+        lightness = np.random.randint(50, 90)
+    return "hsl(210, 100%%, %d%%)" % lightness
+
+
+# Normalize long laughter runs to a single canonical form.
+def normalize_hahaha(text):
+    """Collapse every run of three or more '哈' in ``text`` to exactly three.
+
+    Runs shorter than three characters are left untouched.
+    """
+    laugh_run = re.compile(r'哈{3,}')
+    return laugh_run.sub('哈哈哈', text)
+
+
+# Load the danmu spreadsheet into a DataFrame.
+dm = pd.read_excel(
+    io='All_Danmu.xlsx',
+    sheet_name='Sheet1',
+)
+
+# Extended stop-word set: pronouns, particles and interjections that should
+# never appear in the cloud.
+my_stopwords = {
+    '我', '你', '他', '这', '个', '是', '的', '了', '啊', '吗',
+    '吧', '就', '都', '不是', '也', '哈哈哈',
+    '呀', '哦', '呢', '哇', '么', '嘛', '呵呵', '呵', '嘿嘿',
+    '哎呀', '哎', '哼', '呃',
+}
+
+
+def _top_keyword_weights(texts, top_k):
+    """Map the top_k TF-IDF keywords of texts to their weights, minus stop words."""
+    corpus = ' '.join(normalize_hahaha(text) for text in texts)
+    # withWeight=True makes jieba return (word, weight) pairs, so the ranking
+    # computed by TF-IDF can drive the word sizes instead of being thrown away.
+    weighted = analyse.extract_tags(corpus, topK=top_k, withWeight=True, allowPOS=())
+    return {word: weight for word, weight in weighted if word not in my_stopwords}
+
+
+# Word cloud generation.
+def wordcloud_generation(dm, mask_path="OIP.jpg", font_path='msyhl.ttc',
+                         output_path='alldanmu_dwordcloud.png', top_k=100):
+    """Render a blue word cloud of the most significant danmu keywords.
+
+    The non-null values of the ``danmu`` column are normalized with
+    ``normalize_hahaha``, ranked with jieba's TF-IDF keyword extraction,
+    filtered against ``my_stopwords`` and drawn with each keyword sized by
+    its TF-IDF weight. The image is written to ``output_path``.
+
+    Args:
+        dm: DataFrame holding the danmu texts in a ``danmu`` column.
+        mask_path: Image file whose shape is used as the cloud mask.
+        font_path: Font file used to draw the words.
+        output_path: Destination of the generated image.
+        top_k: Number of keywords requested from the TF-IDF extractor and
+            upper bound on the words drawn.
+
+    Raises:
+        KeyError: If ``dm`` has no ``danmu`` column.
+        ValueError: If no keyword is left after stop-word filtering.
+    """
+    danmu_texts = dm['danmu'].dropna().astype(str).tolist()
+
+    frequencies = _top_keyword_weights(danmu_texts, top_k)
+    if not frequencies:
+        # Fail with a message that names the real cause rather than letting
+        # the word cloud library complain about an empty input.
+        raise ValueError("no keywords left to draw after stop-word filtering")
+
+    mask_image = imread(mask_path)
+    # NOTE(review): per the wordcloud documentation, width/height are ignored
+    # when a mask is supplied; they are kept as the original passed them.
+    # The `stopwords` option is not passed because it only applies to
+    # generate()/process_text(); the keywords are already filtered above.
+    cloud = wordcloud.WordCloud(
+        width=1920,
+        height=1200,
+        background_color='white',
+        font_path=font_path,
+        mask=mask_image,
+        max_words=top_k,
+        color_func=blue_color_func,
+    )
+    # Feed the weights directly so the more significant keywords are drawn larger.
+    cloud.generate_from_frequencies(frequencies)
+    cloud.to_file(output_path)
+
+
+# Build and save the word cloud for the loaded danmu data.
+wordcloud_generation(dm=dm)