ADD file via upload

1 year ago · bcd70c66aa
parent 86f10e902b
commit bcd70c66aa
1 changed files with 121 additions and 0 deletions
--- a/generate_Cloud.py
+++ b/generate_Cloud.py
@ -0,0 +1,121 @@
+# 从爬虫生成的Excel表格中读取数据并生成词云图
+import configparser
+import os
+import sys
+from collections import Counter
+from multiprocessing import Pool
+import PIL
+import jieba
+import matplotlib.pyplot as plt
+import numpy as np
+import openpyxl
+import pandas as pd
+import wordcloud
+
+# 定义一些参数，参数的详细介绍见GitHub上的readme.md
+config_file = 'config/config.ini'
+config_Section_Name = 'GC_DEFAULT'  # 要读取的配置页名
+stop_Word = ['!', '！', ':', '*', '，', ',', '？', '《', '》',
+             '。', ' ', '的', '了', '是', '啊', '吗', '吧', '这', '你', '我', '他', '就']  # 停用词表
+
+
+def read_Danmu(workbook_Name, sheet_Name):  # 从Excel表中读取数据
+    try:
+        workbook = openpyxl.load_workbook(workbook_Name)
+        worksheet = workbook[sheet_Name]  # 当然也可以通过索引读sheet,为了可读性选择用名称
+        data = worksheet.iter_rows(values_only=1)
+        return data
+    # 若报错，则返回空迭代器
+    except openpyxl.utils.exceptions.InvalidFileException:
+        print(f"输入文件的路径或格式错误，请打开{config_file}文件重新配置路径\n")
+        return iter(())
+    except KeyError:
+        print(f"工作表页名错误，请检查Sheet的名字和{config_file}中是否一致\n")
+        return iter(())
+    except:
+        exc_type, exc_value, exc_traceback = sys.exc_info()
+        print(f"发生错误: {exc_type} - {exc_value}")
+        return iter(())
+
+
+def cut_words(row):
+    try:
+        # 每行第一列是弹幕，第二列是出现次数
+        sentence = row[0]
+        count = row[1]
+        # 运用jieba 进行分词，将结果储存在Counter中，再将其中词语的出现次数翻count倍
+        words = jieba.lcut(sentence)
+        # 去除停用词表中的词
+        cut_Words = pd.Series(words)
+        cut_Words = cut_Words[~cut_Words.isin(stop_Word)]
+        # 将分词存入计数器中
+        new_Counter = Counter(cut_Words.tolist())
+        for item in new_Counter:
+            new_Counter[item] *= count  # 弹幕中词语出现数 = 弹幕出现次数*弹幕中词语出现次数
+        return new_Counter
+    except TypeError:
+        return Counter()  # 遇见异常输入的情况，返回空计数器。
+
+
+def generate_Word_Cloud(counter):  # 生成词云图
+    try:
+        if not counter:  # 如果计数器对象为空，则给出提示并退出函数
+            return "输入的词频为空！"
+        img = PIL.Image.open(pic_Path).convert('RGBA')  # 解决灰度图像ERROR
+        pic = np.array(img)
+        image_colors = wordcloud.ImageColorGenerator(pic)
+        word_Cloud = wordcloud.WordCloud(
+            font_path=font_Path, mask=pic, width=WC_Width, height=WC_Height, mode="RGBA", background_color='white')
+        word_Cloud.generate_from_frequencies(counter)
+        plt.imshow(word_Cloud.recolor(color_func=image_colors),
+                   interpolation='bilinear')
+        word_Cloud.to_file(output_Path)
+        plt.axis('off')
+        plt.show()
+        return f"词云图生成完成，请前往{output_Path}查看"
+    except FileNotFoundError:  # pic_Path 或 font_Path错误的情况
+        return f"图片或字体路径错误，请前往{config_file}核查。"
+    except TypeError or ValueError:  # WC_Width 或WC_Height类型或数组错误的情况
+        return f"图片的Height与Width设置有误，请前往{config_file}核查。"
+    except PIL.UnidentifiedImageError:
+        return f"不支持该类型的图片，请修改图片路径。"
+    except Exception as e:
+        return f"生成词云图时发生错误：{e}"
+
+
+def main():
+    rows = read_Danmu(workbook_Name, sheet_Name)
+    word_counts = Counter()
+    # 利用线程池优化分词速度，在生成所有弹幕的词云图是能节省时间
+    with Pool() as pool:
+        cut_words_results = pool.map(cut_words, rows)
+        for result in cut_words_results:
+            word_counts.update(result)
+
+    print(generate_Word_Cloud(word_counts))
+
+
+if __name__ == "__main__":
+    # 读取参数的配置
+    config = configparser.ConfigParser()
+    if not os.path.exists(config_file):
+        print(f"配置文件 {config_file} 不存在！")
+        exit(1)
+    with open(config_file, encoding='utf-8') as f:
+        config.read_file(f)
+
+    workbook_Name = config.get(config_Section_Name, 'workbook_name',
+                               fallback='output/Top8_danmu.xlsx')  # 要读取的Excel表的名称，默认为crawler.py生成的文件
+    # 要读取的Excel表的页的名称，可从['Top 8', '所有弹幕']中选择
+    sheet_Name = config.get(config_Section_Name, 'sheet_Name', fallback='所有弹幕')
+    WC_Width = config.getint(
+        config_Section_Name, 'WC_Width', fallback=1200)  # 词云图的宽度
+    WC_Height = config.getint(
+        config_Section_Name, 'WC_Height', fallback=1200)  # 词云图的高度
+    font_Path = config.get(config_Section_Name, 'font_Path',
+                           fallback="config/msyh.ttc")  # 字体存储路径
+    pic_Path = config.get(config_Section_Name, 'pic_Path',
+                          fallback="config/m.png")  # 词云背景图路径
+    output_Path = config.get(
+        config_Section_Name, 'output_Path', fallback="output/word_could.png")
+    main()