# Read data from the Excel workbook generated by the crawler and generate a word cloud

import configparser
import os
import sys
from collections import Counter
from multiprocessing import Pool

import PIL.Image  # import the submodule explicitly so PIL.Image.open is always available
import jieba
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import wordcloud
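# The imports above (apart from the standard-library ones) are third-party packages; if any
# are missing, they are usually installed from PyPI under these names (Pillow provides PIL):
#   pip install jieba matplotlib numpy openpyxl pandas wordcloud Pillow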
# Define a few parameters; see readme.md on GitHub for a detailed description
config_file = 'config/config.ini'
config_Section_Name = 'GC_DEFAULT'  # name of the config section to read
stop_Word = ['!', '!', ':', '*', ',', ',', '?', '《', '》',
             '。', ' ', '的', '了', '是', '啊', '吗', '吧', '这', '你', '我', '他', '就']  # stop-word list
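# For reference, a config/config.ini read by the __main__ block below might look like the
# following. The section and key names come from this script; the values shown are simply
# the fallbacks used further down, so treat this as an illustrative sketch, not the
# project's canonical config:
#   [GC_DEFAULT]
#   workbook_name = output/Top8_danmu.xlsx
#   sheet_Name = 所有弹幕
#   WC_Width = 1200
#   WC_Height = 1200
#   font_Path = config/msyh.ttc
#   pic_Path = config/m.png
#   output_Path = output/word_could.png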


def read_Danmu(workbook_Name, sheet_Name):  # read the danmu rows from the Excel workbook
    try:
        workbook = openpyxl.load_workbook(workbook_Name)
        worksheet = workbook[sheet_Name]  # the sheet could also be selected by index; the name is used for readability
        data = worksheet.iter_rows(values_only=True)
        return data
    # on any error, return an empty iterator instead
    except openpyxl.utils.exceptions.InvalidFileException:
        print(f"The input file path or format is wrong; please open {config_file} and reconfigure the path\n")
        return iter(())
    except KeyError:
        print(f"Wrong worksheet name; please check that the Sheet name matches the one in {config_file}\n")
        return iter(())
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print(f"An error occurred: {exc_type} - {exc_value}")
        return iter(())
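# Note: with openpyxl's default (non read_only) load, the whole workbook is parsed into
# memory, so the row generator returned above stays usable after the function returns.
# Each yielded row is a tuple of cell values; cut_words below assumes the layout
# (danmu_text, occurrence_count) produced by the crawler.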


def cut_words(row):
    try:
        # the first column of each row is the danmu text, the second is its occurrence count
        sentence = row[0]
        count = row[1]
        # segment the text with jieba, store the result in a Counter,
        # then scale each word's count by the danmu's occurrence count
        words = jieba.lcut(sentence)
        # drop words that appear in the stop-word list
        cut_Words = pd.Series(words)
        cut_Words = cut_Words[~cut_Words.isin(stop_Word)]
        # store the remaining words in a counter
        new_Counter = Counter(cut_Words.tolist())
        for item in new_Counter:
            new_Counter[item] *= count  # word occurrences = danmu occurrence count * occurrences of the word in the danmu
        return new_Counter
    except TypeError:
        return Counter()  # on malformed input, return an empty counter
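# Illustrative example (assuming jieba's default dictionary keeps these tokens whole):
#   cut_words(('你好 世界', 3))
# jieba.lcut would return something like ['你好', ' ', '世界']; the space is removed by the
# stop-word filter, and each remaining word's count is scaled by 3, giving roughly
# Counter({'你好': 3, '世界': 3}).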


def generate_Word_Cloud(counter):  # generate the word cloud image
    try:
        if not counter:  # if the counter is empty, report it and leave the function
            return "The input word frequencies are empty!"
        img = PIL.Image.open(pic_Path).convert('RGBA')  # convert to RGBA to avoid errors with grayscale mask images
        pic = np.array(img)
        image_colors = wordcloud.ImageColorGenerator(pic)
        # note: when a mask is supplied, wordcloud takes the canvas size from the mask image,
        # so WC_Width / WC_Height mainly act as a fallback
        word_Cloud = wordcloud.WordCloud(
            font_path=font_Path, mask=pic, width=WC_Width, height=WC_Height, mode="RGBA", background_color='white')
        word_Cloud.generate_from_frequencies(counter)
        plt.imshow(word_Cloud.recolor(color_func=image_colors),
                   interpolation='bilinear')
        word_Cloud.to_file(output_Path)
        plt.axis('off')
        plt.show()
        return f"Word cloud generation finished; see {output_Path}"
    except FileNotFoundError:  # pic_Path or font_Path is wrong
        return f"Wrong image or font path; please check {config_file}."
    except (TypeError, ValueError):  # WC_Width or WC_Height has the wrong type or value
        return f"The image Height and Width settings are invalid; please check {config_file}."
    except PIL.UnidentifiedImageError:
        return "This type of image is not supported; please change the image path."
    except Exception as e:
        return f"An error occurred while generating the word cloud: {e}"


def main():
    rows = read_Danmu(workbook_Name, sheet_Name)
    word_counts = Counter()
    # use a process pool (multiprocessing.Pool) to speed up segmentation; this mainly saves
    # time when the word cloud is generated from all danmu
    with Pool() as pool:
        cut_words_results = pool.map(cut_words, rows)
        for result in cut_words_results:
            word_counts.update(result)

    print(generate_Word_Cloud(word_counts))
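# Note: multiprocessing workers (re)import this module, so keeping only cheap definitions
# such as stop_Word at module level and loading the configuration under the __main__ guard
# below is what keeps the Pool usage safe on platforms that use the "spawn" start method
# (e.g. Windows).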


if __name__ == "__main__":
    # read the parameter configuration
    config = configparser.ConfigParser()
    if not os.path.exists(config_file):
        print(f"The configuration file {config_file} does not exist!")
        sys.exit(1)
    with open(config_file, encoding='utf-8') as f:
        config.read_file(f)

    workbook_Name = config.get(config_Section_Name, 'workbook_name',
                               fallback='output/Top8_danmu.xlsx')  # name of the Excel workbook to read; defaults to the file generated by crawler.py
    # name of the worksheet to read; choose from ['Top 8', '所有弹幕']
    sheet_Name = config.get(config_Section_Name, 'sheet_Name', fallback='所有弹幕')
    WC_Width = config.getint(
        config_Section_Name, 'WC_Width', fallback=1200)  # word cloud width
    WC_Height = config.getint(
        config_Section_Name, 'WC_Height', fallback=1200)  # word cloud height
    font_Path = config.get(config_Section_Name, 'font_Path',
                           fallback="config/msyh.ttc")  # path to the font file
    pic_Path = config.get(config_Section_Name, 'pic_Path',
                          fallback="config/m.png")  # path to the word cloud mask image
    output_Path = config.get(
        config_Section_Name, 'output_Path', fallback="output/word_could.png")  # path where the generated word cloud is saved
    main()