# Crawler/generate_bullet_wordcloud.py

import collections
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import numpy as np


def read_comments(file_path):
    """Read the bullet-comment file and return one stripped string per line.

    Args:
        file_path: Path of a UTF-8 encoded text file with one comment per
            line.

    Returns:
        A list with one entry per line of the file, in file order, with
        leading and trailing whitespace (including the newline) removed.
        Blank lines are kept as empty strings.
    """
    # 'utf-8-sig' decodes plain UTF-8 unchanged but also drops a leading
    # byte-order mark, so the first comment does not start with U+FEFF when
    # the file was saved as "UTF-8 with BOM".
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return [line.strip() for line in file]


def count_words(comments):
    """Count how often each whitespace-separated token occurs.

    Args:
        comments: Iterable of comment strings.

    Returns:
        A ``collections.Counter`` mapping each token to the number of times
        it appears across all comments.
    """
    # str.split() with no argument splits on runs of whitespace only, so a
    # comment written without spaces is counted as one single token.
    return collections.Counter(
        token for text in comments for token in text.split()
    )


def generate_wordcloud(word_frequencies, output_file, mask_image=None,
                       font_path='C:/Windows/Fonts/simhei.ttf'):
    """Render a word cloud, save it to ``output_file`` and display it.

    Args:
        word_frequencies: Mapping of word -> frequency, passed to
            ``WordCloud.generate_from_frequencies``.
        output_file: Path the rendered image is written to.
        mask_image: Optional mask forwarded to ``WordCloud(mask=...)``;
            ``None`` means no custom shape is used.
        font_path: Font file used to draw the words. Defaults to the SimHei
            font at its usual Windows location; pass a different path on
            systems where that file does not exist.

    Raises:
        ValueError: If ``word_frequencies`` is empty.
    """
    # Fail early with a message that points at the input instead of letting
    # the rendering step fail on an empty frequency table.
    if not word_frequencies:
        raise ValueError(
            "word_frequencies is empty; there is nothing to draw"
        )

    wordcloud = WordCloud(
        width=1600,  # high-resolution canvas
        height=800,
        background_color='white',
        max_words=150,  # cap the word count to limit crowding
        min_font_size=10,  # keep the smallest words legible
        relative_scaling=0.5,  # how strongly frequency affects font size
        colormap='cool',  # cool-toned colour scheme
        mask=mask_image,
        contour_width=3,
        contour_color='steelblue',
        font_path=font_path,
    ).generate_from_frequencies(word_frequencies)

    # Save before showing: plt.show() may block until the window is closed,
    # and the file should exist even if the display step fails.
    wordcloud.to_file(output_file)
    print(f"词云图已保存至 {output_file}")

    # Display the rendered image without axes.
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


# Script body: read the comment file, count tokens, then render the cloud.
# NOTE(review): these statements run at module level, so they also execute
# if this file is imported; no `__main__` guard is visible here.
file_path = "弹幕.txt"  # input file holding the bullet comments
output_wordcloud_file = "bullet_wordcloud.png"  # output image file name

# Optional custom shape for the cloud; None means no custom shape.
mask_image = None

# Load the comments, then build the frequency table from them.
comments = read_comments(file_path)
word_frequencies = count_words(comments)

# Render, save and display the word cloud.
generate_wordcloud(
    word_frequencies,
    output_wordcloud_file,
    mask_image=mask_image,
)