# AAA/wordcloud_visualizer.py

import matplotlib.pyplot as plt
import jieba
import numpy as np
from wordcloud import WordCloud
from PIL import Image

# Point matplotlib at the SimHei font family so Chinese text can be displayed in figures.
plt.rcParams["font.family"] = ["SimHei"]


class WordCloudVisualizer:
    """Render a word cloud from a list of danmaku (bullet-comment) strings.

    The text is segmented with jieba, filtered by stopwords / length /
    frequency, and drawn with ``wordcloud.WordCloud``, optionally shaped by a
    mask image in which only (near-)white pixels are left empty.
    """

    def __init__(self):
        # Extra filter words: common but meaningless vocabulary in danmaku text.
        self.extra_filter = {"视频", "UP主", "弹幕", "这里", "一下", "大家", "今天", "这个", "什么"}

    def generate(self, danmukus, stopwords,
                 min_length=2,
                 min_freq=3,
                 filename="弹幕词云.png",
                 mask_image="yun.png",
                 background_color="white",
                 max_words=200,
                 colormap="viridis",
                 white_threshold=245,
                 font_path="C:/Windows/Fonts/simhei.ttf"):
        """
        Build, display and save a word cloud; text is drawn only on non-white mask pixels.

        :param danmukus: iterable of danmaku strings
        :param stopwords: collection of stopwords to drop
        :param min_length: minimum token length (shorter tokens are dropped)
        :param min_freq: minimum occurrence count (rarer tokens are dropped)
        :param filename: path the rendered figure is saved to
        :param mask_image: path of the shape mask image; falsy for a plain rectangle
        :param background_color: background colour of the cloud
        :param max_words: maximum number of words drawn
        :param colormap: matplotlib colormap name used for the words
        :param white_threshold: 0-255; a pixel whose R, G and B are all >= this
            value counts as white and receives no text
        :param font_path: path of a font able to render Chinese glyphs
        :return: None. Prints a notice and returns early when no word survives filtering.
        """
        # 1. Segment and count.
        word_counts = self._count_words(danmukus, stopwords, min_length)

        # 2. Frequency filter. Keep the real counts: joining the de-duplicated
        #    words into a string would give every word a count of 1 and make
        #    all words the same size in the cloud.
        frequencies = {
            word: count for word, count in word_counts.items() if count >= min_freq
        }
        if not frequencies:
            print("过滤后无有效词汇，无法生成词云")
            return

        # 3. Shape mask (None means a plain rectangle).
        mask = self._load_mask(mask_image, white_threshold)

        # 4. Render. The canvas follows the mask size when a mask is present,
        #    otherwise it defaults to 1200x800.
        wc = WordCloud(
            font_path=font_path,
            background_color=background_color,
            width=mask.shape[1] if mask is not None else 1200,
            height=mask.shape[0] if mask is not None else 800,
            max_words=max_words,
            collocations=False,
            colormap=colormap,
            mask=mask,
        )
        # generate_from_frequencies bypasses WordCloud's own tokenizer and
        # stopword pass, so the filters applied above are the only ones in effect.
        wc.generate_from_frequencies(frequencies)

        # 5. Show and save the figure (dpi=300 for a sharp output file).
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.show()
        print(f"词云图保存: {filename}")
        # The statistics use the counts from before the min_freq filter.
        self.show_word_stats(word_counts)

    def _count_words(self, danmukus, stopwords, min_length):
        """Segment the joined danmaku text with jieba and count the tokens that pass the filters."""
        from collections import Counter

        tokens = jieba.cut(" ".join(danmukus))
        return Counter(
            token for token in tokens
            if len(token) >= min_length
            # Whitespace-only tokens would otherwise slip through when min_length <= 1.
            and token.strip()
            and token not in stopwords
            and token not in self.extra_filter
        )

    @staticmethod
    def _load_mask(mask_image, white_threshold):
        """Return a uint8 mask (255 = no text, 0 = text allowed), or None to use a rectangle."""
        if not mask_image:
            return None
        try:
            # The context manager closes the underlying file once the pixels are copied.
            with Image.open(mask_image) as img:
                # Normalise to three channels so palette, 1-bit, grayscale and
                # alpha-carrying images all go through the same comparison.
                pixels = np.array(img.convert("RGB"))
            print(f"已加载图片: {mask_image}")

            # White means every one of R, G and B reaches the threshold.
            is_pure_white = (pixels >= white_threshold).all(axis=2)
            mask = np.where(is_pure_white, 255, 0).astype(np.uint8)
            print("已处理图片，非白色区域可写字")
            return mask
        except Exception as e:
            # Best effort: any failure while loading falls back to a rectangular cloud.
            print(f"加载图片失败: {e}，将使用矩形词云")
            return None

    def show_word_stats(self, word_counts, top_n=10):
        """Print the ``top_n`` most frequent words of ``word_counts`` (a Counter) with their counts."""
        top_words = word_counts.most_common(top_n)
        print(f"\n前{top_n}个高频词:")
        for i, (word, count) in enumerate(top_words, 1):
            print(f"{i}. {word}: {count}次")

    def add_filter_words(self, words):
        """Add one word, or a list/tuple/set/frozenset of words, to the extra filter set."""
        if isinstance(words, (list, tuple, set, frozenset)):
            # Batch input: add each contained word individually.
            self.extra_filter.update(words)
        else:
            # Anything else (typically a single str) is added as one entry.
            self.extra_filter.add(words)
        print(f"已添加过滤词: {words}")