You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
AAA/wordcloud_visualizer.py

138 lines
6.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import matplotlib.pyplot as plt
import jieba
import numpy as np
from wordcloud import WordCloud
from PIL import Image
# Configure matplotlib to use the SimHei font so that Chinese text can be displayed
plt.rcParams["font.family"] = ["SimHei"]
class WordCloudVisualizer:
    """Render a word cloud from danmaku (bullet-comment) text.

    The text is segmented with jieba, filtered by stopwords, word length and
    frequency, and drawn with ``wordcloud.WordCloud``. An optional mask image
    restricts drawing to the image's non-white area.
    """

    # Font used when ``generate`` is called without ``font_path``. It must be a
    # font file able to render Chinese characters.
    DEFAULT_FONT_PATH = "C:/Windows/Fonts/simhei.ttf"

    def __init__(self):
        # Extra filter words: frequent but meaningless words in danmaku text.
        self.extra_filter = {"视频", "UP主", "弹幕", "这里", "一下", "大家", "今天", "这个", "什么"}

    def generate(self, danmukus, stopwords,
                 min_length=2,
                 min_freq=3,
                 filename="弹幕词云.png",
                 mask_image="yun.png",
                 background_color="white",
                 max_words=200,
                 colormap="viridis",
                 white_threshold=245,
                 font_path=None):
        """Generate, display and save a word cloud for the given danmaku.

        Only pure-white areas of the mask image are left blank; every
        non-white area (black / grey / coloured) may receive words.

        :param danmukus: iterable of danmaku strings (``None`` is treated as empty)
        :param stopwords: collection of stopwords to drop (``None`` means none)
        :param min_length: words shorter than this are dropped
        :param min_freq: words occurring fewer times than this are dropped
        :param filename: path the rendered figure is saved to
        :param mask_image: path of the shape mask image; falsy disables the mask
        :param background_color: background colour of the word cloud
        :param max_words: maximum number of words drawn
        :param colormap: matplotlib colormap name used for word colours
        :param white_threshold: 0-255; a pixel whose channels are all >= this
            value counts as white and receives no text
        :param font_path: font file used for rendering; defaults to
            ``DEFAULT_FONT_PATH``
        :return: ``None``. Prints a notice and returns early when no word
            survives the filters.
        """
        # 1. Segment the text and count the words that pass the filters.
        word_counts = self._count_words(danmukus, stopwords, min_length)

        # 2. Frequency filter. Keep the counts: the word cloud sizes words by
        #    frequency, so passing only the unique words would flatten them all
        #    to the same weight.
        frequencies = {
            word: count
            for word, count in word_counts.items()
            if count >= min_freq
        }
        if not frequencies:
            print("过滤后无有效词汇,无法生成词云")
            return

        # 3. Build the mask: white pixels are blocked, all others are drawable.
        mask = self._load_mask(mask_image, white_threshold) if mask_image else None

        # 4. Build the word cloud; canvas size follows the mask when present.
        wc = WordCloud(
            font_path=font_path or self.DEFAULT_FONT_PATH,
            background_color=background_color,
            width=mask.shape[1] if mask is not None else 1200,
            height=mask.shape[0] if mask is not None else 800,
            max_words=max_words,
            collocations=False,
            colormap=colormap,
            mask=mask,
        )
        wc.generate_from_frequencies(frequencies)

        # 5. Show and save the figure (dpi=300 keeps the saved image sharp).
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.show()
        print(f"词云图保存: {filename}")

        # Report the most frequent words (counted before the min_freq filter).
        self.show_word_stats(word_counts)

    def _count_words(self, danmukus, stopwords, min_length):
        """Segment all danmaku with jieba and count the words that pass the filters."""
        from collections import Counter

        all_text = " ".join(danmukus if danmukus is not None else ())
        if not all_text.strip():
            # Nothing to segment.
            return Counter()

        # Merge both filter lists into one set, built once, so each membership
        # test is O(1) even when ``stopwords`` is passed as a list.
        blocked = set(stopwords) if stopwords is not None else set()
        blocked |= self.extra_filter

        return Counter(
            word
            for word in jieba.cut(all_text)
            # ``word.strip()`` drops whitespace-only tokens, which would
            # otherwise slip through when ``min_length`` is 1 or less.
            if len(word) >= min_length and word.strip() and word not in blocked
        )

    def _load_mask(self, mask_image, white_threshold):
        """Load the mask image and return a uint8 mask, or ``None`` on failure."""
        try:
            # The context manager closes the file; converting to RGB makes
            # palette, grey+alpha and 1-bit images thresholdable like RGB ones.
            with Image.open(mask_image) as img:
                pixels = np.array(img.convert("RGB"))
            print(f"已加载图片: {mask_image}")
            mask = self._mask_from_pixels(pixels, white_threshold)
            print("已处理图片,非白色区域可写字")
            return mask
        except Exception as e:
            # Deliberate best effort: any failure falls back to a rectangular cloud.
            print(f"加载图片失败: {e},将使用矩形词云")
            return None

    @staticmethod
    def _mask_from_pixels(pixels, white_threshold):
        """Map a pixel array to a mask: 255 where white (no text), 0 elsewhere."""
        if pixels.ndim == 3:
            # Colour image: white means R, G and B are all >= the threshold.
            # Any further channel (e.g. alpha) is ignored.
            is_pure_white = (pixels[:, :, :3] >= white_threshold).all(axis=2)
        else:
            # Single-channel image: white means the grey value is >= the threshold.
            is_pure_white = pixels >= white_threshold
        return np.where(is_pure_white, 255, 0).astype(np.uint8)

    def show_word_stats(self, word_counts, top_n=10):
        """Print the ``top_n`` most frequent words with their counts.

        :param word_counts: a ``collections.Counter`` of word frequencies
        :param top_n: how many of the most frequent words to print
        """
        print(f"\n{top_n}个高频词:")
        for i, (word, count) in enumerate(word_counts.most_common(top_n), 1):
            print(f"{i}. {word}: {count}")

    def add_filter_words(self, words):
        """Add one word, or a collection of words, to the extra filter set.

        :param words: a list / tuple / set / frozenset of words is added
            element by element; any other value is added as a single word
        """
        if isinstance(words, (list, tuple, set, frozenset)):
            self.extra_filter.update(words)
        else:
            self.extra_filter.add(words)
        print(f"已添加过滤词: {words}")