|
|
import matplotlib.pyplot as plt
|
|
|
import jieba
|
|
|
import numpy as np
|
|
|
from wordcloud import WordCloud
|
|
|
from PIL import Image
|
|
|
|
|
|
# Make matplotlib render Chinese text by selecting the SimHei font family.
plt.rcParams.update({"font.family": ["SimHei"]})
|
|
|
|
|
|
|
|
|
class WordCloudVisualizer:
    """Build, display and save a word cloud from danmaku (bullet comments).

    The comments are segmented with jieba, filtered by stopwords / length /
    frequency, and rendered with ``wordcloud.WordCloud``. An optional mask
    image restricts the words to the non-white area of that image.
    """

    def __init__(self):
        # Extra filter words: common tokens that carry no meaning in the
        # danmaku context. Extend via add_filter_words().
        self.extra_filter = {"视频", "UP主", "弹幕", "这里", "一下", "大家", "今天", "这个", "什么"}

    def generate(self, danmukus, stopwords,
                 min_length=2,
                 min_freq=3,
                 filename="弹幕词云.png",
                 mask_image="yun.png",
                 background_color="white",
                 max_words=200,
                 colormap="viridis",
                 white_threshold=245,
                 font_path="C:/Windows/Fonts/simhei.ttf"):
        """Generate, show and save a word cloud for the given danmaku.

        Only (near-)white regions of the mask image stay empty; every other
        region (black / grey / coloured) may receive words.

        :param danmukus: iterable of danmaku strings
        :param stopwords: collection of stopwords to drop
        :param min_length: minimum token length (shorter tokens are dropped)
        :param min_freq: minimum occurrence count (rarer tokens are dropped)
        :param filename: path the rendered image is saved to
        :param mask_image: path of the shape mask image; falsy disables it
        :param background_color: background colour of the word cloud
        :param max_words: maximum number of words drawn
        :param colormap: matplotlib colormap name used for the words
        :param white_threshold: 0-255; a pixel whose R, G and B values are
            all >= this value counts as white and receives no words
        :param font_path: path of a font file that can render Chinese
        :return: None
        """
        # 1. Tokenise and count the words that survive the filters.
        all_text = " ".join(danmukus)
        word_counts = self._count_words(jieba.cut(all_text), stopwords, min_length)

        # 2. Frequency filter. The counts are kept (not just the words) so
        #    the cloud can size each word by how often it actually occurs.
        frequencies = {
            word: count
            for word, count in word_counts.items()
            if count >= min_freq
        }
        if not frequencies:
            print("过滤后无有效词汇,无法生成词云")
            return

        # 3. Optional shape mask; None means a plain rectangular cloud.
        mask = self._load_mask(mask_image, white_threshold) if mask_image else None

        # 4. Build the word cloud; with a mask the canvas takes its size.
        wc = WordCloud(
            font_path=font_path,
            background_color=background_color,
            width=mask.shape[1] if mask is not None else 1200,
            height=mask.shape[0] if mask is not None else 800,
            max_words=max_words,
            collocations=False,
            colormap=colormap,
            mask=mask,
        )
        # Feeding frequencies directly preserves the real word weights and
        # skips WordCloud's own tokenizer (the text is already segmented).
        wc.generate_from_frequencies(frequencies)

        # 5. Display and save the image.
        fig = plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.show()
        # Release the figure so repeated calls do not accumulate figures.
        plt.close(fig)
        print(f"词云图保存: {filename}")

        # Print the high-frequency word statistics.
        self.show_word_stats(word_counts)

    def _count_words(self, tokens, stopwords, min_length):
        """Count the tokens that pass the stopword / filter / length checks."""
        from collections import Counter

        return Counter(
            word for word in tokens
            if word not in stopwords
            and word not in self.extra_filter
            and len(word) >= min_length
            # Drop whitespace-only / punctuation-only tokens: they would
            # otherwise be drawn as blank or meaningless entries.
            and any(ch.isalnum() for ch in word)
        )

    @staticmethod
    def _white_mask(pixels, white_threshold):
        """Return a uint8 mask: 255 where a pixel is white, 0 elsewhere.

        ``pixels`` is a 2-D (grayscale) or 3-D (colour) array-like. For
        colour input only the first three channels are compared. WordCloud
        treats 255 as "masked out", so words land on the 0 entries.
        """
        pixels = np.asarray(pixels)
        if pixels.ndim == 3:
            is_white = (pixels[:, :, :3] >= white_threshold).all(axis=2)
        else:
            is_white = pixels >= white_threshold
        return np.where(is_white, 255, 0).astype(np.uint8)

    @staticmethod
    def _load_mask(mask_image, white_threshold):
        """Load ``mask_image`` and build its mask; return None on failure."""
        try:
            # The context manager closes the file handle; converting to RGB
            # normalises palette / bilevel / alpha images to three channels.
            with Image.open(mask_image) as img:
                pixels = np.array(img.convert("RGB"))
            print(f"已加载图片: {mask_image}")
            mask = WordCloudVisualizer._white_mask(pixels, white_threshold)
            print("已处理图片,非白色区域可写字")
            return mask
        except Exception as e:
            # Deliberate best effort: any load failure falls back to a
            # rectangular word cloud instead of aborting.
            print(f"加载图片失败: {e},将使用矩形词云")
            return None

    def show_word_stats(self, word_counts, top_n=10):
        """Print the ``top_n`` most frequent words with their counts.

        :param word_counts: a ``collections.Counter`` of word occurrences
        :param top_n: number of entries to print
        """
        top_words = word_counts.most_common(top_n)
        print(f"\n前{top_n}个高频词:")
        for i, (word, count) in enumerate(top_words, 1):
            print(f"{i}. {word}: {count}次")

    def add_filter_words(self, words):
        """Add one word, or a collection of words, to the extra filter set.

        :param words: a single string, or a list / tuple / set / frozenset
            of strings
        """
        if isinstance(words, (list, tuple, set, frozenset)):
            # A collection: add every element.
            self.extra_filter.update(words)
        else:
            # A single word: add it as-is.
            self.extra_filter.add(words)
        print(f"已添加过滤词: {words}")