You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
AAA/wordcloud_generator.py

79 lines
2.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Configure matplotlib to use a Chinese font (SimHei) for text it renders.
# This runs at import time and changes the process-wide rcParams.
# NOTE(review): presumably SimHei must be installed on the host — confirm.
plt.rcParams["font.family"] = ["SimHei"]
class WordCloudGenerator:
    """Build word-cloud images from Chinese text segmented with jieba.

    The instance holds a set of stopwords (``self.stopwords``) that are
    removed from the segmented text before the cloud is rendered.
    """

    def __init__(self, stopwords=None):
        """
        Initialise the word-cloud generator.

        :param stopwords: iterable of stopwords; when ``None`` the built-in
            default stopwords are used. The values are copied into a new set
            so membership tests are O(1) and the caller's container is not
            shared with the instance.
        """
        if stopwords is None:
            self.stopwords = self._get_default_stopwords()
        else:
            self.stopwords = set(stopwords)

    def _get_default_stopwords(self):
        """Return a new set containing the built-in default stopwords."""
        # NOTE(review): the empty-string entries look like single-character
        # stopwords that were lost from this copy of the file — confirm
        # against the original list. They do not change the filtering result
        # either way, because tokens shorter than two characters are always
        # dropped by _filter_words.
        return set(["", "", "", "", "", "", "", "", "", "", "", "",
                    "一个", "", "", "", "", "", "", "", "", "", "",
                    "没有", "", "", "自己", ""])

    def load_stopwords_from_file(self, file_path="stopwords.txt"):
        """
        Replace the current stopwords with the ones listed in a text file.

        The file is expected to contain one stopword per line. Surrounding
        whitespace is stripped and blank lines are ignored. If the file cannot
        be opened or decoded, a message is printed and the default stopwords
        are used instead.

        :param file_path: path of the stopword file
        :return: ``self``, so calls can be chained
        """
        try:
            # "utf-8-sig" reads plain UTF-8 as well as UTF-8 with a BOM, so a
            # BOM written by some editors does not end up glued to the first
            # stopword.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                loaded = {line.strip() for line in f}
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: undecodable bytes
            # (UnicodeDecodeError) or an invalid path string.
            print(f"加载停用词文件失败: {e},将使用默认停用词")
            self.stopwords = self._get_default_stopwords()
        else:
            # Kept out of the try body so that only file access/decoding can
            # trigger the fallback to the default stopwords.
            loaded.discard("")
            self.stopwords = loaded
            print(f"成功从{file_path}加载停用词")
        return self

    def _filter_words(self, words):
        """Return the tokens in *words* that are at least two characters long
        (after stripping whitespace) and are not stopwords."""
        stripped = (word.strip() for word in words)
        return [word for word in stripped
                if len(word) > 1 and word not in self.stopwords]

    def generate_from_texts(self, texts, filename="词云图.png", font_path="simhei.ttf",
                            width=1200, height=800, max_words=200, dpi=300, show=True):
        """
        Generate a word cloud from a collection of texts, save it and
        optionally display it.

        :param texts: iterable of strings; a single string is treated as one
            text. Entries that are not strings (for example ``None``) and
            blank strings are skipped.
        :param filename: path the rendered figure is saved to
        :param font_path: path of the font file passed to ``WordCloud``
        :param width: width of the word cloud in pixels
        :param height: height of the word cloud in pixels
        :param max_words: maximum number of words in the cloud
        :param dpi: resolution used when saving the figure; the figure is
            ``width / 100`` by ``height / 100`` inches, so the saved image is
            ``dpi / 100`` times the word-cloud size
        :param show: whether to call ``plt.show()`` after saving
        :return: the generated ``WordCloud`` object, or ``None`` when there is
            no usable text
        """
        if texts is None:
            texts = []
        elif isinstance(texts, str):
            # Joining a bare string would insert a space between every
            # character and break word segmentation.
            texts = [texts]
        usable_texts = [text for text in texts
                        if isinstance(text, str) and text.strip()]
        if not usable_texts:
            print("没有可用于生成词云的文本数据")
            return None

        # Merge the texts, segment them, then drop stopwords and short tokens.
        words = jieba.cut(" ".join(usable_texts))
        processed_text = " ".join(self._filter_words(words))
        if not processed_text:
            print("处理后的文本为空,无法生成词云")
            return None

        # collocations=False: do not add two-word phrases to the cloud.
        wc = WordCloud(
            font_path=font_path,
            background_color="white",
            width=width,
            height=height,
            max_words=max_words,
            collocations=False,
        ).generate(processed_text)

        # Render, save and (optionally) display the cloud.
        fig = plt.figure(figsize=(width / 100, height / 100))
        try:
            plt.imshow(wc, interpolation="bilinear")
            plt.axis("off")
            plt.tight_layout()
            plt.savefig(filename, dpi=dpi)
            if show:
                plt.show()
        finally:
            # Release the figure so repeated calls do not accumulate open
            # figures. In interactive mode show() returns immediately, so the
            # window is left open for the user in that case.
            if not (show and plt.isinteractive()):
                plt.close(fig)
        print(f"词云图已保存到{filename}")
        return wc