|
|
import jieba
|
|
|
from wordcloud import WordCloud
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
# Configure a Chinese font (SimHei) as matplotlib's default font family.
plt.rcParams["font.family"] = ["SimHei"]
|
|
|
|
|
|
class WordCloudGenerator:
    """Build word-cloud images from text segmented with jieba.

    Tokens are filtered against ``self.stopwords`` and tokens shorter than
    two characters are dropped before the remaining words are passed to
    ``WordCloud``.
    """

    def __init__(self, stopwords=None):
        """
        Initialise the word-cloud generator.

        :param stopwords: iterable of stop words; when ``None`` the built-in
            default stop words are used
        """
        if stopwords is None:
            self.stopwords = self._get_default_stopwords()
        else:
            # Copy into a set so the per-token membership test stays O(1)
            # even when the caller hands in a list or tuple.
            self.stopwords = set(stopwords)

    def _get_default_stopwords(self):
        """Return a fresh set containing the built-in default stop words."""
        return {
            "的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都", "一",
            "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着",
            "没有", "看", "好", "自己", "这",
        }

    def load_stopwords_from_file(self, file_path="stopwords.txt"):
        """
        Replace the current stop words with the entries of a text file.

        The file is read as one stop word per line; surrounding whitespace is
        stripped and blank lines are ignored. If the file cannot be opened or
        decoded, a message is printed and the default stop words are used
        instead.

        :param file_path: path of the stop-word file
        :return: this generator, so calls can be chained
        """
        try:
            # "utf-8-sig" reads plain UTF-8 as well, but additionally drops a
            # leading BOM that would otherwise stick to the first stop word.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                loaded = {line.strip() for line in f}
        except (OSError, ValueError, TypeError) as e:
            # OSError: missing/unreadable file; ValueError: undecodable bytes
            # (UnicodeDecodeError) or an invalid path string; TypeError: a
            # path of an unsupported type. All fall back to the defaults.
            print(f"加载停用词文件失败: {e},将使用默认停用词")
            self.stopwords = self._get_default_stopwords()
        else:
            # Blank lines strip down to "", which is not a real stop word.
            loaded.discard("")
            self.stopwords = loaded
            print(f"成功从{file_path}加载停用词")
        return self

    def _filter_words(self, words):
        """Strip each token and keep those that are longer than one character
        and not in ``self.stopwords``."""
        kept = []
        for word in words:
            token = word.strip()
            # Stripping first means whitespace-only tokens such as "\r\n"
            # cannot slip past the length check.
            if len(token) > 1 and token not in self.stopwords:
                kept.append(token)
        return kept

    def generate_from_texts(self, texts, filename="词云图.png", font_path="simhei.ttf",
                            width=1200, height=800, max_words=200):
        """
        Generate a word cloud from a collection of texts, save it and show it.

        :param texts: iterable of strings (a single string is treated as a
            one-element list); ``None``, non-string and blank entries are
            skipped
        :param filename: path the figure is saved to
        :param font_path: font file passed to ``WordCloud``
        :param width: word-cloud width passed to ``WordCloud``
        :param height: word-cloud height passed to ``WordCloud``
        :param max_words: maximum number of words passed to ``WordCloud``
        :return: the generated ``WordCloud`` object, or ``None`` when there is
            no usable text
        """
        if not texts:
            print("没有可用于生成词云的文本数据")
            return

        # A bare string would otherwise be joined character by character.
        if isinstance(texts, str):
            texts = [texts]

        usable_texts = [t for t in texts if isinstance(t, str) and t.strip()]
        if not usable_texts:
            print("没有可用于生成词云的文本数据")
            return

        # Merge the texts and segment them into words.
        all_text = " ".join(usable_texts)
        words = jieba.cut(all_text)

        # Drop stop words, whitespace and single-character tokens.
        filtered_words = self._filter_words(words)
        processed_text = " ".join(filtered_words)

        if not processed_text:
            print("处理后的文本为空,无法生成词云")
            return

        # Build the word cloud.
        wc = WordCloud(
            font_path=font_path,
            background_color="white",
            width=width,
            height=height,
            max_words=max_words,
            collocations=False,
        ).generate(processed_text)

        # Display and save the word cloud. The figure is sized in inches as
        # pixels / 100 and written out with dpi=300.
        fig = plt.figure(figsize=(width / 100, height / 100))
        try:
            plt.imshow(wc, interpolation="bilinear")
            plt.axis("off")
            plt.tight_layout()
            plt.savefig(filename, dpi=300)
            plt.show()
        finally:
            # Release the figure so repeated calls do not accumulate open
            # figures in pyplot's global state.
            plt.close(fig)

        print(f"词云图已保存到{filename}")

        return wc