You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
41 lines
1.3 KiB
41 lines
1.3 KiB
import csv
|
|
import os
|
|
from wordcloud import WordCloud, STOPWORDS
|
|
import jieba
|
|
|
|
# Generate one word-cloud PNG per danmaku CSV file.
#
# Every "*.csv" file in `csv_dir` is read, the first column of each row is
# treated as danmaku text, the text is segmented with jieba, stop words are
# removed, and the result is rendered to "word_clouds/<csv name>.png".

# Paths to the danmaku CSV directory and the Chinese stop-word list.
csv_dir = "./danmaku_files"
stop_words_file = "./cn_stopwords.txt"

# Load the Chinese stop words (one entry per line).
with open(stop_words_file, encoding="utf8") as f:
    stop_words = f.read().splitlines()

# Merge wordcloud's built-in stop words with the Chinese list once, up front:
# the set does not depend on the file being processed, so there is no reason
# to rebuild it on every loop iteration.
stopwords = set(STOPWORDS)
stopwords.update(stop_words)

# Create the folder that stores the results; exist_ok avoids the separate
# existence check (and the race between checking and creating).
os.makedirs("word_clouds", exist_ok=True)

# Walk through every file in danmaku_files.
for filename in os.listdir(csv_dir):
    if not filename.endswith(".csv"):
        continue

    # Read the CSV file and extract the danmaku text from the first column.
    # newline="" is what the csv module expects for files handed to a reader.
    # Blank lines come back from csv.reader as empty lists, so they are
    # skipped here instead of raising IndexError on row[0].
    csv_path = os.path.join(csv_dir, filename)
    with open(csv_path, encoding="utf8", newline="") as f:
        text = ' '.join(row[0] for row in csv.reader(f) if row)

    # Segment the text with jieba and drop stop words. Whitespace-only
    # tokens carry no content, so they are dropped too; this lets us detect
    # files that end up with nothing to draw.
    words = [
        word
        for word in jieba.cut(text)
        if word.strip() and word not in stopwords
    ]
    if not words:
        # WordCloud.generate raises ValueError when it has no words to plot;
        # skip this file rather than aborting the whole batch.
        print(f"Skipping {filename}: no words left after stop-word removal")
        continue
    text = ' '.join(words)

    # Build the word cloud. font_path must point at a font with CJK glyphs,
    # otherwise Chinese words cannot be rendered.
    wc = WordCloud(
        background_color="white",
        font_path='simsun.ttc',
        max_words=100,
        width=800,
        height=600,
    )
    wc.generate(text)

    # Save the word cloud as a PNG named after the CSV file.
    base_name = os.path.splitext(filename)[0]
    wc.to_file(os.path.join("word_clouds", base_name + ".png"))