You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Crawler/generate_bullet_wordcloud.py

69 lines
2.3 KiB

import collections
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import numpy as np
# Read the bullet-comment (danmaku) file
def read_comments(file_path):
    """Return every line of the comment file with surrounding whitespace removed.

    Args:
        file_path: Path to a UTF-8 text file containing one comment per line.

    Returns:
        A list of strings, one per line in the file, in file order. Blank
        lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' decodes BOM-less UTF-8 exactly like 'utf-8', but also drops
    # a leading byte-order mark. With plain 'utf-8' the BOM would survive as
    # '\ufeff' glued to the first comment, because str.strip() does not treat
    # it as whitespace.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        # Strip each line's trailing newline (and any surrounding whitespace).
        return [line.strip() for line in file]
# Count word frequencies
def count_words(comments):
    """Tally how often each whitespace-separated token appears in the comments.

    Args:
        comments: Iterable of comment strings.

    Returns:
        A collections.Counter mapping each token to its number of occurrences.
    """
    # Simple tokenization: split every comment on whitespace and flatten the
    # resulting tokens into one stream for the counter.
    tokens = (token for line in comments for token in line.split())
    return collections.Counter(tokens)
# Generate the word cloud image
def generate_wordcloud(word_frequencies, output_file, mask_image=None,
                       font_path='C:/Windows/Fonts/simhei.ttf'):
    """Render a word cloud from word frequencies, save it, then display it.

    Args:
        word_frequencies: Mapping of word -> frequency, passed to
            WordCloud.generate_from_frequencies.
        output_file: Path the rendered image is written to.
        mask_image: Optional mask passed to WordCloud as ``mask`` to give the
            cloud a custom shape; None to use no mask.
        font_path: Font file used to draw the words. Defaults to the SimHei
            font at its usual Windows location; on other platforms pass the
            path of an installed font that covers the characters in the data.
    """
    wordcloud = WordCloud(
        width=1600,                 # higher image resolution
        height=800,                 # higher image resolution
        background_color='white',   # white background
        max_words=150,              # cap the word count to reduce overlap
        min_font_size=10,           # keep the smallest words legible
        relative_scaling=0.5,       # how strongly frequency drives font size
        colormap='cool',            # cool-toned color scheme
        mask=mask_image,            # optional custom-shape mask
        contour_width=3,            # contour line width
        contour_color='steelblue',  # contour line color
        font_path=font_path,        # font used to render the words
    ).generate_from_frequencies(word_frequencies)

    # Save before showing: with an interactive backend plt.show() typically
    # blocks until the window is closed, and an interrupted display should not
    # cost the user the output file.
    wordcloud.to_file(output_file)
    print(f"词云图已保存至 {output_file}")

    # Display the word cloud
    plt.figure(figsize=(12, 6))  # size of the display window
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()
# Entry point: read the file, count word frequencies, generate the word cloud
def main():
    """Build a word cloud image from the bullet-comment file.

    Reads the comments from '弹幕.txt', counts word frequencies, then renders
    the word cloud and saves it to 'bullet_wordcloud.png'.
    """
    file_path = '弹幕.txt'  # path of the bullet-comment file
    output_wordcloud_file = 'bullet_wordcloud.png'  # output image file name

    # 1. Read all comments
    comments = read_comments(file_path)

    # 2. Count word frequencies
    word_frequencies = count_words(comments)

    # 3. (Optional) use a custom image as the word cloud shape
    mask_image = None  # None means no custom shape

    # 4. Generate and display the word cloud
    generate_wordcloud(word_frequencies, output_wordcloud_file, mask_image)


# Run only when executed as a script, so importing this module for its helper
# functions does not read files or open a plot window as a side effect.
if __name__ == "__main__":
    main()