You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Crawler/generate_bullet_wordcloud.py

69 lines
2.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import collections
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import numpy as np
# Read the bullet-comment (danmaku) file.
def read_comments(file_path):
    """Return every line of *file_path* as a whitespace-stripped string.

    The file is decoded as UTF-8. A leading byte-order mark (which some
    editors prepend) is discarded so it cannot end up glued to the first
    word of the first comment.

    Args:
        file_path: Path of a text file holding one comment per line.

    Returns:
        A list with one entry per line, in file order. Leading and trailing
        whitespace (including the newline) is removed from each entry; blank
        lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged and drops a BOM when present.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        # Iterate the handle directly instead of materialising readlines().
        return [line.strip() for line in file]
# Count word frequencies.
def count_words(comments):
    """Count how often each whitespace-separated token occurs in *comments*.

    Tokenisation is a plain ``str.split()``: runs of whitespace separate
    tokens, so a comment that contains no whitespace (for example
    unsegmented Chinese text) is counted as one single token.

    Args:
        comments: Iterable of comment strings.

    Returns:
        A ``collections.Counter`` mapping each token to its number of
        occurrences. Empty or whitespace-only comments contribute nothing.
    """
    return collections.Counter(
        word for comment in comments for word in comment.split()
    )
# Render the word cloud.
def generate_wordcloud(word_frequencies, output_file, mask_image=None, *,
                       font_path='C:/Windows/Fonts/simhei.ttf', show=True):
    """Build a word cloud from *word_frequencies*, save it, and display it.

    Args:
        word_frequencies: Mapping of word -> frequency, e.g. a
            ``collections.Counter``.
        output_file: Path the rendered image is written to.
        mask_image: Optional array passed to ``WordCloud`` as ``mask`` to
            give the cloud a custom shape; ``None`` means no custom shape.
        font_path: Font file used for rendering. Defaults to SimHei at its
            usual Windows location; pass another path on other systems.
            NOTE(review): rendering Chinese tokens presumably needs a font
            with CJK glyphs - confirm when changing this.
        show: When true (the default), open a matplotlib window with the
            result after the file has been written.

    NOTE(review): the wordcloud library is expected to reject an empty
    frequency mapping - confirm and guard in the caller if needed.
    """
    wordcloud = WordCloud(
        width=1600,                 # higher image resolution
        height=800,                 # higher image resolution
        background_color='white',   # white background
        max_words=150,              # fewer words to avoid overlapping text
        min_font_size=10,           # keep the smallest font readable
        relative_scaling=0.5,       # font-size scaling ratio
        colormap='cool',            # cool-toned colour scheme
        mask=mask_image,            # optional custom shape mask
        contour_width=3,            # contour width
        contour_color='steelblue',  # contour colour
        font_path=font_path,
    ).generate_from_frequencies(word_frequencies)

    # Save before displaying: plt.show() blocks until the window is closed,
    # and a display failure must not cost us the output file.
    wordcloud.to_file(output_file)
    print(f"词云图已保存至 {output_file}")

    if show:
        plt.figure(figsize=(12, 6))  # size of the display window
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')  # hide the axes
        plt.show()
# Script entry point: read the file, count word frequencies, build the cloud.
file_path = '弹幕.txt'  # path of the bullet-comment file
output_wordcloud_file = 'bullet_wordcloud.png'  # file name of the output image
mask_image = None  # optional custom cloud shape; None means no custom shape


def main():
    """Run the pipeline: read comments, count words, render the word cloud.

    Uses the module-level ``file_path``, ``output_wordcloud_file`` and
    ``mask_image`` settings.
    """
    # 1. Read all bullet comments.
    comments = read_comments(file_path)
    # 2. Count word frequencies.
    word_frequencies = count_words(comments)
    # 3. Generate, save and display the word cloud.
    generate_wordcloud(word_frequencies, output_wordcloud_file, mask_image)


# Guarded so that importing this module does not read files or open windows.
if __name__ == '__main__':
    main()