You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

63 lines
2.1 KiB

import re
import jieba
import numpy as np
from collections import Counter
from PIL import Image
from wordcloud import WordCloud
class DataAnalysis:
    """
    Analysis helpers for danmaku (bullet-comment) text.

    Main features:
    1. Count how often each keyword occurs in the comments.
    2. Render a word-cloud image from the comments.
    """

    def __init__(self):
        pass

    @staticmethod
    def count_keywords(data: list, keywords: list) -> list:
        """
        Count case-insensitive occurrences of each keyword in the comments.

        Every keyword is counted independently, so a keyword that is a
        prefix/substring of another one (e.g. "AI" and "AI技术") does not
        hide the other, and the result does not depend on keyword order.
        Counting is done per comment, so a match never spans two comments.

        :param data: list of comment strings
        :param keywords: list of keyword strings
        :return: list of ``[keyword, count]`` pairs, in the order of
                 ``keywords`` and with the original keyword spelling;
                 an empty keyword always gets a count of 0
        """
        # Lower-case once up front instead of once per keyword.
        lowered_comments = [comment.lower() for comment in data]

        totals = {}
        for keyword in keywords:
            needle = keyword.lower()
            if needle in totals:
                # Same keyword (ignoring case) was already counted.
                continue
            if not needle:
                # str.count('') would report len + 1 "matches"; an empty
                # keyword carries no information, so report zero.
                totals[needle] = 0
                continue
            totals[needle] = sum(
                comment.count(needle) for comment in lowered_comments
            )

        return [[keyword, totals[keyword.lower()]] for keyword in keywords]

    @staticmethod
    def make_wordcloud(
        background,
        data: list,
        stopwords: list,
        output_path: str,
        font_path: str = 'assets/微软雅黑.ttf',
    ) -> None:
        """
        Render a word cloud from the comments and save it as an image.

        :param background: path of the image used as the word-cloud mask
        :param data: list of comment strings
        :param stopwords: words to exclude from the cloud
        :param output_path: path the generated image is written to
        :param font_path: font file used to draw the words; defaults to the
                          font the project originally hard-coded
        :return: None
        """
        # Keep comments separated while segmenting so that the end of one
        # comment and the start of the next are never fused into one word.
        merged_text = '\n'.join(data)
        tokens = jieba.lcut(merged_text)
        spaced_text = ' '.join(tokens)

        # Close the image file as soon as its pixels are copied into the array.
        with Image.open(background) as mask_image:
            mask = np.array(mask_image)

        # NOTE: per the wordcloud documentation, width/height are ignored
        # when a mask is supplied (the mask's shape is used instead); they
        # are kept so the constructor call matches the original settings.
        cloud = WordCloud(
            max_words=256,
            mask=mask,
            max_font_size=120,
            font_path=font_path,
            width=1024,
            height=1024,
            stopwords=set(stopwords),
        ).generate(spaced_text)

        # Write the rendered word cloud to disk.
        cloud.to_file(output_path)