You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
63 lines
2.1 KiB
63 lines
2.1 KiB
import re
|
|
import jieba
|
|
import numpy as np
|
|
from collections import Counter
|
|
from PIL import Image
|
|
from wordcloud import WordCloud
|
|
|
|
|
|
class DataAnalysis:
    """
    Helpers for analysing danmaku (bullet-comment) text.

    Main features:
    1. Count how often each keyword occurs in the comments.
    2. Render a word cloud image from the comments.

    Both helpers are static methods, so no instance state is required.
    """

    @staticmethod
    def count_keywords(data: list, keywords: list) -> list:
        """
        Count case-insensitive occurrences of each keyword in the comments.

        Every keyword is counted independently of the others, so the result
        for one keyword does not depend on which other keywords are passed or
        on their order (e.g. ``"AI"`` and ``"AI技术"`` are both counted in
        ``"AI技术"``). Occurrences are counted inside each comment separately,
        so a keyword never matches across the boundary of two comments.

        :param data: list of comment strings
        :param keywords: list of keyword strings
        :return: list of ``[keyword, count]`` pairs, in the order of ``keywords``
        """
        # Lower-case every comment once instead of once per keyword.
        lowered_comments = [comment.lower() for comment in data]

        result = []
        for keyword in keywords:
            needle = keyword.lower()
            if needle:
                count = sum(comment.count(needle) for comment in lowered_comments)
            else:
                # str.count('') reports len + 1 matches; an empty keyword
                # carries no information, so report zero instead.
                count = 0
            result.append([keyword, count])
        return result

    @staticmethod
    def make_wordcloud(background, data: list, stopwords: list, output_path: str,
                       font_path: str = 'assets/微软雅黑.ttf') -> None:
        """
        Generate a word cloud image from the comments and save it to disk.

        :param background: path of the image used as the word cloud mask
        :param data: list of comment strings
        :param stopwords: list of words to exclude from the word cloud
        :param output_path: path the rendered word cloud image is written to
        :param font_path: path of the font file used for rendering; defaults
            to the bundled Microsoft YaHei font
        :return: None
        """
        # Segment each comment on its own so jieba cannot build a token out of
        # the end of one comment and the beginning of the next.
        tokens = [token for comment in data for token in jieba.lcut(comment)]
        text = ' '.join(tokens)

        # Convert to an array while the file is still open (this forces the
        # pixel data to be read), then let the context manager close it.
        with Image.open(background) as mask_image:
            mask = np.array(mask_image)

        wordcloud = WordCloud(
            max_words=256,
            mask=mask,
            max_font_size=120,
            font_path=font_path,
            width=1024,
            height=1024,
            stopwords=set(stopwords),
        ).generate(text)

        # Save the rendered word cloud image.
        wordcloud.to_file(output_path)
|