|
|
from collections import Counter
|
|
|
|
|
|
|
|
|
class DanmukuAnalyzer:
    """Filter AI-related danmaku (bullet comments) and tally them by viewpoint.

    Attributes:
        danmukus: Input comments with surrounding whitespace stripped and
            blank entries removed.
        stopwords: Set of stop words (built-in defaults merged with the
            contents of ``stopwords.txt`` when that file is readable).
    """

    # Built-in stop words (common function words with little meaning).
    DEFAULT_STOPWORDS = frozenset(
        {"的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都"}
    )

    # A comment is considered AI-related if it contains any of these
    # substrings. Matching is case-sensitive.
    AI_KEYWORDS = frozenset(
        {"AI", "人工智能", "大模型", "语言模型", "LLM", "训练", "算法", "数据"}
    )

    # Name of the fallback category for comments matching no keyword list.
    OTHER_CATEGORY = "其他观点"

    # Viewpoint categories -> keyword lists. Insertion order matters: a
    # comment is assigned to the FIRST category with a matching keyword.
    # NOTE(review): "智能" (positive impact) is a substring of the AI keyword
    # "人工智能", so such comments land in "积极影响" unless an earlier
    # category matches first -- confirm this is intended.
    VIEW_CATEGORIES = {
        # Cost-related viewpoints.
        "应用成本": [
            "收费", "贵", "便宜",
            "性价比", "会员", "订阅", "费用", "氪金", "省钱", "花钱",
            "成本高", "零成本", "低价", "高价", "价格", "免费", "付费",
        ],
        # Application-scenario viewpoints.
        "应用领域": [
            "应用", "场景", "教育", "医疗", "创作", "办公", "学习",
            "编程", "写作", "翻译", "客服", "设计", "科研", "金融",
            "教学", "聊天", "助手", "工具", "领域", "行业",
        ],
        # Negative-impact viewpoints.
        "不利影响": [
            "风险", "偏见", "错误", "误导",
            "依赖", "滥用", "漏洞", "安全", "泄露", "失控", "替代",
            "虚假", "弊端", "危害", "伦理", "隐私", "失业",
        ],
        # Positive-impact viewpoints.
        "积极影响": [
            "高效", "省力", "智能",
            "好用", "厉害", "帮助", "提升", "优化", "进步", "解放",
            "神器", "给力", "便捷", "方便", "强大", "有用",
        ],
        # Fallback bucket; it has no keywords of its own.
        "其他观点": [],
    }

    def __init__(self, danmukus):
        """Store the cleaned comments and load the stop-word set.

        :param danmukus: iterable of comment strings.
        """
        # Strip each comment once, then drop the ones that end up empty.
        stripped = (d.strip() for d in danmukus)
        self.danmukus = [d for d in stripped if d]
        # Stop words are meant for filtering out meaningless tokens.
        self.stopwords = self._load_stopwords()

    def _load_stopwords(self):
        """Return the default stop words merged with those in ``stopwords.txt``.

        The file is read from the current working directory, one stop word
        per line, UTF-8 encoded. If it cannot be opened or decoded, only the
        defaults are returned.

        :return: a new ``set`` of stop-word strings.
        """
        stopwords = set(self.DEFAULT_STOPWORDS)
        try:
            with open("stopwords.txt", "r", encoding="utf-8") as f:
                file_stopwords = {line.strip() for line in f}
        except (OSError, UnicodeError):
            # Best effort: a missing/unreadable/mis-encoded file is not fatal.
            return stopwords
        # Blank lines would otherwise add "" as a stop word.
        file_stopwords.discard("")
        return stopwords | file_stopwords

    def get_ai_related(self):
        """Select AI-related comments and count how often each one occurs.

        :return: ``Counter`` mapping each distinct comment that contains at
            least one AI keyword to its number of occurrences.
        """
        keywords = self.AI_KEYWORDS
        return Counter(
            d for d in self.danmukus if any(k in d for k in keywords)
        )

    def classify_views(self, ai_counter):
        """Aggregate AI-related comment counts by viewpoint category.

        Each comment is assigned to the first category (in declaration
        order) that has a keyword occurring in the comment; comments that
        match nothing go to "其他观点".

        :param ai_counter: mapping of comment text -> occurrence count,
            e.g. the ``Counter`` returned by :meth:`get_ai_related`.
        :return: dict mapping every category name to its total count.
        """
        view_counts = {cat: 0 for cat in self.VIEW_CATEGORIES}

        for danmuku, count in ai_counter.items():
            target = self.OTHER_CATEGORY
            for cat, keywords in self.VIEW_CATEGORIES.items():
                # An empty keyword list (the fallback bucket) never matches.
                if any(k in danmuku for k in keywords):
                    target = cat
                    break  # first match wins; avoid double counting
            view_counts[target] += count

        return view_counts