You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

78 lines
4.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from collections import Counter
class DanmukuAnalyzer:
    """Filter AI-related danmaku (bullet comments) and bucket them by viewpoint.

    Typical use::

        analyzer = DanmukuAnalyzer(raw_comments)
        ai_counter = analyzer.get_ai_related()
        view_counts = analyzer.classify_views(ai_counter)
    """

    # A comment containing any of these substrings is treated as AI-related.
    # Matching is case-sensitive substring search.
    _AI_KEYWORDS = (
        "AI", "人工智能", "大模型", "语言模型", "LLM", "训练", "算法", "数据",
    )

    # Fallback bucket for comments that match no viewpoint category.
    _OTHER_CATEGORY = "其他观点"

    # Viewpoint category -> keywords. Dict order is the matching priority:
    # a comment is assigned to the FIRST category with a matching keyword.
    _VIEW_CATEGORIES = {
        # Cost-related opinions.
        "应用成本": [
            "收费", "便宜",
            "性价比", "会员", "订阅", "费用", "氪金", "省钱", "花钱",
            "成本高", "零成本", "低价", "高价", "价格", "免费", "付费",
        ],
        # Application-scenario opinions.
        "应用领域": [
            "应用", "场景", "教育", "医疗", "创作", "办公", "学习",
            "编程", "写作", "翻译", "客服", "设计", "科研", "金融",
            "教学", "聊天", "助手", "工具", "领域", "行业",
        ],
        # Negative-impact opinions.
        "不利影响": [
            "风险", "偏见", "错误", "误导",
            "依赖", "滥用", "漏洞", "安全", "泄露", "失控", "替代",
            "虚假", "弊端", "危害", "伦理", "隐私", "失业",
        ],
        # Positive-impact opinions.
        "积极影响": [
            "高效", "省力", "智能",
            "好用", "厉害", "帮助", "提升", "优化", "进步", "解放",
            "神器", "给力", "便捷", "方便", "强大", "有用",
        ],
    }

    def __init__(self, danmukus):
        """Store the cleaned comments and load the stopword set.

        :param danmukus: iterable of comment strings; ``None`` is treated as
            empty. Each comment is stripped of surrounding whitespace and
            blank comments are dropped.
        """
        stripped = (d.strip() for d in (danmukus or []))
        self.danmukus = [d for d in stripped if d]
        # Stopwords are intended for filtering meaningless words.
        self.stopwords = self._load_stopwords()

    def _load_stopwords(self):
        """Return the default stopwords merged with those in ``stopwords.txt``.

        The file is read from the current working directory, one stopword per
        line (UTF-8). If it cannot be read or decoded, only the defaults are
        returned.

        :return: set of stopword strings
        """
        # NOTE(review): every default entry is an empty string in the source
        # as received, so the set collapses to {""}; the real stopwords appear
        # to have been lost and should be restored.
        default_stopwords = {""}
        try:
            with open("stopwords.txt", "r", encoding="utf-8") as f:
                file_stopwords = {line.strip() for line in f}
        except (OSError, UnicodeDecodeError):
            # Best effort: a missing/unreadable file is not an error.
            return default_stopwords
        return default_stopwords | file_stopwords

    def get_ai_related(self):
        """Count occurrences of each AI-related comment.

        A comment is AI-related when it contains at least one AI keyword.

        :return: ``Counter`` mapping comment text to its number of occurrences
        """
        return Counter(
            d for d in self.danmukus
            if any(k in d for k in self._AI_KEYWORDS)
        )

    def classify_views(self, ai_counter):
        """Total the AI-related comments per viewpoint category.

        Each distinct comment is assigned to the first category (in
        declaration order) that has a keyword occurring in the comment, or to
        "其他观点" when none matches; its occurrence count is added to that
        category.

        :param ai_counter: mapping of comment text to occurrence count
            (e.g. the result of :meth:`get_ai_related`)
        :return: dict mapping every category name to its total count
        """
        view_counts = {cat: 0 for cat in self._VIEW_CATEGORIES}
        view_counts[self._OTHER_CATEGORY] = 0

        for danmuku, count in ai_counter.items():
            category = next(
                (
                    cat
                    for cat, keywords in self._VIEW_CATEGORIES.items()
                    # `k and ...` guards against empty keywords: "" is a
                    # substring of every string and would match everything.
                    if any(k and k in danmuku for k in keywords)
                ),
                self._OTHER_CATEGORY,
            )
            view_counts[category] += count
        return view_counts