ADD file via upload

main
fzu102301136 5 months ago
parent e0a0dd8b36
commit 221b2219de

@@ -0,0 +1,78 @@
from collections import Counter
class DanmukuAnalyzer:
    """Filter AI-related danmuku (bullet comments) and tally them by viewpoint.

    Typical use::

        analyzer = DanmukuAnalyzer(raw_danmukus)
        ai_counter = analyzer.get_ai_related()
        view_counts = analyzer.classify_views(ai_counter)

    All keyword checks are case-sensitive substring matches.
    """

    # A danmuku containing any of these substrings is treated as AI-related.
    AI_KEYWORDS = (
        "AI", "人工智能", "大模型", "语言模型", "LLM", "训练", "算法", "数据",
    )

    # Bucket for danmuku that match none of the keyword categories below.
    OTHER_CATEGORY = "其他观点"

    # Viewpoint category -> keywords. Dict order matters: a danmuku is
    # assigned to the FIRST category with a matching keyword.
    # Every keyword must be non-empty ("" is a substring of every string and
    # would route everything into its category) and every entry needs its own
    # trailing comma (adjacent string literals are silently concatenated).
    VIEW_CATEGORIES = {
        # Cost-related opinions.
        "应用成本": (
            "收费", "便宜",
            "性价比", "会员", "订阅", "费用", "氪金", "省钱", "花钱",
            "成本高", "零成本", "低价", "高价", "价格", "免费", "付费",
        ),
        # Opinions about application scenarios.
        "应用领域": (
            "应用", "场景", "教育", "医疗", "创作", "办公", "学习",
            "编程", "写作", "翻译", "客服", "设计", "科研", "金融",
            "教学", "聊天", "助手", "工具", "领域", "行业",
        ),
        # Opinions about negative effects.
        "不利影响": (
            "风险", "偏见", "错误", "误导",
            "依赖", "滥用", "漏洞", "安全", "泄露", "失控", "替代",
            "虚假", "弊端", "危害", "伦理", "隐私", "失业",
        ),
        # Opinions about positive effects.
        "积极影响": (
            "高效", "省力", "智能",
            "好用", "厉害", "帮助", "提升", "优化", "进步", "解放",
            "神器", "给力", "便捷", "方便", "强大", "有用",
        ),
    }

    def __init__(self, danmukus):
        """Store the cleaned danmuku list and load the stop-word set.

        :param danmukus: iterable of danmuku strings; each one is stripped of
            surrounding whitespace and dropped if nothing remains.
        """
        stripped = (d.strip() for d in danmukus)
        self.danmukus = [d for d in stripped if d]
        # Stop words are loaded here for filtering meaningless tokens.
        self.stopwords = self._load_stopwords()

    def _load_stopwords(self) -> set:
        """Return the default stop words merged with those in ``stopwords.txt``.

        The file is opened relative to the current working directory and read
        as UTF-8, one stop word per line (surrounding whitespace stripped).
        If it cannot be opened or decoded, only the defaults are returned.
        """
        # NOTE(review): every default entry is an empty string, so this set
        # collapses to {""}. The intended words appear to have been lost;
        # confirm against the author's copy and restore them.
        default_stopwords = {"", "", "", "", "", "", "", "", "", "", ""}
        try:
            with open("stopwords.txt", "r", encoding="utf-8") as f:
                file_stopwords = {line.strip() for line in f}
        except (OSError, UnicodeDecodeError):
            # Best effort: a missing or unreadable file is not an error, but
            # unrelated exceptions (e.g. KeyboardInterrupt) must propagate.
            return default_stopwords
        return default_stopwords | file_stopwords

    def get_ai_related(self) -> Counter:
        """Count occurrences of each danmuku that mentions an AI keyword.

        :return: Counter mapping each AI-related danmuku text to the number
            of times it appears in ``self.danmukus``.
        """
        keywords = self.AI_KEYWORDS
        return Counter(
            d for d in self.danmukus if any(k in d for k in keywords)
        )

    def classify_views(self, ai_counter) -> dict:
        """Aggregate AI-related danmuku counts per viewpoint category.

        Each danmuku is assigned to the first category in ``VIEW_CATEGORIES``
        that has a keyword occurring in its text, or to ``OTHER_CATEGORY``
        when none matches. Its occurrence count is added to that category.

        :param ai_counter: mapping of danmuku text to occurrence count, e.g.
            the Counter returned by :meth:`get_ai_related`.
        :return: dict mapping every category name (including
            ``OTHER_CATEGORY``) to its total count; unmatched categories
            are present with a value of 0.
        """
        view_counts = {cat: 0 for cat in self.VIEW_CATEGORIES}
        view_counts[self.OTHER_CATEGORY] = 0

        for danmuku, count in ai_counter.items():
            for cat, keywords in self.VIEW_CATEGORIES.items():
                if any(k in danmuku for k in keywords):
                    view_counts[cat] += count
                    break  # first match wins; never count a danmuku twice
            else:
                # Loop finished without a break: no category matched.
                view_counts[self.OTHER_CATEGORY] += count
        return view_counts
Loading…
Cancel
Save