from collections import Counter


class DanmukuAnalyzer:
    """Analyze a collection of danmuku (bullet-screen comments).

    The analyzer keeps the non-blank, whitespace-stripped comments, can
    select the ones that mention AI-related keywords, and can bucket those
    into coarse opinion categories by keyword matching.
    """

    # Built-in stop words (common function words that carry little meaning).
    _DEFAULT_STOPWORDS = frozenset(
        {"的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都"}
    )

    # A comment is considered AI-related if it contains any of these
    # substrings. Matching is case-sensitive.
    _AI_KEYWORDS = (
        "AI", "人工智能", "大模型", "语言模型", "LLM", "训练", "算法", "数据",
    )

    # Name of the fallback bucket for comments that match no category.
    _OTHER_CATEGORY = "其他观点"

    # Opinion categories in priority order: a comment is assigned to the
    # FIRST category for which it contains at least one keyword.
    _VIEW_CATEGORIES = {
        # Opinions about cost / pricing.
        "应用成本": (
            "收费", "贵", "便宜", "性价比", "会员", "订阅", "费用", "氪金",
            "省钱", "花钱", "成本高", "零成本", "低价", "高价", "价格",
            "免费", "付费",
        ),
        # Opinions about application scenarios.
        "应用领域": (
            "应用", "场景", "教育", "医疗", "创作", "办公", "学习", "编程",
            "写作", "翻译", "客服", "设计", "科研", "金融", "教学", "聊天",
            "助手", "工具", "领域", "行业",
        ),
        # Opinions about negative impact.
        # NOTE: "危害" and "伦理" must be separate entries; a missing comma
        # would silently merge them into one keyword via implicit string
        # literal concatenation.
        "不利影响": (
            "风险", "偏见", "错误", "误导", "依赖", "滥用", "漏洞", "安全",
            "泄露", "失控", "替代", "虚假", "弊端", "危害", "伦理", "隐私",
            "失业",
        ),
        # Opinions about positive impact.
        # NOTE: "便捷" and "方便" must be separate entries (same reason).
        "积极影响": (
            "高效", "省力", "智能", "好用", "厉害", "帮助", "提升", "优化",
            "进步", "解放", "神器", "给力", "便捷", "方便", "强大", "有用",
        ),
    }

    def __init__(self, danmukus):
        """Store the cleaned comments and load the stop-word set.

        :param danmukus: iterable of comment strings; each one is stripped
            of surrounding whitespace and blank ones are dropped.
        """
        stripped = (d.strip() for d in danmukus)
        self.danmukus = [d for d in stripped if d]
        # Stop words (intended for filtering out meaningless tokens).
        self.stopwords = self._load_stopwords()

    def _load_stopwords(self, path="stopwords.txt"):
        """Return the stop-word set: built-in defaults plus words from a file.

        The file is read as UTF-8 with one stop word per line; blank lines
        are ignored. Loading is best-effort: if the file cannot be opened or
        decoded, only the built-in defaults are returned.

        :param path: location of the extra stop-word file.
        :return: a new ``set`` of stop words.
        """
        stopwords = set(self._DEFAULT_STOPWORDS)
        try:
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    word = line.strip()
                    if word:  # skip blank lines so "" never becomes a stop word
                        stopwords.add(word)
        except (OSError, UnicodeDecodeError):
            # Missing/unreadable/mis-encoded file: fall back to the defaults
            # only (discard anything partially read for a consistent result).
            return set(self._DEFAULT_STOPWORDS)
        return stopwords

    def get_ai_related(self):
        """Select AI-related comments and count occurrences of each.

        :return: ``Counter`` mapping each distinct comment that contains at
            least one AI keyword to the number of times it appears.
        """
        return Counter(
            d for d in self.danmukus
            if any(keyword in d for keyword in self._AI_KEYWORDS)
        )

    def classify_views(self, ai_counter):
        """Aggregate AI-related comments into opinion categories.

        Each comment is assigned to the first category (in declaration
        order) whose keyword list contains a substring of the comment; a
        comment matching no category is counted under "其他观点".

        :param ai_counter: mapping of comment text to occurrence count, e.g.
            the ``Counter`` returned by :meth:`get_ai_related`.
        :return: dict mapping every category name (including "其他观点") to
            the total occurrence count of the comments assigned to it.
        """
        view_counts = {category: 0 for category in self._VIEW_CATEGORIES}
        view_counts[self._OTHER_CATEGORY] = 0

        for danmuku, count in ai_counter.items():
            # First matching category wins, so a comment is never counted twice.
            matched_category = next(
                (
                    category
                    for category, keywords in self._VIEW_CATEGORIES.items()
                    if any(keyword in danmuku for keyword in keywords)
                ),
                self._OTHER_CATEGORY,
            )
            view_counts[matched_category] += count

        return view_counts