# Danmaku (bullet-comment) post-processing: noise filtering, statistics and
# Excel export. This is the code added by the patch "Update 弹幕.py"
# (commit 5829915), restored to conventional formatting.
#
# NOTE(review): fetch_danmakus() and get_top_videos_aids() appear in that patch
# only as partial diff context, so they are not reproduced here. The head of
# the ``if __name__ == "__main__":`` block (the crawl steps) is likewise not
# visible; its visible tail is exposed as run_analysis_pipeline() below.

import re
import sys
import time
from collections import Counter
from typing import Any, Dict, Iterable, List

# A danmaku needs at least this many characters (after stripping) to be kept.
_MIN_DANMU_LENGTH = 3

# Danmaku that START with one of these are treated as low-value chatter.
_NOISE_PREFIXES = (
    "666", "哈哈哈", "呜呜呜", "牛逼", "绝了",
    "前排", "沙发", "打卡", "来了", "加油",
    "点赞", "关注", "收藏", "投币", "三连",
    "精彩",
)

# CJK ideographs U+4E00..U+9FA5, ASCII letters and digits.
_MEANINGFUL_CHAR_RE = re.compile(r"[\u4e00-\u9fa5a-zA-Z0-9]")
# Runs of two or more such characters are counted as one "word".
_WORD_RE = re.compile(r"[\u4e00-\u9fa5a-zA-Z0-9]{2,}")
# Control characters that are not valid inside an XLSX string cell.
_EXCEL_ILLEGAL_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

_TOP_N = 8
_MAX_EXAMPLES = 3

# Opinion category -> keywords; a danmaku belongs to a category when it
# contains at least one of the keywords as a substring.
_OPINION_CATEGORIES = {
    "应用成本低": ["免费", "便宜", "低成本", "门槛低", "新手友好"],
    "应用成本高": ["付费", "贵", "高成本", "订阅费", "收费"],
    "潜在应用领域": ["办公", "学习", "创作", "编程", "翻译", "客服", "数据分析", "设计"],
    "正面影响": ["高效", "省事", "方便", "好用", "提升效率", "节省时间"],
    "不利影响": ["依赖", "失业", "不准确", "误导", "隐私泄露", "偏见"],
    "技术期待": ["更智能", "优化", "升级", "功能更强", "多模态"],
    "技术质疑": ["没用", "鸡肋", "不实用", "夸大", "炒作"],
}


# Module 2: danmaku noise filtering
def filter_noise(danmu_text: Any) -> bool:
    """Return True if the danmaku should be kept, False if it is noise.

    A danmaku is considered noise when, after surrounding whitespace is
    removed, it

    * is not a string at all,
    * has two characters or fewer,
    * starts with one of the chatter prefixes in ``_NOISE_PREFIXES``, or
    * contains no CJK ideograph, ASCII letter or digit.

    Args:
        danmu_text: Raw danmaku text.

    Returns:
        ``True`` for a meaningful danmaku, ``False`` for noise.
    """
    if not isinstance(danmu_text, str):
        return False

    # Strip first so padding cannot defeat the length and prefix checks.
    text = danmu_text.strip()
    if len(text) < _MIN_DANMU_LENGTH:
        return False

    if text.startswith(_NOISE_PREFIXES):
        return False

    return _MEANINGFUL_CHAR_RE.search(text) is not None


# Module 3: statistics
def analyze_danmu(danmu_list: Iterable[str]) -> Dict[str, Any]:
    """Compute frequency and opinion statistics for cleaned danmaku.

    Args:
        danmu_list: Cleaned danmaku texts. Any iterable is accepted; it is
            materialised once because it is traversed several times.

    Returns:
        A dict with the keys

        * ``"总有效弹幕数"``: number of danmaku,
        * ``"Top8高频弹幕"``: up to 8 ``(text, count)`` pairs,
        * ``"Top8高频词汇"``: up to 8 ``(word, count)`` pairs, where a word
          is a run of >= 2 CJK/alphanumeric characters,
        * ``"用户看法统计"``: per-category dict with mention count,
          percentage of all danmaku (0 when the input is empty) and up to
          three example danmaku.
    """
    danmu_list = list(danmu_list)
    total = len(danmu_list)

    all_words = [word for dm in danmu_list for word in _WORD_RE.findall(dm)]

    opinion_stats = {}
    for category, keywords in _OPINION_CATEGORIES.items():
        related = [dm for dm in danmu_list if any(kw in dm for kw in keywords)]
        opinion_stats[category] = {
            "提及次数": len(related),
            "占比(%)": round(len(related) / total * 100, 2) if total else 0,
            "相关弹幕示例": related[:_MAX_EXAMPLES],
        }

    return {
        "总有效弹幕数": total,
        "Top8高频弹幕": Counter(danmu_list).most_common(_TOP_N),
        "Top8高频词汇": Counter(all_words).most_common(_TOP_N),
        "用户看法统计": opinion_stats,
    }


def _set_text(cell: Any, value: Any) -> None:
    """Write crawled text into ``cell`` as a literal string.

    Danmaku are untrusted input: text beginning with ``=`` would otherwise be
    stored as a formula, and control characters are rejected by openpyxl.
    Control characters are removed and the cell type is forced to string.
    Non-string values are assigned unchanged.
    """
    if isinstance(value, str):
        cell.set_explicit_value(_EXCEL_ILLEGAL_CHARS_RE.sub("", value), data_type="s")
    else:
        cell.value = value


# Module 4: Excel export
def write_to_excel(analysis_result: Dict[str, Any], danmu_list: List[str], output_path: str) -> None:
    """Write the danmaku and their statistics to an ``.xlsx`` workbook.

    Four sheets are created: the cleaned danmaku, the Top-8 tables, the
    opinion statistics and a short overview with the analysis time.

    Args:
        analysis_result: Result dict as returned by ``analyze_danmu``.
        danmu_list: Cleaned danmaku texts, one per row of the first sheet.
        output_path: Destination path passed to ``Workbook.save``.
    """
    # Imported here so the pure analysis functions above remain importable
    # in environments where openpyxl is not installed.
    from openpyxl import Workbook
    from openpyxl.styles import Font, PatternFill

    header_fill = PatternFill(start_color="E6F3FF", end_color="E6F3FF", fill_type="solid")
    overview_fill = PatternFill(start_color="F0F8FF", end_color="F0F8FF", fill_type="solid")

    wb = Workbook()

    # Sheet 1: cleaned danmaku
    ws1 = wb.active
    ws1.title = "原始有效弹幕"
    ws1["A1"] = "弹幕内容"
    ws1["A1"].font = Font(bold=True)
    ws1["A1"].fill = header_fill
    for row, dm in enumerate(danmu_list, 2):
        _set_text(ws1[f"A{row}"], dm)
    ws1.column_dimensions["A"].width = 50

    # Sheet 2: Top-8 danmaku (columns A-C) and Top-8 words (columns E-G)
    ws2 = wb.create_sheet("Top8统计")
    tables = (
        ("A", "B", "C", "Top8高频弹幕", "弹幕内容", analysis_result["Top8高频弹幕"]),
        ("E", "F", "G", "Top8高频词汇", "词汇", analysis_result["Top8高频词汇"]),
    )
    for rank_col, text_col, count_col, title, text_header, entries in tables:
        ws2[f"{rank_col}1"] = title
        ws2[f"{rank_col}1"].font = Font(bold=True, size=12)
        ws2[f"{rank_col}1"].fill = header_fill
        ws2[f"{rank_col}2"] = "排名"
        ws2[f"{text_col}2"] = text_header
        ws2[f"{count_col}2"] = "出现次数"
        for rank, (text, count) in enumerate(entries, 1):
            row = rank + 2  # data starts on row 3, below title and header
            ws2[f"{rank_col}{row}"] = rank
            _set_text(ws2[f"{text_col}{row}"], text)
            ws2[f"{count_col}{row}"] = count
    ws2.column_dimensions["B"].width = 50
    ws2.column_dimensions["F"].width = 20

    # Sheet 3: opinion statistics
    ws3 = wb.create_sheet("用户看法统计")
    headers = ["看法类别", "提及次数", "占比(%)", "相关弹幕示例1", "相关弹幕示例2", "相关弹幕示例3"]
    for col, header in zip("ABCDEF", headers):
        ws3[f"{col}1"] = header
        ws3[f"{col}1"].font = Font(bold=True)
        ws3[f"{col}1"].fill = header_fill
    for row, (category, stats) in enumerate(analysis_result["用户看法统计"].items(), 2):
        ws3[f"A{row}"] = category
        ws3[f"B{row}"] = stats["提及次数"]
        ws3[f"C{row}"] = stats["占比(%)"]
        # Pad so there are always three example columns to fill.
        examples = list(stats["相关弹幕示例"]) + [""] * _MAX_EXAMPLES
        for col, example in zip("DEF", examples):
            _set_text(ws3[f"{col}{row}"], example)
    ws3.column_dimensions["A"].width = 15
    for col in "DEF":
        ws3.column_dimensions[col].width = 30

    # Sheet 4: overview
    ws4 = wb.create_sheet("统计概览")
    ws4["A1"] = "LLM弹幕分析概览"
    ws4["A1"].font = Font(bold=True, size=14)
    ws4["A2"] = "总有效弹幕数"
    ws4["B2"] = analysis_result["总有效弹幕数"]
    ws4["A3"] = "分析时间"
    ws4["B3"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    for col in ("A", "B"):
        for row in (2, 3):
            ws4[f"{col}{row}"].font = Font(bold=True)
            ws4[f"{col}{row}"].fill = overview_fill
    ws4.column_dimensions["A"].width = 20
    ws4.column_dimensions["B"].width = 30

    wb.save(output_path)
    print(f"\nExcel文件已保存至:{output_path}")


def run_analysis_pipeline(all_raw_danmus: List[str], output_path: str) -> Dict[str, Any]:
    """Run the post-crawl steps: filter noise, analyse, export to Excel.

    These are the statements that follow the crawl in the script's
    ``__main__`` block; call this once the raw danmaku have been collected.

    Args:
        all_raw_danmus: Raw danmaku texts gathered by the crawl.
        output_path: Path of the Excel file to write.

    Returns:
        The analysis result produced by ``analyze_danmu``.

    Raises:
        SystemExit: If there are no raw danmaku, or none survive filtering.
    """
    print(f"\n弹幕爬取完成,累计原始弹幕:{len(all_raw_danmus)}条")
    if not all_raw_danmus:
        print("警告:未爬取到任何原始弹幕,任务终止")
        # sys.exit() rather than the site-provided exit(), which is meant
        # for interactive sessions and is not guaranteed to exist.
        sys.exit()

    # Step 3: drop noise danmaku
    print("\n【步骤3/6】过滤噪声弹幕...")
    # Keep the stripped text so identical danmaku that differ only in
    # surrounding whitespace are counted together.
    all_cleaned_danmu = [dm.strip() for dm in all_raw_danmus if filter_noise(dm)]
    print(f"噪声过滤完成,有效弹幕:{len(all_cleaned_danmu)}条")
    if not all_cleaned_danmu:
        print("警告:过滤后无有效弹幕,任务终止")
        sys.exit()

    # Step 4: statistics
    print("\n【步骤4/6】进行数据分析...")
    analysis_result = analyze_danmu(all_cleaned_danmu)

    # Step 5: Excel export
    print("\n【步骤5/6】写入Excel文件...")
    write_to_excel(analysis_result, all_cleaned_danmu, output_path)
    return analysis_result