Update 弹幕.py

main
fzu102301128 1 month ago
parent b193430ef3
commit 5829915597

@ -68,7 +68,7 @@ def fetch_danmakus(aid):
return []
def get_top_videos_aids(keyword, max_videos=120):
"""根据关键词获取综合排序前N条视频的AID"""
"""根据关键词获取综合排序前120条视频的AID"""
aids = []
page = 1
page_size = 30
@ -120,6 +120,159 @@ def get_top_videos_aids(keyword, max_videos=120):
return aids[:max_videos] # 确保不超过最大数量
# Module 2: danmaku noise filtering
# Prefixes that mark a danmaku as low-value chatter (cheering, check-ins,
# like/follow/coin prompts). Anything that starts with one of them is noise.
_NOISE_PREFIXES = (
    "666", "哈哈哈", "呜呜呜", "牛逼", "绝了",
    "前排", "沙发", "打卡", "来了", "加油",
    "点赞", "关注", "收藏", "投币", "三连",
    "精彩",
)

# A danmaku must contain at least one CJK ideograph, ASCII letter or digit;
# otherwise it is only punctuation/symbols.
_MEANINGFUL_CHAR_RE = re.compile(r"[一-龥a-zA-Z0-9]")


def filter_noise(danmu_text):
    """Decide whether a danmaku should be kept for analysis.

    A danmaku is rejected when, after trimming surrounding whitespace, it

    1. is at most 2 characters long,
    2. starts with one of the known noise prefixes, or
    3. contains no CJK ideograph, ASCII letter or digit.

    Args:
        danmu_text: Raw danmaku text. Non-string values are treated as noise.

    Returns:
        True if the danmaku is considered meaningful, False if it is noise.
    """
    if not isinstance(danmu_text, str):
        return False

    # Trim first: otherwise padding inflates the length and hides a noise
    # prefix from the start-of-string check (e.g. "  哈哈哈").
    text = danmu_text.strip()

    if len(text) <= 2:
        return False
    if text.startswith(_NOISE_PREFIXES):
        return False
    return _MEANINGFUL_CHAR_RE.search(text) is not None
# Module 3: data statistics
def analyze_danmu(danmu_list):
    """Compute frequency statistics and opinion categories for danmaku.

    Args:
        danmu_list: List of danmaku strings (already noise-filtered by the
            caller in this file).

    Returns:
        A dict with four entries:

        * ``"总有效弹幕数"``: number of danmaku analysed.
        * ``"Top8高频弹幕"``: up to 8 ``(text, count)`` pairs for the most
          frequent whole danmaku.
        * ``"Top8高频词汇"``: up to 8 ``(token, count)`` pairs for the most
          frequent tokens.
        * ``"用户看法统计"``: per category, a dict holding the number of
          matching danmaku (``"提及次数"``), their share in percent
          (``"占比(%)"``) and up to 3 sample danmaku (``"相关弹幕示例"``).
    """
    # A "token" is any run of >= 2 CJK ideographs / ASCII letters / digits.
    # NOTE: this is not real word segmentation; an unbroken Chinese sentence
    # counts as a single token.
    token_re = re.compile(r"[一-龥a-zA-Z0-9]{2,}")
    word_counts = Counter()
    for dm in danmu_list:
        word_counts.update(token_re.findall(dm))

    top8_danmu = Counter(danmu_list).most_common(8)
    top8_words = word_counts.most_common(8)

    # Keyword lists used for substring matching against each danmaku.
    opinion_categories = {
        "应用成本低": ["免费", "便宜", "低成本", "门槛低", "新手友好"],
        "应用成本高": ["付费", "高成本", "订阅费", "收费"],
        "潜在应用领域": ["办公", "学习", "创作", "编程", "翻译", "客服", "数据分析", "设计"],
        "正面影响": ["高效", "省事", "方便", "好用", "提升效率", "节省时间"],
        "不利影响": ["依赖", "失业", "不准确", "误导", "隐私泄露", "偏见"],
        "技术期待": ["更智能", "优化", "升级", "功能更强", "多模态"],
        "技术质疑": ["没用", "鸡肋", "不实用", "夸大", "炒作"],
    }

    total = len(danmu_list)
    opinion_stats = {}
    for category, keywords in opinion_categories.items():
        # An empty keyword is a substring of every string and would make the
        # category match 100% of danmaku, so empty entries are ignored.
        effective = [kw for kw in keywords if kw]
        related_danmu = [
            dm for dm in danmu_list if any(kw in dm for kw in effective)
        ]
        count = len(related_danmu)
        opinion_stats[category] = {
            "提及次数": count,
            # Guard against division by zero for an empty input list.
            "占比(%)": round(count / total * 100, 2) if total else 0,
            "相关弹幕示例": related_danmu[:3],
        }

    return {
        "总有效弹幕数": total,
        "Top8高频弹幕": top8_danmu,
        "Top8高频词汇": top8_words,
        "用户看法统计": opinion_stats,
    }
# Module 4: Excel export
# ASCII control characters that openpyxl refuses to store in a cell
# (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F). Mapping them to None deletes them
# when the table is passed to str.translate.
_XLSX_ILLEGAL_CHARS = dict.fromkeys(
    [*range(0x00, 0x09), 0x0B, 0x0C, *range(0x0E, 0x20)]
)

_HEADER_FILL_COLOR = "E6F3FF"
_OVERVIEW_FILL_COLOR = "F0F8FF"


def _style_header(cell, fill_color=None, **font_kwargs):
    """Make a cell bold and, if a colour is given, fill it with a solid colour."""
    cell.font = Font(bold=True, **font_kwargs)
    if fill_color is not None:
        cell.fill = PatternFill(
            start_color=fill_color, end_color=fill_color, fill_type="solid"
        )


def _write_text(ws, ref, text):
    """Write scraped text into ws[ref] as a literal string."""
    cell = ws[ref]
    if not isinstance(text, str):
        cell.value = text
        return
    # Control characters would make openpyxl raise and abort the export.
    cell.value = text.translate(_XLSX_ILLEGAL_CHARS)
    # openpyxl stores strings starting with "=" as formulas; danmaku such as
    # "=====" are plain text, so force the string data type.
    if cell.value.startswith("="):
        cell.data_type = "s"


def _write_raw_sheet(ws, danmu_list):
    """Fill the sheet listing every kept danmaku, one per row."""
    ws.title = "原始有效弹幕"
    ws["A1"] = "弹幕内容"
    _style_header(ws["A1"], _HEADER_FILL_COLOR)
    for row, dm in enumerate(danmu_list, 2):
        _write_text(ws, f"A{row}", dm)
    ws.column_dimensions["A"].width = 50


def _write_ranked_table(ws, columns, title, value_header, ranked):
    """Write a title, a header row and ranked (text, count) rows.

    ``columns`` holds the three column letters for rank, text and count.
    """
    rank_col, value_col, count_col = columns
    ws[f"{rank_col}1"] = title
    _style_header(ws[f"{rank_col}1"], _HEADER_FILL_COLOR, size=12)
    ws[f"{rank_col}2"] = "排名"
    ws[f"{value_col}2"] = value_header
    ws[f"{count_col}2"] = "出现次数"
    for rank, (text, count) in enumerate(ranked, 1):
        row = rank + 2  # data starts below the title and header rows
        ws[f"{rank_col}{row}"] = rank
        _write_text(ws, f"{value_col}{row}", text)
        ws[f"{count_col}{row}"] = count


def _write_opinion_sheet(ws, opinion_stats):
    """Fill the per-category opinion statistics sheet."""
    headers = [
        "看法类别", "提及次数", "占比(%)",
        "相关弹幕示例1", "相关弹幕示例2", "相关弹幕示例3",
    ]
    for col, header in zip("ABCDEF", headers):
        ws[f"{col}1"] = header
        _style_header(ws[f"{col}1"], _HEADER_FILL_COLOR)
    for row, (category, stats) in enumerate(opinion_stats.items(), 2):
        ws[f"A{row}"] = category
        ws[f"B{row}"] = stats["提及次数"]
        ws[f"C{row}"] = stats["占比(%)"]
        # Pad with empty strings so all three example columns are written.
        examples = (stats["相关弹幕示例"] + [""] * 3)[:3]
        for col, example in zip("DEF", examples):
            _write_text(ws, f"{col}{row}", example)
    ws.column_dimensions["A"].width = 15
    for col in "DEF":
        ws.column_dimensions[col].width = 30


def _write_overview_sheet(ws, total):
    """Fill the overview sheet with the danmaku total and a timestamp."""
    ws["A1"] = "LLM弹幕分析概览"
    _style_header(ws["A1"], size=14)
    ws["A2"] = "总有效弹幕数"
    ws["B2"] = total
    ws["A3"] = "分析时间"
    ws["B3"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    for col in ("A", "B"):
        for row in (2, 3):
            _style_header(ws[f"{col}{row}"], _OVERVIEW_FILL_COLOR)
    ws.column_dimensions["A"].width = 20
    ws.column_dimensions["B"].width = 30


def write_to_excel(analysis_result, danmu_list, output_path):
    """Export the danmaku and their analysis to a four-sheet Excel workbook.

    Sheets, in order: the kept danmaku, the Top-8 danmaku/token tables,
    the per-category opinion statistics, and an overview holding the total
    count and the local time of the export.

    Args:
        analysis_result: Dict with the keys ``"总有效弹幕数"``,
            ``"Top8高频弹幕"``, ``"Top8高频词汇"`` and ``"用户看法统计"``,
            in the shape returned by ``analyze_danmu``.
        danmu_list: Danmaku strings written to the first sheet.
        output_path: Destination passed to ``Workbook.save``.
    """
    wb = Workbook()

    _write_raw_sheet(wb.active, danmu_list)

    ws_top = wb.create_sheet("Top8统计")
    _write_ranked_table(
        ws_top, ("A", "B", "C"), "Top8高频弹幕", "弹幕内容",
        analysis_result["Top8高频弹幕"],
    )
    _write_ranked_table(
        ws_top, ("E", "F", "G"), "Top8高频词汇", "词汇",
        analysis_result["Top8高频词汇"],
    )
    ws_top.column_dimensions["B"].width = 50
    ws_top.column_dimensions["F"].width = 20

    _write_opinion_sheet(
        wb.create_sheet("用户看法统计"), analysis_result["用户看法统计"]
    )
    _write_overview_sheet(
        wb.create_sheet("统计概览"), analysis_result["总有效弹幕数"]
    )

    wb.save(output_path)
    print(f"\nExcel文件已保存至{output_path}")
if __name__ == "__main__":
print("="*50)
print("开始B站LLM相关弹幕分析任务AID爬取版")
@ -153,4 +306,19 @@ if __name__ == "__main__":
print(f"\n弹幕爬取完成,累计原始弹幕:{len(all_raw_danmus)}")
if len(all_raw_danmus) == 0:
print("警告:未爬取到任何原始弹幕,任务终止")
exit()
exit()
# 步骤3过滤噪声弹幕
print("\n【步骤3/6】过滤噪声弹幕...")
all_cleaned_danmu = [dm for dm in all_raw_danmus if filter_noise(dm)]
print(f"噪声过滤完成,有效弹幕:{len(all_cleaned_danmu)}")
if len(all_cleaned_danmu) == 0:
print("警告:过滤后无有效弹幕,任务终止")
exit()
# 步骤4数据分析
print("\n【步骤4/6】进行数据分析...")
analysis_result = analyze_danmu(all_cleaned_danmu)
# 步骤5写入Excel
print("\n【步骤5/6】写入Excel文件...")
write_to_excel(analysis_result, all_cleaned_danmu, OUTPUT_EXCEL)
Loading…
Cancel
Save