|
|
|
|
@ -68,7 +68,7 @@ def fetch_danmakus(aid):
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
def get_top_videos_aids(keyword, max_videos=120):
|
|
|
|
|
"""根据关键词获取综合排序前N条视频的AID"""
|
|
|
|
|
"""根据关键词获取综合排序前120条视频的AID"""
|
|
|
|
|
aids = []
|
|
|
|
|
page = 1
|
|
|
|
|
page_size = 30
|
|
|
|
|
@ -120,6 +120,159 @@ def get_top_videos_aids(keyword, max_videos=120):
|
|
|
|
|
|
|
|
|
|
return aids[:max_videos] # 确保不超过最大数量
|
|
|
|
|
|
|
|
|
|
# 模块2:弹幕噪声过滤
|
|
|
|
|
def filter_noise(danmu_text):
    """Decide whether a danmaku (bullet comment) is worth keeping.

    The comment is judged after stripping surrounding whitespace, so padding
    cannot carry a short or low-value comment past the filter. A comment is
    rejected as noise when any of the following holds:

    1. it is not a string, or it is at most two characters long;
    2. it starts with one of the low-value prefixes listed below (cheering,
       check-ins, like / follow / coin requests and similar);
    3. it contains no character in the range U+4E00-U+9FA5, no ASCII letter
       and no ASCII digit, i.e. it is made of symbols only.

    Args:
        danmu_text: Raw danmaku text.

    Returns:
        bool: True when the comment should be kept, False when it is noise.
    """
    # Anything that is not text cannot be analysed; drop it instead of
    # raising from len() / re further down.
    if not isinstance(danmu_text, str):
        return False

    text = danmu_text.strip()

    # 1. Too short to carry an opinion.
    if len(text) <= 2:
        return False

    # 2. Low-value openers. A plain prefix test also covers comments that
    #    contain a line break, which a "^prefix.*$" regex would not match.
    noise_prefixes = (
        "666", "哈哈哈", "呜呜呜", "牛逼", "绝了",
        "前排", "沙发", "打卡", "来了", "加油",
        "点赞", "关注", "收藏", "投币", "三连",
        "精彩",
    )
    if text.startswith(noise_prefixes):
        return False

    # 3. Keep only comments with at least one ideograph, letter or digit.
    return re.search(r"[一-龥a-zA-Z0-9]", text) is not None
|
|
|
|
|
# 模块3:数据统计
|
|
|
|
|
def analyze_danmu(danmu_list):
    """Compute frequency and opinion statistics for a list of danmaku texts.

    Args:
        danmu_list: List of danmaku strings.

    Returns:
        dict with four entries:
            "总有效弹幕数": number of items in ``danmu_list``;
            "Top8高频弹幕": up to 8 ``(text, count)`` pairs, most frequent
                full comments first;
            "Top8高频词汇": up to 8 ``(token, count)`` pairs, where a token
                is a run of two or more ideographs / ASCII letters / digits;
            "用户看法统计": per opinion category, the number of comments
                containing any of its keywords, that number as a percentage
                of all comments (0 for an empty list), and up to three
                matching comments.
    """
    # Tokens are maximal runs (length >= 2) of ideographs, letters or digits.
    token_pattern = re.compile(r"[一-龥a-zA-Z0-9]{2,}")
    token_counter = Counter(
        token
        for text in danmu_list
        for token in token_pattern.findall(text)
    )
    text_counter = Counter(danmu_list)

    # Keyword lists; a comment belongs to a category when it contains any
    # one of the category's keywords as a substring.
    opinion_categories = {
        "应用成本低": ["免费", "便宜", "低成本", "门槛低", "新手友好"],
        "应用成本高": ["付费", "贵", "高成本", "订阅费", "收费"],
        "潜在应用领域": ["办公", "学习", "创作", "编程", "翻译", "客服", "数据分析", "设计"],
        "正面影响": ["高效", "省事", "方便", "好用", "提升效率", "节省时间"],
        "不利影响": ["依赖", "失业", "不准确", "误导", "隐私泄露", "偏见"],
        "技术期待": ["更智能", "优化", "升级", "功能更强", "多模态"],
        "技术质疑": ["没用", "鸡肋", "不实用", "夸大", "炒作"],
    }

    total = len(danmu_list)
    opinion_stats = {}
    for label, terms in opinion_categories.items():
        matches = [
            text for text in danmu_list
            if any(term in text for term in terms)
        ]
        hits = len(matches)
        if danmu_list:
            share = round(hits / total * 100, 2)
        else:
            share = 0  # avoid dividing by zero on an empty input
        opinion_stats[label] = {
            "提及次数": hits,
            "占比(%)": share,
            "相关弹幕示例": matches[:3],
        }

    return {
        "总有效弹幕数": total,
        "Top8高频弹幕": text_counter.most_common(8),
        "Top8高频词汇": token_counter.most_common(8),
        "用户看法统计": opinion_stats,
    }
|
|
|
|
|
|
|
|
|
|
# 模块4:Excel写入
|
|
|
|
|
def write_to_excel(analysis_result, danmu_list, output_path):
    """Write the cleaned danmaku and their statistics to an Excel workbook.

    Four sheets are produced, in this order:
        1. "原始有效弹幕": one danmaku per row under a styled header;
        2. "Top8统计": the most frequent comments (columns A-C) next to the
           most frequent tokens (columns E-G);
        3. "用户看法统计": one row per opinion category with its count,
           percentage and up to three sample comments;
        4. "统计概览": total comment count and the local time of the export.

    Args:
        analysis_result: Mapping read through the keys "总有效弹幕数",
            "Top8高频弹幕", "Top8高频词汇" and "用户看法统计".
        danmu_list: Danmaku texts written to the first sheet.
        output_path: Destination handed to ``Workbook.save``.
    """
    # Style objects are created once and reused for every cell that needs them.
    bold = Font(bold=True)
    section_font = Font(bold=True, size=12)
    header_fill = PatternFill(start_color="E6F3FF", end_color="E6F3FF", fill_type="solid")
    overview_fill = PatternFill(start_color="F0F8FF", end_color="F0F8FF", fill_type="solid")

    def write_ranking(sheet, first_col, title, item_label, ranked):
        # Lay out a three-column ranking table: title in row 1, column
        # captions in row 2, one ranked entry per row from row 3 onwards.
        title_cell = sheet.cell(row=1, column=first_col, value=title)
        title_cell.font = section_font
        title_cell.fill = header_fill
        for offset, caption in enumerate(("排名", item_label, "出现次数")):
            sheet.cell(row=2, column=first_col + offset, value=caption)
        for rank, (item, freq) in enumerate(ranked, start=1):
            for offset, value in enumerate((rank, item, freq)):
                sheet.cell(row=rank + 2, column=first_col + offset, value=value)

    book = Workbook()

    # Sheet 1: the cleaned danmaku, one per row.
    raw_sheet = book.active
    raw_sheet.title = "原始有效弹幕"
    raw_header = raw_sheet.cell(row=1, column=1, value="弹幕内容")
    raw_header.font = bold
    raw_header.fill = header_fill
    for row_no, text in enumerate(danmu_list, start=2):
        raw_sheet.cell(row=row_no, column=1, value=text)
    raw_sheet.column_dimensions["A"].width = 50

    # Sheet 2: two side-by-side rankings (columns A-C and E-G).
    top_sheet = book.create_sheet("Top8统计")
    write_ranking(top_sheet, 1, "Top8高频弹幕", "弹幕内容", analysis_result["Top8高频弹幕"])
    write_ranking(top_sheet, 5, "Top8高频词汇", "词汇", analysis_result["Top8高频词汇"])
    top_sheet.column_dimensions["B"].width = 50
    top_sheet.column_dimensions["F"].width = 20

    # Sheet 3: opinion categories.
    opinion_sheet = book.create_sheet("用户看法统计")
    captions = (
        "看法类别", "提及次数", "占比(%)",
        "相关弹幕示例1", "相关弹幕示例2", "相关弹幕示例3",
    )
    for col_no, caption in enumerate(captions, start=1):
        caption_cell = opinion_sheet.cell(row=1, column=col_no, value=caption)
        caption_cell.font = bold
        caption_cell.fill = header_fill
    opinion_rows = analysis_result["用户看法统计"].items()
    for row_no, (category, stats) in enumerate(opinion_rows, start=2):
        row_values = [category, stats["提及次数"], stats["占比(%)"]]
        # Pad with empty strings so all three sample columns get a value.
        row_values += (stats["相关弹幕示例"] + [""] * 3)[:3]
        for col_no, value in enumerate(row_values, start=1):
            opinion_sheet.cell(row=row_no, column=col_no, value=value)
    for letter, width in (("A", 15), ("D", 30), ("E", 30), ("F", 30)):
        opinion_sheet.column_dimensions[letter].width = width

    # Sheet 4: overview.
    overview = book.create_sheet("统计概览")
    overview.cell(row=1, column=1, value="LLM弹幕分析概览").font = Font(bold=True, size=14)
    summary = (
        ("总有效弹幕数", analysis_result["总有效弹幕数"]),
        ("分析时间", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())),
    )
    for row_no, pair in enumerate(summary, start=2):
        for col_no, value in enumerate(pair, start=1):
            summary_cell = overview.cell(row=row_no, column=col_no, value=value)
            summary_cell.font = bold
            summary_cell.fill = overview_fill
    overview.column_dimensions["A"].width = 20
    overview.column_dimensions["B"].width = 30

    book.save(output_path)
    print(f"\nExcel文件已保存至:{output_path}")
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
print("="*50)
|
|
|
|
|
print("开始B站LLM相关弹幕分析任务(AID爬取版)")
|
|
|
|
|
@ -153,4 +306,19 @@ if __name__ == "__main__":
|
|
|
|
|
print(f"\n弹幕爬取完成,累计原始弹幕:{len(all_raw_danmus)}条")
|
|
|
|
|
if len(all_raw_danmus) == 0:
|
|
|
|
|
print("警告:未爬取到任何原始弹幕,任务终止")
|
|
|
|
|
exit()
|
|
|
|
|
exit()
|
|
|
|
|
# 步骤3:过滤噪声弹幕
|
|
|
|
|
print("\n【步骤3/6】过滤噪声弹幕...")
|
|
|
|
|
all_cleaned_danmu = [dm for dm in all_raw_danmus if filter_noise(dm)]
|
|
|
|
|
print(f"噪声过滤完成,有效弹幕:{len(all_cleaned_danmu)}条")
|
|
|
|
|
if len(all_cleaned_danmu) == 0:
|
|
|
|
|
print("警告:过滤后无有效弹幕,任务终止")
|
|
|
|
|
exit()
|
|
|
|
|
|
|
|
|
|
# 步骤4:数据分析
|
|
|
|
|
print("\n【步骤4/6】进行数据分析...")
|
|
|
|
|
analysis_result = analyze_danmu(all_cleaned_danmu)
|
|
|
|
|
|
|
|
|
|
# 步骤5:写入Excel
|
|
|
|
|
print("\n【步骤5/6】写入Excel文件...")
|
|
|
|
|
write_to_excel(analysis_result, all_cleaned_danmu, OUTPUT_EXCEL)
|