You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

72 lines
2.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from crawler import VideoCrawler
from danmaku_fetcher import get_bvid_from_url, get_cid_from_api, fetch_danmaku_xml, parse_danmaku_xml
from cleaner import filter_and_tokenize
from analyzer import count_tokens, top_n
from exporter import export_to_excel
import time
# === Noise-filter keywords ===
# A danmaku whose full text is exactly equal to one of these entries is
# discarded by main() before tokenization (exact match, not substring match).
# NOTE(review): the two empty-string entries look like characters that were
# lost in transit (possibly emoji) — confirm against the original file.
NOISE_WORDS = {"666", "", "", "厉害", "哈哈", "hhh", "加油", "好看", "不错"}
# === Search keywords ===
# Each keyword is searched separately; the resulting video links are merged
# into one de-duplicated set in main().
KEYWORDS = ["大语言模型", "大模型", "LLM"]
def main(max_videos_per_keyword: int = 100, out_path: str = "result_300.xlsx") -> None:
    """Collect video links, fetch their danmaku, count tokens and export to Excel.

    Pipeline:
      1. For every entry in ``KEYWORDS``, ask a ``VideoCrawler`` for video
         links and merge them into one de-duplicated set.
      2. For every link, resolve the BV id and cid, download and parse the
         danmaku XML, and drop danmaku that exactly match ``NOISE_WORDS``.
      3. Tokenize the remaining danmaku and store one record per video.
      4. Count tokens over all records, print the top 10, and export the
         records with ``export_to_excel``.

    Videos whose BV id or cid cannot be resolved, or whose danmaku cannot be
    fetched/parsed, are reported on stdout and skipped.

    Args:
        max_videos_per_keyword: Passed as ``max_videos`` to
            ``VideoCrawler.search_and_collect`` for each keyword.
        out_path: Destination path handed to ``export_to_excel``.
    """
    all_records = []

    print("🚀 启动 Edge 爬虫,准备批量抓取视频链接...")
    crawler = VideoCrawler(headless=True)  # headless mode avoids window interference
    all_video_links = set()
    try:
        for kw in KEYWORDS:
            links = crawler.search_and_collect(kw, max_videos=max_videos_per_keyword)
            print(f"{kw} 获取到 {len(links)} 个视频链接")
            all_video_links.update(links)
    finally:
        # Always release the crawler, even if a search raises; otherwise the
        # underlying browser session would be left running.
        crawler.close()

    total = len(all_video_links)
    print(f"\n📦 共收集视频链接 {total} 个(去重后)\n")

    for i, url in enumerate(all_video_links, 1):
        print(f"📡 [{i}/{total}] 处理:{url}")

        bvid = get_bvid_from_url(url)
        if not bvid:
            print("❌ 无法解析 BV 号,跳过。")
            continue

        cid = get_cid_from_api(bvid)
        if not cid:
            print("❌ 无法获取 cid跳过。")
            continue

        # Best-effort per video: any failure while fetching, parsing or
        # filtering is reported and the video is skipped so that one bad
        # video does not abort the whole batch.
        try:
            xml_data = fetch_danmaku_xml(cid)
            danmakus = parse_danmaku_xml(xml_data)
            # Drop noise danmaku (exact match against NOISE_WORDS).
            danmakus = [d for d in danmakus if d not in NOISE_WORDS]
        except Exception as e:
            print(f"⚠️ 获取弹幕失败:{e}")
            continue
        print(f"💬 获取有效弹幕 {len(danmakus)} 条。")

        tokens = filter_and_tokenize(danmakus)
        all_records.append({
            "video_url": url,
            "danmaku": " ".join(danmakus),
            "tokens": tokens,
        })

    print(f"\n📊 共采集弹幕:{sum(len(r['tokens']) for r in all_records)} 个词语")
    token_counter = count_tokens(all_records)
    top_words = top_n(token_counter, n=10)
    print(f"🔥 Top10 热词: {top_words}")

    export_to_excel(all_records, out_path=out_path)
    print(f"✅ 已导出 {out_path}")
    print("\n🎉 所有任务已完成!")


# Run the full pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()