# 05230137/main.py

from crawler import VideoCrawler
from danmaku_fetcher import get_bvid_from_url, get_cid_from_api, fetch_danmaku_xml, parse_danmaku_xml
from cleaner import filter_and_tokenize
from analyzer import count_tokens, top_n
from exporter import export_to_excel
import time

# === Noise filter: danmaku that exactly equal one of these are dropped ===
NOISE_WORDS = {
    "666",
    "牛",
    "赞",
    "厉害",
    "哈哈",
    "hhh",
    "加油",
    "好看",
    "不错",
}

# === Search keywords: each one is searched separately by the crawler ===
KEYWORDS = [
    "大语言模型",
    "大模型",
    "LLM",
]

def _collect_video_links(keywords, max_videos):
    """Search every keyword and return the de-duplicated video links.

    Links are returned as a list in first-seen order, so the processing
    (and export) order follows the crawler's result order instead of the
    arbitrary iteration order of a ``set``.

    The crawler's ``close()`` is called even when a search raises.
    """
    print("🚀 启动 Edge 爬虫，准备批量抓取视频链接...")
    # Headless mode is recommended so no browser window gets in the way.
    crawler = VideoCrawler(headless=True)

    # A dict is used as an ordered set: keys are unique, insertion order kept.
    unique_links = {}
    try:
        for kw in keywords:
            links = crawler.search_and_collect(kw, max_videos=max_videos)
            print(f"✅ {kw} 获取到 {len(links)} 个视频链接")
            unique_links.update(dict.fromkeys(links))
    finally:
        crawler.close()

    return list(unique_links)


def _fetch_danmakus(url):
    """Return the noise-filtered danmaku list for one video URL.

    Returns ``None`` (after printing the reason) when the video has to be
    skipped: the BV id cannot be parsed, the cid lookup yields nothing, or
    fetching/parsing the danmaku raises. An empty list is a valid result and
    is distinct from ``None``.
    """
    bvid = get_bvid_from_url(url)
    if not bvid:
        print("❌ 无法解析 BV 号，跳过。")
        return None

    cid = get_cid_from_api(bvid)
    if not cid:
        print("❌ 无法获取 cid，跳过。")
        return None

    try:
        xml_data = fetch_danmaku_xml(cid)
        # Drop danmaku that are exactly one of the configured noise phrases.
        danmakus = [d for d in parse_danmaku_xml(xml_data) if d not in NOISE_WORDS]
    except Exception as e:
        # Deliberately broad: one failing video must not abort the whole batch.
        print(f"⚠️ 获取弹幕失败：{e}")
        return None

    print(f"💬 获取有效弹幕 {len(danmakus)} 条。")
    return danmakus


def main(max_videos_per_keyword: int = 100, out_path: str = "result_300.xlsx") -> None:
    """Run the whole pipeline: crawl, fetch danmaku, tokenize, count, export.

    Args:
        max_videos_per_keyword: Upper bound passed to the crawler for each
            entry in ``KEYWORDS``.
        out_path: Path passed to ``export_to_excel`` for the result file.

    Videos whose danmaku cannot be retrieved are skipped; the remaining
    records are tokenized, the ten most frequent tokens are printed, and all
    records are exported to ``out_path``.
    """
    video_links = _collect_video_links(KEYWORDS, max_videos_per_keyword)
    total = len(video_links)
    print(f"\n📦 共收集视频链接 {total} 个（去重后）\n")

    all_records = []
    for i, url in enumerate(video_links, 1):
        print(f"📡 [{i}/{total}] 处理：{url}")
        danmakus = _fetch_danmakus(url)
        if danmakus is None:
            continue

        all_records.append({
            "video_url": url,
            "danmaku": " ".join(danmakus),
            "tokens": filter_and_tokenize(danmakus),
        })

    print(f"\n📊 共采集弹幕：{sum(len(r['tokens']) for r in all_records)} 个词语")

    token_counter = count_tokens(all_records)
    top_words = top_n(token_counter, n=10)
    print(f"🔥 Top10 热词: {top_words}")

    # Use the same value for the export and the message so they cannot drift.
    export_to_excel(all_records, out_path=out_path)
    print(f"✅ 已导出 {out_path}")

    print("\n🎉 所有任务已完成！")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()