|
|
from crawler import VideoCrawler
|
|
|
from danmaku_fetcher import get_bvid_from_url, get_cid_from_api, fetch_danmaku_xml, parse_danmaku_xml
|
|
|
from cleaner import filter_and_tokenize
|
|
|
from analyzer import count_tokens, top_n
|
|
|
from exporter import export_to_excel
|
|
|
import time
|
|
|
|
|
|
# === Noise-filter keywords ===
# A danmaku whose entire text equals one of these entries is dropped by main()
# before tokenization.
NOISE_WORDS = {
    "666",
    "牛",
    "赞",
    "厉害",
    "哈哈",
    "hhh",
    "加油",
    "好看",
    "不错",
}

# === Search keywords ===
# Each keyword is searched separately; the resulting links are merged and
# de-duplicated by main().
KEYWORDS = ["大语言模型", "大模型", "LLM"]
|
|
|
|
|
|
def main(max_videos_per_keyword=100, out_path="result_300.xlsx"):
    """Collect video links, fetch their danmaku, and export a summary workbook.

    Steps, in order:
      1. Search every entry of ``KEYWORDS`` with a ``VideoCrawler`` and merge
         the returned links into one de-duplicated set.
      2. For each link, resolve the BV id and cid, download and parse the
         danmaku XML, drop entries listed in ``NOISE_WORDS``, and tokenize the
         remainder.
      3. Count tokens across all videos, print the top 10, and write all
         records to ``out_path`` via ``export_to_excel``.

    A video that fails at any stage of step 2 is reported and skipped; it does
    not abort the run.

    Args:
        max_videos_per_keyword: Value forwarded as ``max_videos`` to
            ``VideoCrawler.search_and_collect`` for every keyword.
        out_path: Destination path handed to ``export_to_excel``.
    """
    all_records = []

    print("🚀 启动 Edge 爬虫,准备批量抓取视频链接...")
    # Headless mode is recommended so that no browser window gets in the way.
    c = VideoCrawler(headless=True)

    all_video_links = set()
    try:
        for kw in KEYWORDS:
            links = c.search_and_collect(kw, max_videos=max_videos_per_keyword)
            print(f"✅ {kw} 获取到 {len(links)} 个视频链接")
            all_video_links.update(links)
    finally:
        # Release the crawler even when one of the searches raises.
        c.close()

    total_links = len(all_video_links)
    print(f"\n📦 共收集视频链接 {total_links} 个(去重后)\n")

    for i, url in enumerate(all_video_links, 1):
        print(f"📡 [{i}/{total_links}] 处理:{url}")

        bvid = get_bvid_from_url(url)
        if not bvid:
            print("❌ 无法解析 BV 号,跳过。")
            continue

        cid = get_cid_from_api(bvid)
        if not cid:
            print("❌ 无法获取 cid,跳过。")
            continue

        # Per-video boundary: any failure while fetching or parsing is
        # reported and the video is skipped (deliberate best-effort).
        try:
            xml_data = fetch_danmaku_xml(cid)
            danmakus = parse_danmaku_xml(xml_data)
            # Drop noise danmaku (exact matches against NOISE_WORDS).
            danmakus = [d for d in danmakus if d not in NOISE_WORDS]
            print(f"💬 获取有效弹幕 {len(danmakus)} 条。")
        except Exception as e:
            print(f"⚠️ 获取弹幕失败:{e}")
            continue

        tokens = filter_and_tokenize(danmakus)
        all_records.append({
            "video_url": url,
            "danmaku": " ".join(danmakus),
            "tokens": tokens,
        })

    print(f"\n📊 共采集弹幕:{sum(len(r['tokens']) for r in all_records)} 个词语")

    token_counter = count_tokens(all_records)
    top_words = top_n(token_counter, n=10)
    print(f"🔥 Top10 热词: {top_words}")

    export_to_excel(all_records, out_path=out_path)
    print(f"✅ 已导出 {out_path}")

    print("\n🎉 所有任务已完成!")
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|