|
|
# tests/test_project.py
|
|
|
import os
|
|
|
import tempfile
|
|
|
import pytest
|
|
|
from danmaku_fetcher import get_bvid_from_url, parse_danmaku_xml
|
|
|
from cleaner import is_noise, filter_and_tokenize
|
|
|
from analyzer import count_tokens, top_n
|
|
|
from exporter import export_to_excel
|
|
|
|
|
|
# 1. 测试 BV 号解析
|
|
|
def test_get_bvid_from_url_ok():
    """A canonical bilibili video URL yields its BV identifier."""
    video_url = "https://www.bilibili.com/video/BV1xx411c7mD?spm_id_from=333.999.0.0"
    bvid = get_bvid_from_url(video_url)
    assert bvid == "BV1xx411c7mD"
|
|
|
|
|
|
def test_get_bvid_from_url_none():
    """A URL whose path carries no BV id produces None."""
    assert get_bvid_from_url("https://www.bilibili.com/video/12345") is None
|
|
|
|
|
|
# 2. 测试解析简单弹幕 XML
|
|
|
SAMPLE_XML = """<?xml version="1.0"?>
|
|
|
<i>
|
|
|
<d p="1,2,3,4,5,6">Hello world</d>
|
|
|
<d p="1,2,3,4,5,6">666</d>
|
|
|
<d p="1,2,3,4,5,6">OpenAI好强</d>
|
|
|
</i>"""
|
|
|
|
|
|
def test_parse_danmaku_xml():
    """parse_danmaku_xml extracts the text of every <d> element in the fixture."""
    extracted = parse_danmaku_xml(SAMPLE_XML)
    for expected in ("Hello world", "666", "OpenAI好强"):
        assert expected in extracted
|
|
|
|
|
|
# 3. 测试 is_noise 过滤
|
|
|
def test_is_noise_666():
    """Pure-digit spam is classified as noise."""
    verdict = is_noise("666")
    assert verdict
|
|
|
def test_is_noise_ha():
    """Laughter-only danmaku is classified as noise."""
    verdict = is_noise("哈哈哈哈")
    assert verdict
|
|
|
def test_is_noise_ok():
    """A substantive comment is NOT classified as noise."""
    verdict = is_noise("ChatGPT 太强了")
    assert not verdict
|
|
|
|
|
|
# 4. 测试 filter_and_tokenize
|
|
|
def test_filter_and_tokenize_basic():
    """filter_and_tokenize drops noise entries and tokenizes the rest.

    Fix: the original comment promised a non-empty result ("长度>0") but never
    asserted it; that check is now explicit. The redundant `"大语言模型" in t`
    clause is dropped — it is implied by `"大语言" in t`.
    """
    input_danmakus = ["666", "ChatGPT 很强", "哈哈", "大语言模型值得关注"]
    tokens = filter_and_tokenize(input_danmakus)
    # Non-noise input must survive filtering.
    assert len(tokens) > 0
    # Tokens should include "ChatGPT" or a "大语言"-prefixed segment.
    assert any("ChatGPT" in t or "大语言" in t for t in tokens)
|
|
|
|
|
|
# 5. 测试 count_tokens / top_n
|
|
|
def test_count_tokens_and_top_n():
    """count_tokens aggregates token frequencies across records; top_n ranks them."""
    records = [
        {"tokens": ["大语言模型", "AI", "大语言模型"]},
        {"tokens": ["AI", "聊天", "大语言模型"]},
    ]
    counts = count_tokens(records)
    # "大语言模型" appears twice in the first record and once in the second.
    assert counts["大语言模型"] == 3
    ranked = top_n(counts, n=2)
    assert ranked[0][0] == "大语言模型"
|
|
|
|
|
|
# 6. 测试 exporter 导出 (写临时文件)
|
|
|
def test_export_to_excel(tmp_path):
    """export_to_excel writes an .xlsx file at the requested path."""
    sample_records = [
        {"video_url": "http://a", "danmaku": "Hello", "tokens": ["hello"]},
        {"video_url": "http://b", "danmaku": "World", "tokens": ["world"]},
    ]
    target = tmp_path / "test_result.xlsx"
    export_to_excel(sample_records, out_path=str(target))
    assert target.exists()
|
|
|
|
|
|
# 7. 扩展:测试 cleaner 对长文本的分词(健壮性)
|
|
|
def test_cleaner_long_text():
    """Robustness: the tokenizer handles a long Chinese sentence without failing."""
    long_text = "这是一个关于大语言模型及其应用的长句子,用于测试分词器是否能正常工作。"
    segmented = filter_and_tokenize([long_text])
    assert len(segmented) > 0
|
|
|
|
|
|
# 8. edge-case:空弹幕数组
|
|
|
def test_filter_empty_list():
    """Edge case: an empty danmaku list tokenizes to an empty list."""
    assert filter_and_tokenize([]) == []
|