|
|
# tests/test_project.py
|
|
|
import os
|
|
|
import tempfile
|
|
|
import pytest
|
|
|
from danmaku_fetcher import get_bvid_from_url, parse_danmaku_xml
|
|
|
from cleaner import is_noise, filter_and_tokenize
|
|
|
from analyzer import count_tokens, top_n
|
|
|
from exporter import export_to_excel
|
|
|
|
|
|
# 1. 测试 BV 号解析
|
|
|
def test_get_bvid_from_url_ok():
    """A canonical bilibili video URL yields its BV identifier."""
    video_url = "https://www.bilibili.com/video/BV1xx411c7mD?spm_id_from=333.999.0.0"
    bvid = get_bvid_from_url(video_url)
    assert bvid == "BV1xx411c7mD"
|
|
|
|
|
|
def test_get_bvid_from_url_none():
    """A URL whose path carries no BV id produces None."""
    assert get_bvid_from_url("https://www.bilibili.com/video/12345") is None
|
|
|
|
|
|
# 2. 测试解析简单弹幕 XML
|
|
|
SAMPLE_XML = """<?xml version="1.0"?>
|
|
|
<i>
|
|
|
<d p="1,2,3,4,5,6">Hello world</d>
|
|
|
<d p="1,2,3,4,5,6">666</d>
|
|
|
<d p="1,2,3,4,5,6">OpenAI好强</d>
|
|
|
</i>"""
|
|
|
|
|
|
def test_parse_danmaku_xml():
    """parse_danmaku_xml extracts the text of every <d> element in the fixture."""
    extracted = parse_danmaku_xml(SAMPLE_XML)
    for expected in ("Hello world", "666", "OpenAI好强"):
        assert expected in extracted
|
|
|
|
|
|
# 3. 测试 is_noise 过滤
|
|
|
def test_is_noise_666():
    """Pure-digit spam is classified as noise."""
    verdict = is_noise("666")
    assert verdict
|
|
|
def test_is_noise_ha():
    """Laughter-only danmaku is classified as noise."""
    verdict = is_noise("哈哈哈哈")
    assert verdict
|
|
|
def test_is_noise_ok():
    """A substantive comment is NOT classified as noise."""
    verdict = is_noise("ChatGPT 太强了")
    assert not verdict
|
|
|
|
|
|
# 4. 测试 filter_and_tokenize
|
|
|
def test_filter_and_tokenize_basic():
    """filter_and_tokenize drops noise entries and tokenizes the rest.

    Fix: the original comment promised a non-empty result ("长度>0") but never
    asserted it; that check is now explicit. The redundant `"大语言模型" in t`
    clause is dropped — it is implied by `"大语言" in t`.
    """
    input_danmakus = ["666", "ChatGPT 很强", "哈哈", "大语言模型值得关注"]
    tokens = filter_and_tokenize(input_danmakus)
    # Non-noise input must survive filtering.
    assert len(tokens) > 0
    # Tokens should include "ChatGPT" or a "大语言"-prefixed segment.
    assert any("ChatGPT" in t or "大语言" in t for t in tokens)
|
|
|
|
|
|
# 5. 测试 count_tokens / top_n
|
|
|
def test_count_tokens_and_top_n():
    """count_tokens aggregates token frequencies across records; top_n ranks them."""
    records = [
        {"tokens": ["大语言模型", "AI", "大语言模型"]},
        {"tokens": ["AI", "聊天", "大语言模型"]},
    ]
    counts = count_tokens(records)
    # "大语言模型" appears twice in the first record and once in the second.
    assert counts["大语言模型"] == 3
    ranked = top_n(counts, n=2)
    assert ranked[0][0] == "大语言模型"
|
|
|
|
|
|
# 6. 测试 exporter 导出 (写临时文件)
|
|
|
def test_export_to_excel(tmp_path):
    """export_to_excel writes an .xlsx file at the requested path."""
    sample_records = [
        {"video_url": "http://a", "danmaku": "Hello", "tokens": ["hello"]},
        {"video_url": "http://b", "danmaku": "World", "tokens": ["world"]},
    ]
    target = tmp_path / "test_result.xlsx"
    export_to_excel(sample_records, out_path=str(target))
    assert target.exists()
|
|
|
|
|
|
# 7. 扩展:测试 cleaner 对长文本的分词(健壮性)
|
|
|
def test_cleaner_long_text():
    """Robustness: the tokenizer handles a long Chinese sentence without failing."""
    long_text = "这是一个关于大语言模型及其应用的长句子,用于测试分词器是否能正常工作。"
    segmented = filter_and_tokenize([long_text])
    assert len(segmented) > 0
|
|
|
|
|
|
# 8. edge-case:空弹幕数组
|
|
|
def test_filter_empty_list():
    """Edge case: an empty danmaku list tokenizes to an empty list."""
    assert filter_and_tokenize([]) == []
|