You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

79 lines
2.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# tests/test_project.py
import os
import tempfile
import pytest
from danmaku_fetcher import get_bvid_from_url, parse_danmaku_xml
from cleaner import is_noise, filter_and_tokenize
from analyzer import count_tokens, top_n
from exporter import export_to_excel
# 1. BV-id extraction from a video URL
def test_get_bvid_from_url_ok():
    """A canonical bilibili video URL yields its BV id."""
    result = get_bvid_from_url(
        "https://www.bilibili.com/video/BV1xx411c7mD?spm_id_from=333.999.0.0"
    )
    assert result == "BV1xx411c7mD"
def test_get_bvid_from_url_none():
    """A URL without a BV id returns None instead of raising."""
    no_bvid_url = "https://www.bilibili.com/video/12345"
    assert get_bvid_from_url(no_bvid_url) is None
# 2. Parsing a minimal danmaku XML document.
# Fixture: three <d> elements with the standard 6-field "p" attribute,
# covering ASCII, pure-digit, and CJK comment text.
SAMPLE_XML = """<?xml version="1.0"?>
<i>
<d p="1,2,3,4,5,6">Hello world</d>
<d p="1,2,3,4,5,6">666</d>
<d p="1,2,3,4,5,6">OpenAI好强</d>
</i>"""
def test_parse_danmaku_xml():
    """Every <d> element's text appears in the parsed result."""
    parsed = parse_danmaku_xml(SAMPLE_XML)
    for expected in ("Hello world", "666", "OpenAI好强"):
        assert expected in parsed
# 3. Noise filtering via is_noise
def test_is_noise_666():
    """Pure-digit spam ("666") is classified as noise."""
    assert is_noise("666") is True or is_noise("666")
def test_is_noise_ha():
    """Repeated laughter characters count as noise."""
    laughter = "哈哈哈哈"
    assert is_noise(laughter)
def test_is_noise_ok():
    """A substantive comment is not flagged as noise."""
    meaningful = "ChatGPT 太强了"
    assert not is_noise(meaningful)
# 4. filter_and_tokenize: noise removed, real content tokenized
def test_filter_and_tokenize_basic():
    """Noise entries are dropped while meaningful text survives tokenization."""
    raw = ["666", "ChatGPT 很强", "哈哈", "大语言模型值得关注"]
    tokens = filter_and_tokenize(raw)
    # At least one token derived from the meaningful inputs must remain.
    keep = ("ChatGPT", "大语言", "大语言模型")
    assert any(any(marker in tok for marker in keep) for tok in tokens)
# 5. count_tokens / top_n aggregation
def test_count_tokens_and_top_n():
    """Token counts aggregate across records and top_n ranks by frequency."""
    sample_records = [
        {"tokens": ["大语言模型", "AI", "大语言模型"]},
        {"tokens": ["AI", "聊天", "大语言模型"]},
    ]
    counts = count_tokens(sample_records)
    # "大语言模型" occurs three times across both records.
    assert counts["大语言模型"] == 3
    ranking = top_n(counts, n=2)
    # The most frequent token must come first.
    assert ranking[0][0] == "大语言模型"
# 6. exporter: writing results to a temporary Excel file
def test_export_to_excel(tmp_path):
    """export_to_excel creates the target .xlsx file on disk."""
    rows = [
        {"video_url": "http://a", "danmaku": "Hello", "tokens": ["hello"]},
        {"video_url": "http://b", "danmaku": "World", "tokens": ["world"]},
    ]
    target = tmp_path / "test_result.xlsx"
    export_to_excel(rows, out_path=str(target))
    assert target.exists()
# 7. Robustness: tokenizing a long sentence
def test_cleaner_long_text():
    """The tokenizer produces at least one token for a long CJK sentence."""
    sentence = "这是一个关于大语言模型及其应用的长句子,用于测试分词器是否能正常工作。"
    result = filter_and_tokenize([sentence])
    assert result
# 8. Edge case: empty danmaku list
def test_filter_empty_list():
    """An empty input list yields an empty token list, not an error."""
    assert filter_and_tokenize([]) == []