parent 8e7c4a3117
commit a270414ff0
@ -1,7 +1,9 @@
from collections import Counter
from cppy.cp_util import *


#
# On an exception, exit the program
#
def extract_words(path_to_file):
    assert(type(path_to_file) is str), "Must be a string!"
    assert(path_to_file), "Must be a non-empty string!"
@ -1,6 +1,8 @@
from cppy.cp_util import *


#
# Use assertions to report precise error information at the point of failure
#
def extractWords(path_to_file):
    assert(type(path_to_file) is str), "Must be a string"
    assert(path_to_file), "Must be a non-empty string"
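
For illustration, this is how the assertion style above surfaces a bad argument right at the call site (a hypothetical interpreter session, not part of the commit):

>>> extractWords(123)
Traceback (most recent call last):
  ...
AssertionError: Must be a string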
@ -0,0 +1,7 @@
data_dir: "./data"
output_file: "./results.csv"
stop_words_file: "./config/stopwords.txt"
top_n: 10
tokenizer: "simple"  # options: simple/jieba
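
A minimal sketch of loading this config, assuming PyYAML is available and the file is saved as config.yaml (both are assumptions; the loading code is not shown in this commit):

import yaml

# Assumption: the YAML above is saved as ./config.yaml
with open("config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

assert config["top_n"] == 10
assert config["tokenizer"] == "simple"  # or "jieba" for Chinese text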
@ -0,0 +1,10 @@
from pathlib import Path
import csv


def save_results(results: dict, output_path: str) -> None:
    """Save results to a CSV file."""
    # parents=True so intermediate directories are created as well
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["word", "count"])
        writer.writerows(results.items())
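
Usage sketch (a hypothetical call, not part of the commit): because writerows iterates results.items(), any word-to-count mapping works, e.g. the top-N slice of a collections.Counter:

from collections import Counter

counts = Counter("apple banana apple".split())
save_results(dict(counts.most_common(2)), "./results.csv")
# results.csv now contains:
# word,count
# apple,2
# banana,1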
@ -0,0 +1,36 @@
import pytest
from src.core import WordCounter
from src.file_io import save_results
import csv


@pytest.fixture
def sample_data(tmp_path):
    data_dir = tmp_path / "data"
    data_dir.mkdir()
    (data_dir / "test1.txt").write_text("apple banana apple")
    (data_dir / "test2.txt").write_text("banana cherry")
    return data_dir


def test_full_pipeline(sample_data, tmp_path):
    config = {
        "data_dir": str(sample_data),
        "stop_words_file": str(tmp_path / "stopwords.txt"),
        "output_file": str(tmp_path / "results.csv"),
        "top_n": 2,
        "tokenizer": "simple"
    }

    # Generate the stop-words file
    (tmp_path / "stopwords.txt").write_text("cherry")

    # Run the full pipeline
    counter = WordCounter()
    counter.config = config
    results = counter.process_files()
    save_results(results, config['output_file'])

    # Verify the output
    with open(config['output_file']) as f:
        reader = csv.reader(f)
        next(reader)  # Skip header
        assert list(reader) == [["apple", "2"], ["banana", "2"]]
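
The test above pins down the WordCounter contract: a plain config attribute plus a process_files() method returning the top_n words as a mapping. A minimal sketch consistent with that contract, purely illustrative since src/core.py itself is not part of this diff:

from collections import Counter
from pathlib import Path


class WordCounter:
    config: dict

    def process_files(self) -> dict:
        # Stop words are optional; fall back to an empty set
        stop_path = Path(self.config["stop_words_file"])
        stop_words = set(stop_path.read_text().split()) if stop_path.exists() else set()
        counts = Counter()
        # sorted() keeps file order deterministic across platforms
        for path in sorted(Path(self.config["data_dir"]).glob("*.txt")):
            words = path.read_text().lower().split()
            counts.update(w for w in words if w not in stop_words)
        return dict(counts.most_common(self.config["top_n"]))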
@ -0,0 +1,34 @@
import pytest
from src.core import WordCounter
import tempfile
import random
from pathlib import Path  # Path was used below but never imported in the original


@pytest.fixture(scope="module")
def large_data():
    """Generate bulk test data (10 files of 100k random words each)."""
    words = [f"word{i}" for i in range(1000)]
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / "data"
        data_dir.mkdir()
        for i in range(10):
            with open(data_dir / f"bigfile{i}.txt", 'w') as f:
                content = " ".join(random.choices(words, k=100000))
                f.write(content)
        yield str(data_dir)


def test_processing_performance(benchmark, large_data):
    """Performance benchmark."""
    counter = WordCounter()
    counter.config = {
        "data_dir": large_data,
        "stop_words_file": "nonexistent",
        "output_file": "/dev/null",
        "top_n": 10,
        "tokenizer": "simple"
    }

    # Run the benchmark
    result = benchmark(counter.process_files)

    # Verify the performance target: mean run time under 1 second
    # (pytest-benchmark exposes stats as attributes, not via indexing)
    assert benchmark.stats.stats.mean < 1.0
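
This test relies on the pytest-benchmark plugin, which supplies the benchmark fixture. A possible invocation, assuming the file is named test_performance.py:

pytest test_performance.py --benchmark-only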
@ -0,0 +1,16 @@
import pytest
from src.core import SimpleTokenizer, JiebaTokenizer


@pytest.fixture
def stop_words():
    return {"the", "and", "a"}


def test_simple_tokenizer(stop_words):
    tokenizer = SimpleTokenizer()
    text = "the quick brown fox and a dog"
    assert tokenizer.tokenize(text, stop_words) == ["quick", "brown", "fox", "dog"]


def test_jieba_tokenizer(stop_words):
    tokenizer = JiebaTokenizer()
    text = "我爱北京天安门"
    assert tokenizer.tokenize(text, set()) == ["我", "爱", "北京", "天安门"]
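
These tests fix the tokenizer interface as tokenize(text, stop_words) returning a list of tokens. A minimal sketch that satisfies both tests (hypothetical, since src/core.py is not in this diff); JiebaTokenizer delegates to jieba.lcut for Chinese word segmentation:

import jieba


class SimpleTokenizer:
    def tokenize(self, text, stop_words):
        # Lowercase whitespace split, then drop stop words
        return [w for w in text.lower().split() if w not in stop_words]


class JiebaTokenizer:
    def tokenize(self, text, stop_words):
        # jieba.lcut returns the segmented tokens as a list
        return [w for w in jieba.lcut(text) if w not in stop_words]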