parent 8e7c4a3117
commit a270414ff0
@@ -1,7 +1,9 @@
from collections import Counter
from cppy.cp_util import *

#
# On an exception, exit the program
#
def extract_words(path_to_file):
    assert(type(path_to_file) is str), "Must be a string!"
    assert(path_to_file), "Must be a non-empty string!"
@@ -1,6 +1,8 @@
from cppy.cp_util import *

#
# Use assertions to report the exact error at the point of failure
#
def extractWords(path_to_file):
    assert(type(path_to_file) is str), "Must be a string"
    assert(path_to_file), "Must be a non-empty string"
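
Both hunks add the same guard; a quick illustration of the intended fail-fast behavior (not part of the commit):

extract_words(123)   # raises AssertionError: Must be a string!
extract_words("")    # raises AssertionError: Must be a non-empty string!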
@@ -0,0 +1,7 @@
data_dir: "./data"
output_file: "./results.csv"
stop_words_file: "./config/stopwords.txt"
top_n: 10
tokenizer: "simple"  # options: simple / jieba
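
This config drives the pipeline below. A minimal loading sketch, assuming PyYAML and a hypothetical file name config.yml (neither is confirmed by the diff):

import yaml  # PyYAML; loader helper and file name are assumptions

def load_config(path="config.yml"):
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)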
@@ -0,0 +1,10 @@
from pathlib import Path
import csv


def save_results(results: dict, output_path: str) -> None:
    """Save the results to a CSV file."""
    Path(output_path).parent.mkdir(exist_ok=True)
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["word", "count"])
        writer.writerows(results.items())
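
Hypothetical usage with a small results dict:

save_results({"apple": 2, "banana": 2}, "./results.csv")
# results.csv now holds a "word,count" header followed by one row per word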
@@ -0,0 +1,36 @@
import pytest
from src.core import WordCounter
from src.file_io import save_results
import csv


@pytest.fixture
def sample_data(tmp_path):
    data_dir = tmp_path / "data"
    data_dir.mkdir()
    (data_dir / "test1.txt").write_text("apple banana apple")
    (data_dir / "test2.txt").write_text("banana cherry")
    return data_dir


def test_full_pipeline(sample_data, tmp_path):
    config = {
        "data_dir": str(sample_data),
        "stop_words_file": str(tmp_path / "stopwords.txt"),
        "output_file": str(tmp_path / "results.csv"),
        "top_n": 2,
        "tokenizer": "simple"
    }

    # Generate the stop-words file
    (tmp_path / "stopwords.txt").write_text("cherry")

    # Run the full pipeline
    counter = WordCounter()
    counter.config = config
    results = counter.process_files()
    save_results(results, config['output_file'])

    # Verify the output
    with open(config['output_file']) as f:
        reader = csv.reader(f)
        next(reader)  # Skip header
        assert list(reader) == [["apple", "2"], ["banana", "2"]]
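
The diff does not include src.core itself; the tests above pin down its interface. A minimal WordCounter sketch consistent with them (internals are assumptions, and only the "simple" tokenizer path is modeled):

from collections import Counter
from pathlib import Path

class WordCounter:
    def __init__(self):
        self.config = {}

    def _stop_words(self):
        # Missing stop-words file means no stop words (the perf test points at "nonexistent")
        path = Path(self.config["stop_words_file"])
        if not path.exists():
            return set()
        return set(path.read_text().split())

    def process_files(self):
        stop_words = self._stop_words()
        counts = Counter()
        for file in sorted(Path(self.config["data_dir"]).glob("*.txt")):
            for word in file.read_text().split():  # whitespace split ("simple" tokenizer)
                if word not in stop_words:
                    counts[word] += 1
        return dict(counts.most_common(self.config["top_n"]))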
@@ -0,0 +1,34 @@
import pytest
from src.core import WordCounter
from pathlib import Path
import tempfile
import random


@pytest.fixture(scope="module")
def large_data():
    """Generate roughly 1 MB of test data."""
    words = [f"word{i}" for i in range(1000)]
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / "data"
        data_dir.mkdir()
        for i in range(10):
            with open(data_dir / f"bigfile{i}.txt", 'w') as f:
                content = " ".join(random.choices(words, k=100000))
                f.write(content)
        yield str(data_dir)


def test_processing_performance(benchmark, large_data):
    """Performance benchmark."""
    counter = WordCounter()
    counter.config = {
        "data_dir": large_data,
        "stop_words_file": "nonexistent",
        "output_file": "/dev/null",
        "top_n": 10,
        "tokenizer": "simple"
    }

    # Run the benchmark
    result = benchmark(counter.process_files)

    # Verify the performance target
    assert benchmark.stats['mean'] < 1.0  # mean execution time < 1 second
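
The benchmark fixture and benchmark.stats come from the pytest-benchmark plugin; a typical invocation would be something like:

pytest tests/ --benchmark-only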
@@ -0,0 +1,16 @@
import pytest
from src.core import SimpleTokenizer, JiebaTokenizer


@pytest.fixture
def stop_words():
    return {"the", "and", "a"}


def test_simple_tokenizer(stop_words):
    tokenizer = SimpleTokenizer()
    text = "the quick brown fox and a dog"
    assert tokenizer.tokenize(text, stop_words) == ["quick", "brown", "fox", "dog"]


def test_jieba_tokenizer(stop_words):
    tokenizer = JiebaTokenizer()
    text = "我爱北京天安门"
    assert tokenizer.tokenize(text, set()) == ["我", "爱", "北京", "天安门"]
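
As with WordCounter, the tokenizer classes live in src.core and are not shown here. A sketch of the interface these tests imply (bodies are assumptions; JiebaTokenizer presumably depends on the jieba package):

import jieba  # assumed dependency of JiebaTokenizer

class SimpleTokenizer:
    def tokenize(self, text, stop_words):
        # Whitespace split, then drop stop words
        return [w for w in text.split() if w not in stop_words]

class JiebaTokenizer:
    def tokenize(self, text, stop_words):
        # Chinese word segmentation via jieba
        return [w for w in jieba.cut(text) if w not in stop_words]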