#!/usr/bin/env python3
"""
LLM generation test runner.

This module is the main entry point of the test framework. It provides a
command-line interface for running the various test types, including unit,
integration, performance, and CLI tests, and supports multiple run modes
and report generation.
"""

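# Typical invocations (a sketch; the script name "run_llm_tests.py" is assumed
# and should be adjusted to this file's actual name):
#
#   python run_llm_tests.py --test-type all
#   python run_llm_tests.py --test-type llm --llm-type comprehensive --verbose
#   python run_llm_tests.py --test-type validation --config tests/config/test_config.yaml
#
# See main() for the full set of command-line options.
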
import os
import sys
import asyncio
import argparse
import json
import time
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import subprocess

# Add the project root directory to the Python path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Import test modules
from test_llm_generation import TestRunner, run_llm_tests
from test_data.simple_c_examples import (
    get_basic_test_suite,
    get_comprehensive_test_suite,
    get_test_cases_by_complexity,
    get_test_cases_by_category
)
from integration.test_parser_llm_pipeline import TestParserLLMPipeline
from performance.test_llm_performance import LLMPerformanceTester, run_performance_benchmark
from tools.test_llm_cli import LLMTestCLI, TestConfig
from utils.cbmc_spec_validator import CBMCSpecificationValidator
from utils.logger import get_logger, setup_logging
from src.spec.llm_generator import LLMGenerator

# Set up logging
logger = get_logger(__name__)


class TestOrchestrator:
    """Test orchestrator responsible for coordinating and running the various tests."""

    def __init__(self, config_path: Optional[str] = None):
        self.config_path = config_path or "tests/config/test_config.yaml"
        self.results_dir = Path("test_results")
        self.results_dir.mkdir(exist_ok=True)
        self.test_results = {}
        self.start_time = None

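    # Each run_* coroutine below returns a plain dict with a 'success' flag and a
    # 'summary' sub-dict, plus runner-specific keys such as 'output', 'report',
    # 'report_file', or 'error'. print_summary() and generate_overall_summary()
    # rely only on 'success' and the optional 'summary' entries.
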
    def print_header(self):
        """Print the header banner."""
        print("=" * 60)
        print("LLM Generation Test Framework")
        print("=" * 60)
        print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Config: {self.config_path}")
        print("=" * 60)

    def print_summary(self):
        """Print the test summary."""
        if not self.start_time:
            return

        duration = time.time() - self.start_time
        print(f"\n{'=' * 60}")
        print("Test Summary")
        print("=" * 60)
        print(f"Total Duration: {duration:.2f}s")
        print(f"Results Directory: {self.results_dir}")

        for test_type, result in self.test_results.items():
            status = "✓ PASSED" if result.get('success', False) else "✗ FAILED"
            print(f"{test_type}: {status}")

            if 'summary' in result:
                summary = result['summary']
                if 'total_tests' in summary:
                    print(f" Total Tests: {summary['total_tests']}")
                if 'success_rate' in summary:
                    print(f" Success Rate: {summary['success_rate']:.1%}")
                if 'average_quality' in summary:
                    print(f" Average Quality: {summary['average_quality']:.2f}")

        print("=" * 60)

    async def run_unit_tests(self) -> Dict[str, Any]:
        """Run the unit tests."""
        logger.info("Running unit tests")

        try:
            # Run the unit tests via pytest
            result = subprocess.run([
                sys.executable, "-m", "pytest",
                "tests/unit/",
                "-v", "--tb=short",
                "--junitxml=test_results/unit_test_results.xml"
            ], capture_output=True, text=True, cwd=project_root)

            success = result.returncode == 0
            output = result.stdout + result.stderr

            return {
                'success': success,
                'output': output,
                'return_code': result.returncode,
                'summary': {
                    'total_tests': 'Unknown (see output)',
                    'success_rate': 1.0 if success else 0.0
                }
            }

        except Exception as e:
            logger.error(f"Unit tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }

    async def run_integration_tests(self) -> Dict[str, Any]:
        """Run the integration tests."""
        logger.info("Running integration tests")

        try:
            # Run the integration tests via pytest
            result = subprocess.run([
                sys.executable, "-m", "pytest",
                "tests/integration/",
                "-v", "--tb=short",
                "--junitxml=test_results/integration_test_results.xml"
            ], capture_output=True, text=True, cwd=project_root)

            success = result.returncode == 0
            output = result.stdout + result.stderr

            return {
                'success': success,
                'output': output,
                'return_code': result.returncode,
                'summary': {
                    'total_tests': 'Unknown (see output)',
                    'success_rate': 1.0 if success else 0.0
                }
            }

        except Exception as e:
            logger.error(f"Integration tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }

    async def run_llm_generation_tests(self, test_type: str = "basic") -> Dict[str, Any]:
        """Run the LLM generation tests."""
        logger.info(f"Running LLM generation tests: {test_type}")

        try:
            # Select the test cases
            if test_type == "basic":
                test_cases = get_basic_test_suite()
            elif test_type == "comprehensive":
                test_cases = get_comprehensive_test_suite()
            elif test_type in ["intermediate", "advanced"]:
                test_cases = get_test_cases_by_complexity(test_type)
            else:
                test_cases = get_basic_test_suite()

            logger.info(f"Loaded {len(test_cases)} test cases")

            # Run the tests
            runner, results, report = await run_llm_tests(test_cases, self.config_path, parallel=False)

            # Save the report
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            report_file = self.results_dir / f"llm_generation_report_{timestamp}.json"
            with open(report_file, 'w') as f:
                json.dump(report, f, indent=2)

            # Save the detailed results
            detailed_dir = self.results_dir / f"llm_generation_detailed_{timestamp}"
            runner.save_detailed_results(str(detailed_dir))

            return {
                'success': True,
                'results': results,
                'report': report,
                'report_file': str(report_file),
                'summary': report.get('summary', {})
            }

        except Exception as e:
            logger.error(f"LLM generation tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }

        finally:
            # Clean up the runner if it was created
            if 'runner' in locals():
                runner.cleanup()

    async def run_performance_tests(self) -> Dict[str, Any]:
        """Run the performance tests."""
        logger.info("Running performance tests")

        try:
            api_key = os.getenv('SILICONFLOW_API_KEY')
            if not api_key:
                raise ValueError("SILICONFLOW_API_KEY environment variable not set")

            # Run the performance benchmark
            results = await run_performance_benchmark(api_key)

            # Generate the performance report
            from performance.test_llm_performance import save_performance_report
            report_file = save_performance_report(results, str(self.results_dir))

            # Build the summary
            summary = {}
            for test_name, benchmark_result in results.items():
                summary[test_name] = {
                    'total_tests': benchmark_result.total_tests,
                    'successful_tests': benchmark_result.successful_tests,
                    'average_generation_time': benchmark_result.average_generation_time,
                    'average_quality_score': benchmark_result.average_quality_score,
                    'average_token_rate': benchmark_result.average_token_rate
                }

            return {
                'success': True,
                'results': results,
                'report_file': report_file,
                'summary': summary
            }

        except Exception as e:
            logger.error(f"Performance tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }

    async def run_cli_tests(self) -> Dict[str, Any]:
        """Run the CLI tests."""
        logger.info("Running CLI tests")

        try:
            api_key = os.getenv('SILICONFLOW_API_KEY')
            if not api_key:
                raise ValueError("SILICONFLOW_API_KEY environment variable not set")

            # Run a health check
            config = TestConfig(api_key=api_key)
            cli_tool = LLMTestCLI(config)

            async with LLMGenerator(api_key=api_key) as generator:
                health = await generator.health_check()

            success = health.get('status') == 'healthy'

            return {
                'success': success,
                'health_check': health,
                'summary': {
                    'total_tests': 1,
                    'successful_tests': 1 if success else 0,
                    'success_rate': 1.0 if success else 0.0
                }
            }

        except Exception as e:
            logger.error(f"CLI tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }

    async def run_validation_tests(self) -> Dict[str, Any]:
        """Run the validation tests."""
        logger.info("Running validation tests")

        try:
            validator = CBMCSpecificationValidator()

            # Test cases
            test_specs = [
                ("\\requires a >= 0;\\n\\ensures return == a + b;", {
                    'name': 'test_add',
                    'return_type': 'int',
                    'parameters': [
                        {'name': 'a', 'type': 'int'},
                        {'name': 'b', 'type': 'int'}
                    ]
                }),
                ("", {'name': 'empty', 'return_type': 'void', 'parameters': []})  # Empty-specification test
            ]

            results = []
            successful = 0

            for spec, func_info in test_specs:
                try:
                    result = validator.validate_specification(spec, func_info)

                    # Convert the validation result object to a dict
                    result_dict = result.to_dict()
                    results.append(result_dict)

                    # Check the validation result; an invalid specification still counts
                    # as a test that ran successfully, as long as no exception was raised
                    if result.is_valid:
                        successful += 1
                    else:
                        logger.info(f"Validation failed as expected for {func_info.get('name', 'unknown')}: {len(result.errors)} errors")

                    # Log warnings and suggestions
                    if result.warnings:
                        logger.info(f"Validation warnings for {func_info.get('name', 'unknown')}: {len(result.warnings)} warnings")
                    if result.suggestions:
                        logger.info(f"Validation suggestions for {func_info.get('name', 'unknown')}: {len(result.suggestions)} suggestions")

                except Exception as e:
                    logger.error(f"Validation test failed: {e}")

            # Generate the validation report
            report = validator.generate_validation_report(results)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            report_file = self.results_dir / f"validation_report_{timestamp}.json"
            with open(report_file, 'w') as f:
                json.dump(report, f, indent=2)

            return {
                'success': True,
                'results': results,
                'report': report,
                'report_file': str(report_file),
                'summary': {
                    'total_tests': len(test_specs),
                    'successful_tests': successful,
                    'success_rate': successful / len(test_specs) if test_specs else 0
                }
            }

        except Exception as e:
            logger.error(f"Validation tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }

    async def run_all_tests(self) -> Dict[str, Any]:
        """Run all tests."""
        logger.info("Running all tests")

        test_functions = [
            ('unit', self.run_unit_tests),
            ('integration', self.run_integration_tests),
            ('validation', self.run_validation_tests),
            ('llm_generation', lambda: self.run_llm_generation_tests('basic')),
            ('performance', self.run_performance_tests),
            ('cli', self.run_cli_tests)
        ]

        overall_success = True

        for test_name, test_func in test_functions:
            logger.info(f"Running {test_name} tests...")
            try:
                result = await test_func()
                self.test_results[test_name] = result
                if not result.get('success', False):
                    overall_success = False
            except Exception as e:
                logger.error(f"{test_name} tests failed with exception: {e}")
                self.test_results[test_name] = {
                    'success': False,
                    'error': str(e),
                    'summary': {'total_tests': 0, 'success_rate': 0.0}
                }
                overall_success = False

        # Generate the comprehensive report
        await self.generate_comprehensive_report()

        return {
            'success': overall_success,
            'test_results': self.test_results,
            'summary': self.generate_overall_summary()
        }

    async def generate_comprehensive_report(self):
        """Generate the comprehensive test report."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = self.results_dir / f"comprehensive_report_{timestamp}.html"

        # Generate the HTML report
        html_content = self._generate_html_report()

        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        logger.info(f"Comprehensive report generated: {report_file}")

    def _generate_html_report(self) -> str:
        """Generate the HTML report."""
        html = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>LLM Generation Test Report</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; }}
                .header {{ background-color: #f0f0f0; padding: 20px; border-radius: 5px; }}
                .test-section {{ margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }}
                .success {{ background-color: #d4edda; }}
                .failure {{ background-color: #f8d7da; }}
                .summary {{ background-color: #e8f4f8; padding: 15px; border-radius: 5px; margin: 20px 0; }}
                .metric {{ display: inline-block; margin: 10px; padding: 10px; background-color: #f8f9fa; border-radius: 5px; }}
                table {{ border-collapse: collapse; width: 100%; margin: 10px 0; }}
                th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
                th {{ background-color: #f2f2f2; }}
            </style>
        </head>
        <body>
            <div class="header">
                <h1>LLM Generation Test Report</h1>
                <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                <p>Config: {self.config_path}</p>
            </div>
        """

        # Add the overall summary
        overall_summary = self.generate_overall_summary()
        html += f"""
            <div class="summary">
                <h2>Overall Summary</h2>
                <div class="metric">Total Test Types: {overall_summary.get('total_types', 0)}</div>
                <div class="metric">Successful Types: {overall_summary.get('successful_types', 0)}</div>
                <div class="metric">Success Rate: {overall_summary.get('success_rate', 0):.1%}</div>
                <div class="metric">Total Duration: {overall_summary.get('duration', 0):.2f}s</div>
            </div>
        """

        # Add a section for each test type
        for test_name, result in self.test_results.items():
            status_class = "success" if result.get('success', False) else "failure"
            html += f"""
            <div class="test-section {status_class}">
                <h3>{test_name.title()} Tests</h3>
            """

            summary = result.get('summary', {})
            if 'total_tests' in summary:
                html += f"""
                <div class="metric">Total Tests: {summary['total_tests']}</div>
                <div class="metric">Successful: {summary.get('successful_tests', 0)}</div>
                <div class="metric">Success Rate: {summary.get('success_rate', 0):.1%}</div>
                """

            if 'average_quality' in summary:
                html += f"""<div class="metric">Average Quality: {summary['average_quality']:.2f}</div>"""

            if 'average_generation_time' in summary:
                html += f"""<div class="metric">Avg Generation Time: {summary['average_generation_time']:.2f}s</div>"""

            if 'error' in result:
                html += f"""<p><strong>Error:</strong> {result['error']}</p>"""

            html += """
            </div>
            """

        html += """
        </body>
        </html>
        """

        return html

    def generate_overall_summary(self) -> Dict[str, Any]:
        """Generate the overall summary."""
        if not self.test_results:
            return {}

        total_types = len(self.test_results)
        successful_types = sum(1 for result in self.test_results.values() if result.get('success', False))
        success_rate = successful_types / total_types if total_types > 0 else 0

        # Only sum numeric test counts; some runners report counts such as
        # 'Unknown (see output)' as strings
        total_tests = sum(
            count for count in
            (result.get('summary', {}).get('total_tests', 0) for result in self.test_results.values())
            if isinstance(count, (int, float))
        )
        successful_tests = sum(result.get('summary', {}).get('successful_tests', 0) for result in self.test_results.values())

        duration = time.time() - self.start_time if self.start_time else 0

        return {
            'total_types': total_types,
            'successful_types': successful_types,
            'success_rate': success_rate,
            'total_tests': total_tests,
            'successful_tests': successful_tests,
            'duration': duration
        }


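# A minimal sketch of programmatic use (the orchestrator is normally driven via
# main() below; the config path shown is the default assumed by TestOrchestrator):
#
#   orchestrator = TestOrchestrator("tests/config/test_config.yaml")
#   orchestrator.start_time = time.time()
#   result = asyncio.run(orchestrator.run_validation_tests())
#   orchestrator.print_summary()
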
async def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="LLM Generation Test Runner")
    parser.add_argument('--config', '-c', help="Test configuration file path")
    parser.add_argument('--test-type', '-t', choices=[
        'all', 'unit', 'integration', 'llm', 'performance', 'cli', 'validation'
    ], default='all', help="Type of tests to run")
    parser.add_argument('--llm-type', choices=[
        'basic', 'comprehensive', 'intermediate', 'advanced'
    ], default='basic', help="LLM generation test type")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")

    args = parser.parse_args()

    # Configure the log level
    if args.verbose:
        setup_logging(level=logging.DEBUG)
    else:
        setup_logging(level=logging.INFO)

    # Create the test orchestrator
    orchestrator = TestOrchestrator(args.config)
    orchestrator.start_time = time.time()
    orchestrator.print_header()

    try:
        # Run the requested tests
        if args.test_type == 'all':
            result = await orchestrator.run_all_tests()
        elif args.test_type == 'unit':
            result = await orchestrator.run_unit_tests()
        elif args.test_type == 'integration':
            result = await orchestrator.run_integration_tests()
        elif args.test_type == 'llm':
            result = await orchestrator.run_llm_generation_tests(args.llm_type)
        elif args.test_type == 'performance':
            result = await orchestrator.run_performance_tests()
        elif args.test_type == 'cli':
            result = await orchestrator.run_cli_tests()
        elif args.test_type == 'validation':
            result = await orchestrator.run_validation_tests()

        # Print the summary
        orchestrator.print_summary()

        # Set the exit code
        sys.exit(0 if result.get('success', False) else 1)

    except KeyboardInterrupt:
        logger.info("Tests interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Test execution failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    # Check dependencies
    try:
        import aiohttp
        import pytest
        import yaml
        import click
        import colorama
    except ImportError as e:
        print(f"Missing dependency: {e}")
        print("Please install required dependencies with: pip install -r requirements.txt")
        sys.exit(1)

    # Check the API key
    if not os.getenv('SILICONFLOW_API_KEY'):
        print("Warning: SILICONFLOW_API_KEY environment variable not set")
        print("Some tests may fail without API access")

    # Run the main coroutine
    asyncio.run(main())