#!/usr/bin/env python3
"""
LLM生成测试运行器
本模块是测试框架的主要入口点,提供命令行界面来运行各种类型的测试,
包括单元测试、集成测试、性能测试和CLI测试。支持多种运行模式和报告生成。
"""
import os
import sys
import asyncio
import argparse
import json
import time
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import subprocess
# Add the project root to the Python path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
# Import the test modules
from test_llm_generation import TestRunner, run_llm_tests
from test_data.simple_c_examples import (
    get_basic_test_suite,
    get_comprehensive_test_suite,
    get_test_cases_by_complexity,
    get_test_cases_by_category
)
from integration.test_parser_llm_pipeline import TestParserLLMPipeline
from performance.test_llm_performance import LLMPerformanceTester, run_performance_benchmark
from tools.test_llm_cli import LLMTestCLI, TestConfig
from utils.cbmc_spec_validator import CBMCSpecificationValidator
from utils.logger import get_logger, setup_logging
from src.spec.llm_generator import LLMGenerator
# Set up logging
logger = get_logger(__name__)
class TestOrchestrator:
    """Test orchestrator responsible for coordinating and running the various test suites."""

    def __init__(self, config_path: Optional[str] = None):
        self.config_path = config_path or "tests/config/test_config.yaml"
        self.results_dir = Path("test_results")
        self.results_dir.mkdir(exist_ok=True)
        self.test_results = {}
        self.start_time = None
    def print_header(self):
        """Print the header banner."""
        print("=" * 60)
        print("LLM Generation Test Framework")
        print("=" * 60)
        print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Config: {self.config_path}")
        print("=" * 60)
    def print_summary(self):
        """Print the test summary."""
        if not self.start_time:
            return
        duration = time.time() - self.start_time
        print(f"\n{'=' * 60}")
        print("Test Summary")
        print("=" * 60)
        print(f"Total Duration: {duration:.2f}s")
        print(f"Results Directory: {self.results_dir}")
        for test_type, result in self.test_results.items():
            status = "✓ PASSED" if result.get('success', False) else "✗ FAILED"
            print(f"{test_type}: {status}")
            if 'summary' in result:
                summary = result['summary']
                if 'total_tests' in summary:
                    print(f" Total Tests: {summary['total_tests']}")
                if 'success_rate' in summary:
                    print(f" Success Rate: {summary['success_rate']:.1%}")
                if 'average_quality' in summary:
                    print(f" Average Quality: {summary['average_quality']:.2f}")
        print("=" * 60)
    async def run_unit_tests(self) -> Dict[str, Any]:
        """Run the unit tests."""
        logger.info("Running unit tests")
        try:
            # Run the unit tests with pytest
            result = subprocess.run([
                sys.executable, "-m", "pytest",
                "tests/unit/",
                "-v", "--tb=short",
                "--junitxml=test_results/unit_test_results.xml"
            ], capture_output=True, text=True, cwd=project_root)
            success = result.returncode == 0
            output = result.stdout + result.stderr
            return {
                'success': success,
                'output': output,
                'return_code': result.returncode,
                'summary': {
                    'total_tests': 'Unknown (see output)',
                    'success_rate': 1.0 if success else 0.0
                }
            }
        except Exception as e:
            logger.error(f"Unit tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }
    async def run_integration_tests(self) -> Dict[str, Any]:
        """Run the integration tests."""
        logger.info("Running integration tests")
        try:
            # Run the integration tests with pytest
            result = subprocess.run([
                sys.executable, "-m", "pytest",
                "tests/integration/",
                "-v", "--tb=short",
                "--junitxml=test_results/integration_test_results.xml"
            ], capture_output=True, text=True, cwd=project_root)
            success = result.returncode == 0
            output = result.stdout + result.stderr
            return {
                'success': success,
                'output': output,
                'return_code': result.returncode,
                'summary': {
                    'total_tests': 'Unknown (see output)',
                    'success_rate': 1.0 if success else 0.0
                }
            }
        except Exception as e:
            logger.error(f"Integration tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }
    async def run_llm_generation_tests(self, test_type: str = "basic") -> Dict[str, Any]:
        """Run the LLM generation tests."""
        logger.info(f"Running LLM generation tests: {test_type}")
        try:
            # Select the test cases
            if test_type == "basic":
                test_cases = get_basic_test_suite()
            elif test_type == "comprehensive":
                test_cases = get_comprehensive_test_suite()
            elif test_type in ["intermediate", "advanced"]:
                test_cases = get_test_cases_by_complexity(test_type)
            else:
                test_cases = get_basic_test_suite()
            logger.info(f"Loaded {len(test_cases)} test cases")
            # Run the tests
            runner, results, report = await run_llm_tests(test_cases, self.config_path, parallel=False)
            # Save the report
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            report_file = self.results_dir / f"llm_generation_report_{timestamp}.json"
            with open(report_file, 'w') as f:
                json.dump(report, f, indent=2)
            # Save the detailed results
            detailed_dir = self.results_dir / f"llm_generation_detailed_{timestamp}"
            runner.save_detailed_results(str(detailed_dir))
            return {
                'success': True,
                'results': results,
                'report': report,
                'report_file': str(report_file),
                'summary': report.get('summary', {})
            }
        except Exception as e:
            logger.error(f"LLM generation tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }
        finally:
            # Clean up the runner if it was created
            if 'runner' in locals():
                runner.cleanup()
    async def run_performance_tests(self) -> Dict[str, Any]:
        """Run the performance tests."""
        logger.info("Running performance tests")
        try:
            api_key = os.getenv('SILICONFLOW_API_KEY')
            if not api_key:
                raise ValueError("SILICONFLOW_API_KEY environment variable not set")
            # Run the performance benchmark
            results = await run_performance_benchmark(api_key)
            # Generate the performance report
            from performance.test_llm_performance import save_performance_report
            report_file = save_performance_report(results, str(self.results_dir))
            # Build the summary
            summary = {}
            for test_name, benchmark_result in results.items():
                summary[test_name] = {
                    'total_tests': benchmark_result.total_tests,
                    'successful_tests': benchmark_result.successful_tests,
                    'average_generation_time': benchmark_result.average_generation_time,
                    'average_quality_score': benchmark_result.average_quality_score,
                    'average_token_rate': benchmark_result.average_token_rate
                }
            return {
                'success': True,
                'results': results,
                'report_file': report_file,
                'summary': summary
            }
        except Exception as e:
            logger.error(f"Performance tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }
    async def run_cli_tests(self) -> Dict[str, Any]:
        """Run the CLI tests."""
        logger.info("Running CLI tests")
        try:
            api_key = os.getenv('SILICONFLOW_API_KEY')
            if not api_key:
                raise ValueError("SILICONFLOW_API_KEY environment variable not set")
            # Run a health check
            config = TestConfig(api_key=api_key)
            cli_tool = LLMTestCLI(config)
            async with LLMGenerator(api_key=api_key) as generator:
                health = await generator.health_check()
                success = health.get('status') == 'healthy'
            return {
                'success': success,
                'health_check': health,
                'summary': {
                    'total_tests': 1,
                    'successful_tests': 1 if success else 0,
                    'success_rate': 1.0 if success else 0.0
                }
            }
        except Exception as e:
            logger.error(f"CLI tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }
    async def run_validation_tests(self) -> Dict[str, Any]:
        """Run the validation tests."""
        logger.info("Running validation tests")
        try:
            validator = CBMCSpecificationValidator()
            # Test cases: (specification string, function metadata) pairs
            test_specs = [
                ("\\requires a >= 0;\\n\\ensures return == a + b;", {
                    'name': 'test_add',
                    'return_type': 'int',
                    'parameters': [
                        {'name': 'a', 'type': 'int'},
                        {'name': 'b', 'type': 'int'}
                    ]
                }),
                ("", {'name': 'empty', 'return_type': 'void', 'parameters': []})  # empty-specification case
            ]
            results = []
            successful = 0
            for spec, func_info in test_specs:
                try:
                    result = validator.validate_specification(spec, func_info)
                    # Convert the validation result object into a plain dict
                    result_dict = result.to_dict()
                    results.append(result_dict)
                    # Inspect the validation result: only valid specifications count toward
                    # `successful`; an invalid result is expected for some cases and is just logged
                    if result.is_valid:
                        successful += 1
                    else:
                        logger.info(f"Validation failed as expected for {func_info.get('name', 'unknown')}: {len(result.errors)} errors")
                    # Log warnings and suggestions
                    if result.warnings:
                        logger.info(f"Validation warnings for {func_info.get('name', 'unknown')}: {len(result.warnings)} warnings")
                    if result.suggestions:
                        logger.info(f"Validation suggestions for {func_info.get('name', 'unknown')}: {len(result.suggestions)} suggestions")
                except Exception as e:
                    logger.error(f"Validation test failed: {e}")
            # Generate the validation report
            report = validator.generate_validation_report(results)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            report_file = self.results_dir / f"validation_report_{timestamp}.json"
            with open(report_file, 'w') as f:
                json.dump(report, f, indent=2)
            return {
                'success': True,
                'results': results,
                'report': report,
                'report_file': str(report_file),
                'summary': {
                    'total_tests': len(test_specs),
                    'successful_tests': successful,
                    'success_rate': successful / len(test_specs) if test_specs else 0
                }
            }
        except Exception as e:
            logger.error(f"Validation tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }
    async def run_all_tests(self) -> Dict[str, Any]:
        """Run all test suites."""
        logger.info("Running all tests")
        test_functions = [
            ('unit', self.run_unit_tests),
            ('integration', self.run_integration_tests),
            ('validation', self.run_validation_tests),
            ('llm_generation', lambda: self.run_llm_generation_tests('basic')),
            ('performance', self.run_performance_tests),
            ('cli', self.run_cli_tests)
        ]
        overall_success = True
        for test_name, test_func in test_functions:
            logger.info(f"Running {test_name} tests...")
            try:
                result = await test_func()
                self.test_results[test_name] = result
                if not result.get('success', False):
                    overall_success = False
            except Exception as e:
                logger.error(f"{test_name} tests failed with exception: {e}")
                self.test_results[test_name] = {
                    'success': False,
                    'error': str(e),
                    'summary': {'total_tests': 0, 'success_rate': 0.0}
                }
                overall_success = False
        # Generate the combined report
        await self.generate_comprehensive_report()
        return {
            'success': overall_success,
            'test_results': self.test_results,
            'summary': self.generate_overall_summary()
        }
    async def generate_comprehensive_report(self):
        """Generate the comprehensive test report."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = self.results_dir / f"comprehensive_report_{timestamp}.html"
        # Render the HTML report
        html_content = self._generate_html_report()
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(html_content)
        logger.info(f"Comprehensive report generated: {report_file}")
    def _generate_html_report(self) -> str:
        """Generate the HTML report body."""
        html = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>LLM Generation Test Report</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; }}
                .header {{ background-color: #f0f0f0; padding: 20px; border-radius: 5px; }}
                .test-section {{ margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }}
                .success {{ background-color: #d4edda; }}
                .failure {{ background-color: #f8d7da; }}
                .summary {{ background-color: #e8f4f8; padding: 15px; border-radius: 5px; margin: 20px 0; }}
                .metric {{ display: inline-block; margin: 10px; padding: 10px; background-color: #f8f9fa; border-radius: 5px; }}
                table {{ border-collapse: collapse; width: 100%; margin: 10px 0; }}
                th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
                th {{ background-color: #f2f2f2; }}
            </style>
        </head>
        <body>
            <div class="header">
                <h1>LLM Generation Test Report</h1>
                <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                <p>Config: {self.config_path}</p>
            </div>
        """
        # Add the overall summary
        overall_summary = self.generate_overall_summary()
        html += f"""
            <div class="summary">
                <h2>Overall Summary</h2>
                <div class="metric">Total Test Types: {overall_summary.get('total_types', 0)}</div>
                <div class="metric">Successful Types: {overall_summary.get('successful_types', 0)}</div>
                <div class="metric">Success Rate: {overall_summary.get('success_rate', 0):.1%}</div>
                <div class="metric">Total Duration: {overall_summary.get('duration', 0):.2f}s</div>
            </div>
        """
        # Add the per-suite results
        for test_name, result in self.test_results.items():
            status_class = "success" if result.get('success', False) else "failure"
            html += f"""
            <div class="test-section {status_class}">
                <h3>{test_name.title()} Tests</h3>
            """
            summary = result.get('summary', {})
            if 'total_tests' in summary:
                html += f"""
                <div class="metric">Total Tests: {summary['total_tests']}</div>
                <div class="metric">Successful: {summary.get('successful_tests', 0)}</div>
                <div class="metric">Success Rate: {summary.get('success_rate', 0):.1%}</div>
                """
            if 'average_quality' in summary:
                html += f"""<div class="metric">Average Quality: {summary['average_quality']:.2f}</div>"""
            if 'average_generation_time' in summary:
                html += f"""<div class="metric">Avg Generation Time: {summary['average_generation_time']:.2f}s</div>"""
            if 'error' in result:
                html += f"""<p><strong>Error:</strong> {result['error']}</p>"""
            html += """
            </div>
            """
        html += """
        </body>
        </html>
        """
        return html
    def generate_overall_summary(self) -> Dict[str, Any]:
        """Generate the overall summary."""
        if not self.test_results:
            return {}
        total_types = len(self.test_results)
        successful_types = sum(1 for result in self.test_results.values() if result.get('success', False))
        success_rate = successful_types / total_types if total_types > 0 else 0
        # Only numeric counts are summed; suites that report a non-numeric total
        # (e.g. 'Unknown (see output)') are skipped to avoid a TypeError
        total_tests = sum(
            result.get('summary', {}).get('total_tests', 0)
            for result in self.test_results.values()
            if isinstance(result.get('summary', {}).get('total_tests', 0), (int, float))
        )
        successful_tests = sum(
            result.get('summary', {}).get('successful_tests', 0)
            for result in self.test_results.values()
            if isinstance(result.get('summary', {}).get('successful_tests', 0), (int, float))
        )
        duration = time.time() - self.start_time if self.start_time else 0
        return {
            'total_types': total_types,
            'successful_types': successful_types,
            'success_rate': success_rate,
            'total_tests': total_tests,
            'successful_tests': successful_tests,
            'duration': duration
        }
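
# Programmatic usage sketch (hypothetical caller; main() below drives the same
# orchestrator from command-line arguments):
#
#   orchestrator = TestOrchestrator("tests/config/test_config.yaml")
#   orchestrator.start_time = time.time()
#   result = asyncio.run(orchestrator.run_llm_generation_tests("basic"))
#   orchestrator.print_summary()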
async def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="LLM Generation Test Runner")
    parser.add_argument('--config', '-c', help="Test configuration file path")
    parser.add_argument('--test-type', '-t', choices=[
        'all', 'unit', 'integration', 'llm', 'performance', 'cli', 'validation'
    ], default='all', help="Type of tests to run")
    parser.add_argument('--llm-type', choices=[
        'basic', 'comprehensive', 'intermediate', 'advanced'
    ], default='basic', help="LLM generation test type")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    args = parser.parse_args()
    # Configure the log level
    if args.verbose:
        setup_logging(level=logging.DEBUG)
    else:
        setup_logging(level=logging.INFO)
    # Create the test orchestrator
    orchestrator = TestOrchestrator(args.config)
    orchestrator.start_time = time.time()
    orchestrator.print_header()
    try:
        # Run the selected tests
        if args.test_type == 'all':
            result = await orchestrator.run_all_tests()
        elif args.test_type == 'unit':
            result = await orchestrator.run_unit_tests()
        elif args.test_type == 'integration':
            result = await orchestrator.run_integration_tests()
        elif args.test_type == 'llm':
            result = await orchestrator.run_llm_generation_tests(args.llm_type)
        elif args.test_type == 'performance':
            result = await orchestrator.run_performance_tests()
        elif args.test_type == 'cli':
            result = await orchestrator.run_cli_tests()
        elif args.test_type == 'validation':
            result = await orchestrator.run_validation_tests()
        # Print the summary
        orchestrator.print_summary()
        # Set the exit code
        sys.exit(0 if result.get('success', False) else 1)
    except KeyboardInterrupt:
        logger.info("Tests interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Test execution failed: {e}")
        sys.exit(1)
if __name__ == "__main__":
    # Check dependencies
    try:
        import aiohttp
        import pytest
        import yaml
        import click
        import colorama
    except ImportError as e:
        print(f"Missing dependency: {e}")
        print("Please install required dependencies with: pip install -r requirements.txt")
        sys.exit(1)
    # Check for the API key
    if not os.getenv('SILICONFLOW_API_KEY'):
        print("Warning: SILICONFLOW_API_KEY environment variable not set")
        print("Some tests may fail without API access")
    # Run the main coroutine
    asyncio.run(main())