#!/usr/bin/env python3
"""
LLM生成测试运行器

本模块是测试框架的主要入口点,提供命令行界面来运行各种类型的测试,
包括单元测试、集成测试、性能测试和CLI测试。支持多种运行模式和报告生成。
"""

import os
import sys
import asyncio
import argparse
import json
import time
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import subprocess

# 添加项目根目录到Python路径
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# 导入测试模块
from test_llm_generation import TestRunner, run_llm_tests
from test_data.simple_c_examples import (
    get_basic_test_suite,
    get_comprehensive_test_suite,
    get_test_cases_by_complexity,
    get_test_cases_by_category
)
from integration.test_parser_llm_pipeline import TestParserLLMPipeline
from performance.test_llm_performance import LLMPerformanceTester, run_performance_benchmark
from tools.test_llm_cli import LLMTestCLI, TestConfig
from utils.cbmc_spec_validator import CBMCSpecificationValidator
from utils.logger import get_logger, setup_logging
from src.spec.llm_generator import LLMGenerator

# 设置日志
logger = get_logger(__name__)


class TestOrchestrator:
    """测试编排器,负责协调和运行各种测试"""

    def __init__(self, config_path: Optional[str] = None):
        self.config_path = config_path or "tests/config/test_config.yaml"
        self.results_dir = Path("test_results")
        self.results_dir.mkdir(exist_ok=True)
        self.test_results = {}
        self.start_time = None

    def print_header(self):
        """打印头部信息"""
        print("=" * 60)
        print("LLM Generation Test Framework")
        print("=" * 60)
        print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Config: {self.config_path}")
        print("=" * 60)

    def print_summary(self):
        """打印测试摘要"""
        if not self.start_time:
            return

        duration = time.time() - self.start_time

        print(f"\n{'=' * 60}")
        print("Test Summary")
        print("=" * 60)
        print(f"Total Duration: {duration:.2f}s")
        print(f"Results Directory: {self.results_dir}")

        for test_type, result in self.test_results.items():
            status = "✓ PASSED" if result.get('success', False) else "✗ FAILED"
            print(f"{test_type}: {status}")

            if 'summary' in result:
                summary = result['summary']
                if 'total_tests' in summary:
                    print(f"  Total Tests: {summary['total_tests']}")
                if 'success_rate' in summary:
                    print(f"  Success Rate: {summary['success_rate']:.1%}")
                if 'average_quality' in summary:
                    print(f"  Average Quality: {summary['average_quality']:.2f}")

        print("=" * 60)

    async def run_unit_tests(self) -> Dict[str, Any]:
        """运行单元测试"""
        logger.info("Running unit tests")

        try:
            # 使用pytest运行单元测试
            result = subprocess.run([
                sys.executable, "-m", "pytest",
                "tests/unit/",
                "-v",
                "--tb=short",
                "--junitxml=test_results/unit_test_results.xml"
            ], capture_output=True, text=True, cwd=project_root)

            success = result.returncode == 0
            output = result.stdout + result.stderr

            return {
                'success': success,
                'output': output,
                'return_code': result.returncode,
                'summary': {
                    'total_tests': 'Unknown (see output)',
                    'success_rate': 1.0 if success else 0.0
                }
            }

        except Exception as e:
            logger.error(f"Unit tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }

    async def run_integration_tests(self) -> Dict[str, Any]:
        """运行集成测试"""
        logger.info("Running integration tests")

        try:
            # 使用pytest运行集成测试
            result = subprocess.run([
                sys.executable, "-m", "pytest",
                "tests/integration/",
                "-v",
                "--tb=short",
                "--junitxml=test_results/integration_test_results.xml"
            ], capture_output=True, text=True, cwd=project_root)

            success = result.returncode == 0
            output = result.stdout + result.stderr

            return {
                'success': success,
                'output': output,
                'return_code': result.returncode,
                'summary': {
                    'total_tests': 'Unknown (see output)',
                    'success_rate': 1.0 if success else 0.0
                }
            }

        except Exception as e:
            logger.error(f"Integration tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }
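
    # 说明:上面的 run_unit_tests / run_integration_tests 虽然已写出 junitxml 结果文件,
    # 但摘要中只返回 'Unknown (see output)'。下面是一个示意性的草图,演示如何用标准库
    # 解析 junitxml 得到真实的测试数量;方法名 _parse_junit_counts 为本示例假设的名称,
    # 尚未接入上面的流程。
    @staticmethod
    def _parse_junit_counts(xml_path: str) -> Dict[str, int]:
        """示例草图:统计junitxml结果文件中的测试数量(未在主流程中调用)"""
        import xml.etree.ElementTree as ET

        counts = {'total': 0, 'failures': 0, 'errors': 0, 'skipped': 0}
        path = Path(xml_path)
        if not path.exists():
            return counts

        root = ET.parse(path).getroot()
        # pytest 的 junitxml 根节点可能是 <testsuite> 或 <testsuites>,
        # iter('testsuite') 在两种情况下都能遍历到具体的 testsuite 节点
        for suite in root.iter('testsuite'):
            counts['total'] += int(suite.get('tests', 0))
            counts['failures'] += int(suite.get('failures', 0))
            counts['errors'] += int(suite.get('errors', 0))
            counts['skipped'] += int(suite.get('skipped', 0))
        return counts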

    async def run_llm_generation_tests(self, test_type: str = "basic") -> Dict[str, Any]:
        """运行LLM生成测试"""
        logger.info(f"Running LLM generation tests: {test_type}")

        try:
            # 获取测试用例
            if test_type == "basic":
                test_cases = get_basic_test_suite()
            elif test_type == "comprehensive":
                test_cases = get_comprehensive_test_suite()
            elif test_type in ["basic", "intermediate", "advanced"]:
                test_cases = get_test_cases_by_complexity(test_type)
            else:
                test_cases = get_basic_test_suite()

            logger.info(f"Loaded {len(test_cases)} test cases")

            # 运行测试
            runner, results, report = await run_llm_tests(test_cases, self.config_path, parallel=False)

            # 保存结果
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            report_file = self.results_dir / f"llm_generation_report_{timestamp}.json"
            with open(report_file, 'w') as f:
                json.dump(report, f, indent=2)

            # 保存详细结果
            detailed_dir = self.results_dir / f"llm_generation_detailed_{timestamp}"
            runner.save_detailed_results(str(detailed_dir))

            return {
                'success': True,
                'results': results,
                'report': report,
                'report_file': str(report_file),
                'summary': report.get('summary', {})
            }

        except Exception as e:
            logger.error(f"LLM generation tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }
        finally:
            # 清理runner
            if 'runner' in locals():
                runner.cleanup()

    async def run_performance_tests(self) -> Dict[str, Any]:
        """运行性能测试"""
        logger.info("Running performance tests")

        try:
            api_key = os.getenv('SILICONFLOW_API_KEY')
            if not api_key:
                raise ValueError("SILICONFLOW_API_KEY environment variable not set")

            # 运行性能基准测试
            results = await run_performance_benchmark(api_key)

            # 生成性能报告
            from performance.test_llm_performance import save_performance_report
            report_file = save_performance_report(results, str(self.results_dir))

            # 生成摘要
            summary = {}
            for test_name, benchmark_result in results.items():
                summary[test_name] = {
                    'total_tests': benchmark_result.total_tests,
                    'successful_tests': benchmark_result.successful_tests,
                    'average_generation_time': benchmark_result.average_generation_time,
                    'average_quality_score': benchmark_result.average_quality_score,
                    'average_token_rate': benchmark_result.average_token_rate
                }

            return {
                'success': True,
                'results': results,
                'report_file': report_file,
                'summary': summary
            }

        except Exception as e:
            logger.error(f"Performance tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }

    async def run_cli_tests(self) -> Dict[str, Any]:
        """运行CLI测试"""
        logger.info("Running CLI tests")

        try:
            api_key = os.getenv('SILICONFLOW_API_KEY')
            if not api_key:
                raise ValueError("SILICONFLOW_API_KEY environment variable not set")

            # 运行健康检查
            config = TestConfig(api_key=api_key)
            cli_tool = LLMTestCLI(config)

            async with LLMGenerator(api_key=api_key) as generator:
                health = await generator.health_check()

            success = health.get('status') == 'healthy'

            return {
                'success': success,
                'health_check': health,
                'summary': {
                    'total_tests': 1,
                    'successful_tests': 1 if success else 0,
                    'success_rate': 1.0 if success else 0.0
                }
            }

        except Exception as e:
            logger.error(f"CLI tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }
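
    # 说明:run_cli_tests 中的健康检查直接等待 generator.health_check() 返回,
    # 网络异常时可能长时间阻塞。下面是一个假设性的包装草图,演示如何用
    # asyncio.wait_for 为健康检查加上超时保护;方法名 _health_check_with_timeout
    # 与默认超时时间均为示例假设,尚未接入上面的流程。
    async def _health_check_with_timeout(self, api_key: str, timeout: float = 30.0) -> Dict[str, Any]:
        """示例草图:带超时保护的健康检查(未在主流程中调用)"""
        try:
            async with LLMGenerator(api_key=api_key) as generator:
                return await asyncio.wait_for(generator.health_check(), timeout=timeout)
        except asyncio.TimeoutError:
            return {'status': 'unhealthy', 'error': f'health check timed out after {timeout}s'}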

    async def run_validation_tests(self) -> Dict[str, Any]:
        """运行验证测试"""
        logger.info("Running validation tests")

        try:
            validator = CBMCSpecificationValidator()

            # 测试用例
            test_specs = [
                ("\\requires a >= 0;\\n\\ensures return == a + b;", {
                    'name': 'test_add',
                    'return_type': 'int',
                    'parameters': [
                        {'name': 'a', 'type': 'int'},
                        {'name': 'b', 'type': 'int'}
                    ]
                }),
                ("", {'name': 'empty', 'return_type': 'void', 'parameters': []})  # 空规范测试
            ]

            results = []
            successful = 0

            for spec, func_info in test_specs:
                try:
                    result = validator.validate_specification(spec, func_info)

                    # 处理验证结果对象
                    result_dict = result.to_dict()
                    results.append(result_dict)

                    # 检查验证结果 - 即使无效也认为测试成功(只要没有异常)
                    if result.is_valid:
                        successful += 1
                    else:
                        logger.info(f"Validation failed as expected for {func_info.get('name', 'unknown')}: {len(result.errors)} errors")

                    # 记录警告和建议
                    if result.warnings:
                        logger.info(f"Validation warnings for {func_info.get('name', 'unknown')}: {len(result.warnings)} warnings")
                    if result.suggestions:
                        logger.info(f"Validation suggestions for {func_info.get('name', 'unknown')}: {len(result.suggestions)} suggestions")

                except Exception as e:
                    logger.error(f"Validation test failed: {e}")

            # 生成验证报告
            report = validator.generate_validation_report(results)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            report_file = self.results_dir / f"validation_report_{timestamp}.json"
            with open(report_file, 'w') as f:
                json.dump(report, f, indent=2)

            return {
                'success': True,
                'results': results,
                'report': report,
                'report_file': str(report_file),
                'summary': {
                    'total_tests': len(test_specs),
                    'successful_tests': successful,
                    'success_rate': successful / len(test_specs) if test_specs else 0
                }
            }

        except Exception as e:
            logger.error(f"Validation tests failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'summary': {'total_tests': 0, 'success_rate': 0.0}
            }

    async def run_all_tests(self) -> Dict[str, Any]:
        """运行所有测试"""
        logger.info("Running all tests")

        test_functions = [
            ('unit', self.run_unit_tests),
            ('integration', self.run_integration_tests),
            ('validation', self.run_validation_tests),
            ('llm_generation', lambda: self.run_llm_generation_tests('basic')),
            ('performance', self.run_performance_tests),
            ('cli', self.run_cli_tests)
        ]

        overall_success = True

        for test_name, test_func in test_functions:
            logger.info(f"Running {test_name} tests...")
            try:
                result = await test_func()
                self.test_results[test_name] = result

                if not result.get('success', False):
                    overall_success = False

            except Exception as e:
                logger.error(f"{test_name} tests failed with exception: {e}")
                self.test_results[test_name] = {
                    'success': False,
                    'error': str(e),
                    'summary': {'total_tests': 0, 'success_rate': 0.0}
                }
                overall_success = False

        # 生成综合报告
        await self.generate_comprehensive_report()

        return {
            'success': overall_success,
            'test_results': self.test_results,
            'summary': self.generate_overall_summary()
        }

    async def generate_comprehensive_report(self):
        """生成综合测试报告"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = self.results_dir / f"comprehensive_report_{timestamp}.html"

        # 生成HTML报告
        html_content = self._generate_html_report()

        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        logger.info(f"Comprehensive report generated: {report_file}")
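
    # 说明:generate_comprehensive_report 只生成HTML,不便于其他脚本直接消费。
    # 下面是一个示意性的草图,演示如何把同一份结果再保存为JSON;
    # 方法名 save_results_json 为本示例假设的名称,尚未接入上面的流程。
    def save_results_json(self) -> Path:
        """示例草图:将测试结果与总体摘要保存为JSON文件(未在主流程中调用)"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        json_file = self.results_dir / f"comprehensive_report_{timestamp}.json"
        payload = {
            'summary': self.generate_overall_summary(),
            'test_results': self.test_results
        }
        with open(json_file, 'w', encoding='utf-8') as f:
            # default=str 兜底处理无法直接序列化的对象(例如自定义的结果对象)
            json.dump(payload, f, indent=2, ensure_ascii=False, default=str)
        return json_file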

    def _generate_html_report(self) -> str:
        """生成HTML报告"""
        html = f"""<!DOCTYPE html>
<html>
<head><title>LLM Generation Test Report</title></head>
<body>
<h1>LLM Generation Test Report</h1>
<p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p>Config: {self.config_path}</p>
"""

        # 添加总体摘要
        overall_summary = self.generate_overall_summary()
        html += f"""
<div class="summary">
<h2>Overall Summary</h2>
<p>Total Test Types: {overall_summary.get('total_types', 0)}</p>
<p>Successful Types: {overall_summary.get('successful_types', 0)}</p>
<p>Success Rate: {overall_summary.get('success_rate', 0):.1%}</p>
<p>Total Duration: {overall_summary.get('duration', 0):.2f}s</p>
</div>
"""

        # 添加各个测试的结果
        for test_name, result in self.test_results.items():
            status_class = "success" if result.get('success', False) else "failure"
            html += f"""
<div class="{status_class}">
<h2>{test_name.title()} Tests</h2>
"""

            summary = result.get('summary', {})
            if 'total_tests' in summary:
                html += f"""
<p>Total Tests: {summary['total_tests']}</p>
<p>Successful: {summary.get('successful_tests', 0)}</p>
<p>Success Rate: {summary.get('success_rate', 0):.1%}</p>
"""
            if 'average_quality' in summary:
                html += f"""
<p>Average Quality: {summary['average_quality']:.2f}</p>
"""
            if 'average_generation_time' in summary:
                html += f"""
<p>Avg Generation Time: {summary['average_generation_time']:.2f}s</p>
"""
            if 'error' in result:
                html += f"""
<p class="error">Error: {result['error']}</p>
"""

            html += """
</div>
"""

        html += """
</body>
</html>
"""

        return html

    def generate_overall_summary(self) -> Dict[str, Any]:
        """生成总体摘要"""
        if not self.test_results:
            return {}

        total_types = len(self.test_results)
        successful_types = sum(1 for result in self.test_results.values() if result.get('success', False))
        success_rate = successful_types / total_types if total_types > 0 else 0

        # 'total_tests' 可能是占位字符串(如 'Unknown (see output)'),求和时只统计数值
        total_tests = sum(
            result.get('summary', {}).get('total_tests', 0)
            for result in self.test_results.values()
            if isinstance(result.get('summary', {}).get('total_tests', 0), (int, float))
        )
        successful_tests = sum(result.get('summary', {}).get('successful_tests', 0) for result in self.test_results.values())

        duration = time.time() - self.start_time if self.start_time else 0

        return {
            'total_types': total_types,
            'successful_types': successful_types,
            'success_rate': success_rate,
            'total_tests': total_tests,
            'successful_tests': successful_tests,
            'duration': duration
        }


async def main():
    """主函数"""
    parser = argparse.ArgumentParser(description="LLM Generation Test Runner")
    parser.add_argument('--config', '-c', help="Test configuration file path")
    parser.add_argument('--test-type', '-t', choices=[
        'all', 'unit', 'integration', 'llm', 'performance', 'cli', 'validation'
    ], default='all', help="Type of tests to run")
    parser.add_argument('--llm-type', choices=[
        'basic', 'comprehensive', 'intermediate', 'advanced'
    ], default='basic', help="LLM generation test type")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")

    args = parser.parse_args()

    # 设置日志级别
    if args.verbose:
        setup_logging(level=logging.DEBUG)
    else:
        setup_logging(level=logging.INFO)

    # 创建测试编排器
    orchestrator = TestOrchestrator(args.config)
    orchestrator.start_time = time.time()
    orchestrator.print_header()

    try:
        # 运行测试
        if args.test_type == 'all':
            result = await orchestrator.run_all_tests()
        elif args.test_type == 'unit':
            result = await orchestrator.run_unit_tests()
        elif args.test_type == 'integration':
            result = await orchestrator.run_integration_tests()
        elif args.test_type == 'llm':
            result = await orchestrator.run_llm_generation_tests(args.llm_type)
        elif args.test_type == 'performance':
            result = await orchestrator.run_performance_tests()
        elif args.test_type == 'cli':
            result = await orchestrator.run_cli_tests()
        elif args.test_type == 'validation':
            result = await orchestrator.run_validation_tests()

        # 打印摘要
        orchestrator.print_summary()

        # 设置退出码
        sys.exit(0 if result.get('success', False) else 1)

    except KeyboardInterrupt:
        logger.info("Tests interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Test execution failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    # 检查依赖
    try:
        import aiohttp
        import pytest
        import yaml
        import click
        import colorama
    except ImportError as e:
        print(f"Missing dependency: {e}")
        print("Please install required dependencies with: pip install -r requirements.txt")
        sys.exit(1)

    # 检查API密钥
    if not os.getenv('SILICONFLOW_API_KEY'):
        print("Warning: SILICONFLOW_API_KEY environment variable not set")
        print("Some tests may fail without API access")

    # 运行主函数
    asyncio.run(main())
""" html += """ """ return html def generate_overall_summary(self) -> Dict[str, Any]: """生成总体摘要""" if not self.test_results: return {} total_types = len(self.test_results) successful_types = sum(1 for result in self.test_results.values() if result.get('success', False)) success_rate = successful_types / total_types if total_types > 0 else 0 total_tests = sum(result.get('summary', {}).get('total_tests', 0) for result in self.test_results.values()) successful_tests = sum(result.get('summary', {}).get('successful_tests', 0) for result in self.test_results.values()) duration = time.time() - self.start_time if self.start_time else 0 return { 'total_types': total_types, 'successful_types': successful_types, 'success_rate': success_rate, 'total_tests': total_tests, 'successful_tests': successful_tests, 'duration': duration } async def main(): """主函数""" parser = argparse.ArgumentParser(description="LLM Generation Test Runner") parser.add_argument('--config', '-c', help="Test configuration file path") parser.add_argument('--test-type', '-t', choices=[ 'all', 'unit', 'integration', 'llm', 'performance', 'cli', 'validation' ], default='all', help="Type of tests to run") parser.add_argument('--llm-type', choices=[ 'basic', 'comprehensive', 'intermediate', 'advanced' ], default='basic', help="LLM generation test type") parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output") args = parser.parse_args() # 设置日志级别 if args.verbose: setup_logging(level=logging.DEBUG) else: setup_logging(level=logging.INFO) # 创建测试编排器 orchestrator = TestOrchestrator(args.config) orchestrator.start_time = time.time() orchestrator.print_header() try: # 运行测试 if args.test_type == 'all': result = await orchestrator.run_all_tests() elif args.test_type == 'unit': result = await orchestrator.run_unit_tests() elif args.test_type == 'integration': result = await orchestrator.run_integration_tests() elif args.test_type == 'llm': result = await orchestrator.run_llm_generation_tests(args.llm_type) elif args.test_type == 'performance': result = await orchestrator.run_performance_tests() elif args.test_type == 'cli': result = await orchestrator.run_cli_tests() elif args.test_type == 'validation': result = await orchestrator.run_validation_tests() # 打印摘要 orchestrator.print_summary() # 设置退出码 sys.exit(0 if result.get('success', False) else 1) except KeyboardInterrupt: logger.info("Tests interrupted by user") sys.exit(1) except Exception as e: logger.error(f"Test execution failed: {e}") sys.exit(1) if __name__ == "__main__": # 检查依赖 try: import aiohttp import pytest import yaml import click import colorama except ImportError as e: print(f"Missing dependency: {e}") print("Please install required dependencies with: pip install -r requirements.txt") sys.exit(1) # 检查API密钥 if not os.getenv('SILICONFLOW_API_KEY'): print("Warning: SILICONFLOW_API_KEY environment variable not set") print("Some tests may fail without API access") # 运行主函数 asyncio.run(main())