""" 代码分析和上下文分析模块 """ import re from pathlib import Path from typing import List, Optional, Tuple, Set from .models import CppcheckIssue, CodeContext def analyze_code_context(file_path: Path, target_line: Optional[int] = None, project_root: Optional[Path] = None) -> CodeContext: """深入分析代码上下文,理解函数、类、变量等结构""" actual_file_path = file_path # 如果文件不存在且提供了项目根目录,尝试查找匹配的文件 if not file_path.exists() and project_root: filename = file_path.name potential_files = list(project_root.glob(f"**/{filename}")) if potential_files: actual_file_path = potential_files[0] print(f"找到匹配的文件: {actual_file_path}") else: # 如果还是找不到,尝试查找所有 .cpp 文件 cpp_files = list(project_root.glob("**/*.cpp")) if cpp_files: actual_file_path = cpp_files[0] print(f"使用示例文件: {actual_file_path}") try: content = actual_file_path.read_text(encoding="utf-8", errors="replace") lines = content.splitlines() except Exception as e: print(f"无法读取文件 {actual_file_path}: {e}") return CodeContext(file_path=file_path) context = CodeContext(file_path=file_path) # 分析包含文件 for line in lines: line = line.strip() if line.startswith('#include'): include_path = line[8:].strip().strip('"<>') context.includes.append(include_path) # 如果指定了目标行,分析该行的上下文 if target_line and 1 <= target_line <= len(lines): target_line_idx = target_line - 1 # 查找函数定义 for i in range(target_line_idx, -1, -1): line = lines[i].strip() if re.match(r'^\w+.*\s+\w+\s*\([^)]*\)\s*\{?\s*$', line): # 提取函数名 match = re.search(r'(\w+)\s*\([^)]*\)', line) if match: context.function_name = match.group(1) break # 查找类定义 for i in range(target_line_idx, -1, -1): line = lines[i].strip() if re.match(r'^\s*(class|struct)\s+\w+', line): match = re.search(r'(class|struct)\s+(\w+)', line) if match: context.class_name = match.group(2) break # 查找命名空间 for i in range(target_line_idx, -1, -1): line = lines[i].strip() if line.startswith('namespace '): match = re.search(r'namespace\s+(\w+)', line) if match: context.namespace = match.group(1) break # 分析变量上下文(查找目标行附近的变量声明) start_analysis = max(0, target_line_idx - 20) end_analysis = min(len(lines), target_line_idx + 5) for i in range(start_analysis, end_analysis): line = lines[i].strip() # 查找变量声明 if re.match(r'^\w+.*\s+\w+\s*[=;]', line) and not re.match(r'^\w+.*\s+\w+\s*\([^)]*\)', line): # 提取变量名 match = re.search(r'(\w+)\s*[=;]', line) if match: context.variable_context.append(match.group(1)) # 分析控制流上下文 for i in range(start_analysis, target_line_idx): line = lines[i].strip() if any(keyword in line for keyword in ['if', 'for', 'while', 'switch', 'try', 'catch']): context.control_flow_context.append(line) return context def analyze_issue_relevance(issue: CppcheckIssue, code_context: CodeContext) -> dict: """分析问题与代码上下文的相关性,判断是否为真实问题""" relevance_score = 0 analysis_details = [] # 基于问题类型分析相关性 issue_id = issue.id.lower() severity = issue.severity.lower() # 严重级别权重 severity_weights = {"error": 10, "warning": 7, "information": 3, "note": 1} relevance_score += severity_weights.get(severity, 0) analysis_details.append(f"严重级别权重: {severity_weights.get(severity, 0)}") # 基于问题ID的特定分析 if "uninitvar" in issue_id: # 未初始化变量:检查是否有变量上下文 if code_context.variable_context: relevance_score += 5 analysis_details.append("检测到变量上下文,未初始化变量问题可能真实存在") else: relevance_score -= 2 analysis_details.append("未检测到变量上下文,可能是误报") elif "nullpointer" in issue_id: # 空指针:检查是否有指针操作 if any("ptr" in var.lower() or "*" in var for var in code_context.variable_context): relevance_score += 6 analysis_details.append("检测到指针变量,空指针问题可能真实存在") else: relevance_score -= 1 analysis_details.append("未检测到明显的指针操作") elif "memleak" in 
def analyze_project_structure(project_root: Path) -> dict:
    """Analyze the project structure to understand code organization and dependencies."""
    project_info = {
        "root": project_root,
        "source_files": [],
        "header_files": [],
        "include_dirs": [],
        "dependencies": set(),
        "build_files": [],
        "test_files": []
    }

    if not project_root.exists():
        return project_info

    # Find source files
    for pattern in ["**/*.cpp", "**/*.c", "**/*.cc", "**/*.cxx"]:
        project_info["source_files"].extend(project_root.glob(pattern))

    # Find header files
    for pattern in ["**/*.h", "**/*.hpp", "**/*.hxx"]:
        project_info["header_files"].extend(project_root.glob(pattern))

    # Find build files
    for pattern in ["**/CMakeLists.txt", "**/Makefile", "**/*.mk", "**/*.pro", "**/*.vcxproj"]:
        project_info["build_files"].extend(project_root.glob(pattern))

    # Find test files
    for pattern in ["**/test_*.cpp", "**/*_test.cpp", "**/tests/**/*.cpp"]:
        project_info["test_files"].extend(project_root.glob(pattern))

    # Derive include directories from header locations
    include_dirs = set()
    for header_file in project_info["header_files"]:
        include_dirs.add(header_file.parent)
    project_info["include_dirs"] = list(include_dirs)

    # Analyze dependencies (simple #include analysis)
    dependencies = set()
    for source_file in project_info["source_files"][:10]:  # Limit the analysis to the first 10 files
        try:
            content = source_file.read_text(encoding="utf-8", errors="replace")
            for line in content.splitlines():
                line = line.strip()
                if line.startswith('#include'):
                    include_path = line[8:].strip().strip('"<>')
                    dependencies.add(include_path)
        except Exception:
            continue
    project_info["dependencies"] = list(dependencies)

    return project_info


def get_enhanced_issue_analysis(issue: CppcheckIssue,
                                project_info: Optional[dict] = None) -> Tuple[CodeContext, dict]:
    """Return an enhanced issue analysis: the code context plus a relevance analysis."""
    primary = issue.locations[0] if issue.locations else None
    if not primary:
        return CodeContext(file_path=Path("unknown")), {
            "relevance_score": 0, "is_likely_real": False, "analysis_details": [], "confidence": 0}

    # Analyze the code context
    project_root = project_info.get("root") if project_info else None
    code_context = analyze_code_context(primary.file_path, primary.line, project_root)

    # Analyze issue relevance
    relevance_analysis = analyze_issue_relevance(issue, code_context)

    # If project information is available, dig deeper
    if project_info:
        # Check whether the file belongs to the project
        if primary.file_path in project_info.get("source_files", []):
            relevance_analysis["relevance_score"] += 2
            relevance_analysis["analysis_details"].append("File is a project source file")

        # Check whether project headers are used
        project_includes = set()
        for include_dir in project_info.get("include_dirs", []):
            for header_file in include_dir.glob("*.h"):
                project_includes.add(header_file.name)

        for include_file in code_context.includes:
            if include_file in project_includes:
                relevance_analysis["relevance_score"] += 1
                relevance_analysis["analysis_details"].append(f"Uses project header: {include_file}")
                break

        # Recompute confidence
        relevance_analysis["confidence"] = min(100, max(0, relevance_analysis["relevance_score"] * 10))
        relevance_analysis["is_likely_real"] = relevance_analysis["relevance_score"] >= 5

    return code_context, relevance_analysis
def extract_issue_context_from_source(issue: CppcheckIssue, project_root: Optional[Path] = None) -> dict:
    """Extract the real code context for an issue from the original project sources."""
    print(f"Extracting issue context: {issue.id}")

    context = {
        'file_path': None,
        'line_number': None,
        'function_name': None,
        'code_snippet': None,
        'surrounding_code': None,
        'real_issue_context': None
    }

    if not issue.locations:
        print("No location information")
        return context

    primary_location = issue.locations[0]
    context['file_path'] = primary_location.file_path
    context['line_number'] = primary_location.line

    # Try to read the real code from the original project
    source_file = None
    if project_root:
        # Handle path joining for absolute vs. relative locations
        if primary_location.file_path.is_absolute():
            source_file = primary_location.file_path
        else:
            source_file = project_root / primary_location.file_path

        # If the file does not exist, look for a file with the same name under the project root
        if not source_file.exists():
            filename = primary_location.file_path.name
            print(f"Searching for file: {filename}")
            potential_files = list(project_root.glob(f"**/{filename}"))
            if potential_files:
                source_file = potential_files[0]
                print(f"Found matching file: {source_file}")
            else:
                # Still not found: fall back to any .cpp file
                cpp_files = list(project_root.glob("**/*.cpp"))
                if cpp_files:
                    # Use the first .cpp file found as a sample
                    source_file = cpp_files[0]
                    print(f"Using sample file: {source_file}")
                else:
                    print("No .cpp files found")
    else:
        source_file = primary_location.file_path

    if source_file and source_file.exists():
        try:
            print(f"Reading source file: {source_file}")
            # Read the code surrounding the issue line
            from .parsers import read_code_snippet
            code_snippet = read_code_snippet(source_file, primary_location.line, context=20)
            context['code_snippet'] = code_snippet
            context['surrounding_code'] = code_snippet
            print(f"Read code snippet, length: {len(code_snippet)} characters")

            # Improved function-name extraction
            lines = code_snippet.split('\n')
            for line in lines:
                line = line.strip()
                # Look for a function-definition pattern
                if re.match(r'^\w+.*\s+\w+\s*\([^)]*\)\s*\{?\s*$', line):
                    # Extract the function name
                    match = re.search(r'(\w+)\s*\([^)]*\)', line)
                    if match:
                        context['function_name'] = match.group(1)
                        break

            # Build the real issue context
            context['real_issue_context'] = f"""
// Real issue code from the original project
// File: {primary_location.file_path}
// Line: {primary_location.line}
// Issue: {issue.message}

// Original code snippet:
{code_snippet}
"""
        except Exception as e:
            print(f"Warning: unable to read source file {source_file}: {e}")

    return context


def filter_and_clean_issues(issues: List[CppcheckIssue],
                            project_info: Optional[dict] = None) -> List[CppcheckIssue]:
    """Filter and clean issues, dropping unreliable ones."""
    print("Filtering and cleaning issues...")

    cleaned_issues = []
    filtered_count = 0

    for issue in issues:
        # Run the enhanced analysis
        code_context, relevance_analysis = get_enhanced_issue_analysis(issue, project_info)

        # Decide whether to keep the issue based on the analysis
        should_keep = False

        # 1. Check the relevance score
        if relevance_analysis["relevance_score"] >= 5:
            should_keep = True

        # 2. Check the issue type: drop obvious false positives
        issue_id = issue.id.lower()
        if issue_id in ["missinginclude", "missingincludesystem", "toomanyconfigs",
                        "normalchecklevelmaxbranches", "checklevelnormal", "unknown"]:
            should_keep = False

        # 3. Check the severity: keep errors and warnings first
        if issue.severity.lower() in ["error", "warning"]:
            should_keep = True
        elif issue.severity.lower() in ["information", "note"]:
            # Information and note issues need a higher relevance score
            if relevance_analysis["relevance_score"] >= 7:
                should_keep = True

        # 4. Check whether there is any code context
        if code_context.function_name or code_context.class_name:
            should_keep = True

        if should_keep:
            cleaned_issues.append(issue)
        else:
            filtered_count += 1
            print(f"  Filtered issue: {issue.id} - {issue.message[:50]}... "
                  f"(relevance score: {relevance_analysis['relevance_score']})")

    print(f"Issue filtering complete: kept {len(cleaned_issues)} issues, filtered out {filtered_count} unreliable ones")
    return cleaned_issues
def write_cleaned_report(issues: List[CppcheckIssue], output_path: Path) -> None:
    """Write the cleaned issues to a new report file."""
    print(f"Generating cleaned report: {output_path}")

    with open(output_path, 'w', encoding='utf-8') as f:
        for issue in issues:
            for location in issue.locations:
                f.write(f"{location.file_path}:{location.line}:0: {issue.severity}: {issue.message} [{issue.id}]\n")

    print(f"Cleaned report saved: {output_path}")


def prioritize_issues(issues: List[CppcheckIssue]) -> List[CppcheckIssue]:
    """Sort issues by priority to improve the effectiveness of intelligent selection."""
    def get_priority(issue: CppcheckIssue) -> tuple:
        # Severity priority: error > warning > information > note
        severity_priority = {"error": 0, "warning": 1, "information": 2, "note": 3}
        severity_score = severity_priority.get(issue.severity.lower(), 4)

        # Rule-ID priority: common important issues first
        important_rules = {
            "nullPointer", "uninitvar", "arrayIndexOutOfBounds", "memleak",
            "resourceLeak", "useAfterFree", "doubleFree", "bufferAccessOutOfBounds",
            "unusedVariable", "unusedFunction", "deadcode", "unreachableCode"
        }
        rule_score = 0 if issue.id in important_rules else 1

        # File diversity: prefer issues from different files
        file_name = str(issue.locations[0].file_path) if issue.locations else ""
        file_score = hash(file_name) % 1000  # Simple file hash used to spread selections

        return (severity_score, rule_score, file_score)

    return sorted(issues, key=get_priority)


def analyze_issues_with_context(issues: List[CppcheckIssue]) -> List[Tuple[CppcheckIssue, dict]]:
    """Analyze the context relevance of all issues."""
    print("Analyzing issue context relevance...")

    analyzed_issues = []
    for i, issue in enumerate(issues):
        print(f"Analyzing issue {i+1}/{len(issues)}: {issue.id}")

        primary = issue.locations[0] if issue.locations else None
        if not primary:
            continue

        # Analyze the code context
        code_context = analyze_code_context(primary.file_path, primary.line)

        # Analyze issue relevance
        relevance_analysis = analyze_issue_relevance(issue, code_context)

        analyzed_issues.append((issue, {
            "code_context": code_context,
            "relevance_analysis": relevance_analysis,
            "original_index": i
        }))

    return analyzed_issues
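

# Usage sketch (illustrative only): a minimal example of how these helpers are
# expected to chain together, assuming cppcheck issues have already been parsed
# into CppcheckIssue objects elsewhere (e.g. by the project's parser module).
# The paths and the empty issue list below are hypothetical placeholders.
if __name__ == "__main__":
    demo_root = Path("path/to/project")  # hypothetical project root
    demo_project_info = analyze_project_structure(demo_root)

    demo_issues: List[CppcheckIssue] = []  # parsed cppcheck issues would go here
    kept_issues = filter_and_clean_issues(demo_issues, demo_project_info)
    ordered_issues = prioritize_issues(kept_issues)
    write_cleaned_report(ordered_issues, Path("cleaned_report.txt"))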