"""
代码分析和上下文分析模块
"""
import re
from pathlib import Path
from typing import List, Optional, Tuple, Set
from .models import CppcheckIssue, CodeContext


def analyze_code_context(file_path: Path, target_line: Optional[int] = None, project_root: Optional[Path] = None) -> CodeContext:
    """Analyze the code context in depth to understand structures such as functions, classes and variables."""
    actual_file_path = file_path
    # If the file does not exist and a project root was provided, try to find a matching file
    if not file_path.exists() and project_root:
        filename = file_path.name
        potential_files = list(project_root.glob(f"**/{filename}"))
        if potential_files:
            actual_file_path = potential_files[0]
            print(f"Found matching file: {actual_file_path}")
        else:
            # Still not found: fall back to searching all .cpp files
            cpp_files = list(project_root.glob("**/*.cpp"))
            if cpp_files:
                actual_file_path = cpp_files[0]
                print(f"Using sample file: {actual_file_path}")
    try:
        content = actual_file_path.read_text(encoding="utf-8", errors="replace")
        lines = content.splitlines()
    except Exception as e:
        print(f"Cannot read file {actual_file_path}: {e}")
        return CodeContext(file_path=file_path)

    context = CodeContext(file_path=file_path)

    # Collect #include directives
    for line in lines:
        line = line.strip()
        if line.startswith('#include'):
            include_path = line[8:].strip().strip('"<>')
            context.includes.append(include_path)

    # If a target line was specified, analyze the context around that line
    if target_line and 1 <= target_line <= len(lines):
        target_line_idx = target_line - 1

        # Look backwards for the enclosing function definition
        for i in range(target_line_idx, -1, -1):
            line = lines[i].strip()
            if re.match(r'^\w+.*\s+\w+\s*\([^)]*\)\s*\{?\s*$', line):
                # Extract the function name
                match = re.search(r'(\w+)\s*\([^)]*\)', line)
                if match:
                    context.function_name = match.group(1)
                break

        # Look backwards for the enclosing class definition
        for i in range(target_line_idx, -1, -1):
            line = lines[i].strip()
            if re.match(r'^\s*(class|struct)\s+\w+', line):
                match = re.search(r'(class|struct)\s+(\w+)', line)
                if match:
                    context.class_name = match.group(2)
                break

        # Look backwards for the enclosing namespace
        for i in range(target_line_idx, -1, -1):
            line = lines[i].strip()
            if line.startswith('namespace '):
                match = re.search(r'namespace\s+(\w+)', line)
                if match:
                    context.namespace = match.group(1)
                break

        # Variable context: look for variable declarations near the target line
        start_analysis = max(0, target_line_idx - 20)
        end_analysis = min(len(lines), target_line_idx + 5)
        for i in range(start_analysis, end_analysis):
            line = lines[i].strip()
            # Look for variable declarations
            if re.match(r'^\w+.*\s+\w+\s*[=;]', line) and not re.match(r'^\w+.*\s+\w+\s*\([^)]*\)', line):
                # Extract the variable name
                match = re.search(r'(\w+)\s*[=;]', line)
                if match:
                    context.variable_context.append(match.group(1))

        # Control-flow context: statements leading up to the target line
        for i in range(start_analysis, target_line_idx):
            line = lines[i].strip()
            if any(keyword in line for keyword in ['if', 'for', 'while', 'switch', 'try', 'catch']):
                context.control_flow_context.append(line)

    return context
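
# Usage sketch (illustrative only; the file name, line number and project path
# below are placeholders, not taken from a real run):
#     ctx = analyze_code_context(Path("src/widget.cpp"), target_line=120,
#                                project_root=Path("/path/to/project"))
#     print(ctx.function_name, ctx.class_name, ctx.includes[:3])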


def analyze_issue_relevance(issue: CppcheckIssue, code_context: CodeContext) -> dict:
    """Analyze how the issue relates to its code context and judge whether it is a real problem."""
    relevance_score = 0
    analysis_details = []

    # Relevance based on issue type
    issue_id = issue.id.lower()
    severity = issue.severity.lower()

    # Severity weight
    severity_weights = {"error": 10, "warning": 7, "information": 3, "note": 1}
    relevance_score += severity_weights.get(severity, 0)
    analysis_details.append(f"Severity weight: {severity_weights.get(severity, 0)}")

    # Issue-ID-specific analysis
    if "uninitvar" in issue_id:
        # Uninitialized variable: check whether there is any variable context
        if code_context.variable_context:
            relevance_score += 5
            analysis_details.append("Variable context detected; the uninitialized-variable issue is likely real")
        else:
            relevance_score -= 2
            analysis_details.append("No variable context detected; possibly a false positive")
    elif "nullpointer" in issue_id:
        # Null pointer: check for pointer operations
        if any("ptr" in var.lower() or "*" in var for var in code_context.variable_context):
            relevance_score += 6
            analysis_details.append("Pointer variables detected; the null-pointer issue is likely real")
        else:
            relevance_score -= 1
            analysis_details.append("No obvious pointer operations detected")
    elif "memleak" in issue_id:
        # Memory leak: check for memory allocation
        if any("new" in var.lower() or "malloc" in var.lower() for var in code_context.variable_context):
            relevance_score += 7
            analysis_details.append("Memory allocation detected; the memory-leak issue is likely real")
        else:
            relevance_score -= 2
            analysis_details.append("No memory allocation detected")
    elif "arrayindex" in issue_id or "buffer" in issue_id:
        # Array/buffer issue: check for array operations
        if any("[" in var or "array" in var.lower() for var in code_context.variable_context):
            relevance_score += 6
            analysis_details.append("Array operations detected; the out-of-bounds issue is likely real")
        else:
            relevance_score -= 1
            analysis_details.append("No obvious array operations detected")

    # Function-context analysis
    if code_context.function_name:
        relevance_score += 2
        analysis_details.append(f"Issue is located in function {code_context.function_name}")
    if code_context.class_name:
        relevance_score += 1
        analysis_details.append(f"Issue is located in class {code_context.class_name}")

    # Control-flow analysis
    if code_context.control_flow_context:
        relevance_score += 1
        analysis_details.append(f"Issue sits inside non-trivial control flow with {len(code_context.control_flow_context)} control structures")

    return {
        "relevance_score": relevance_score,
        "is_likely_real": relevance_score >= 5,
        "analysis_details": analysis_details,
        "confidence": min(100, max(0, relevance_score * 10))
    }
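
# Worked example of the scoring above (hypothetical values, for illustration only):
# an "uninitvar" warning (severity weight 7) located in a function (+2) of a class (+1)
# with variable context (+5) scores 15, so is_likely_real is True and confidence is 100;
# a bare "missingInclude" information entry scores only 3 and is treated as a likely false positive.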


def analyze_project_structure(project_root: Path) -> dict:
    """Analyze the project structure to understand code organization and dependency relations."""
    project_info = {
        "root": project_root,
        "source_files": [],
        "header_files": [],
        "include_dirs": [],
        "dependencies": set(),
        "build_files": [],
        "test_files": []
    }
    if not project_root.exists():
        return project_info

    # Find source files
    for pattern in ["**/*.cpp", "**/*.c", "**/*.cc", "**/*.cxx"]:
        project_info["source_files"].extend(project_root.glob(pattern))
    # Find header files
    for pattern in ["**/*.h", "**/*.hpp", "**/*.hxx"]:
        project_info["header_files"].extend(project_root.glob(pattern))
    # Find build files
    for pattern in ["**/CMakeLists.txt", "**/Makefile", "**/*.mk", "**/*.pro", "**/*.vcxproj"]:
        project_info["build_files"].extend(project_root.glob(pattern))
    # Find test files
    for pattern in ["**/test_*.cpp", "**/*_test.cpp", "**/tests/**/*.cpp"]:
        project_info["test_files"].extend(project_root.glob(pattern))

    # Derive include directories from header locations
    include_dirs = set()
    for header_file in project_info["header_files"]:
        include_dirs.add(header_file.parent)
    project_info["include_dirs"] = list(include_dirs)

    # Dependency analysis (simple #include scan)
    dependencies = set()
    for source_file in project_info["source_files"][:10]:  # limit the scan to the first 10 files
        try:
            content = source_file.read_text(encoding="utf-8", errors="replace")
            for line in content.splitlines():
                line = line.strip()
                if line.startswith('#include'):
                    include_path = line[8:].strip().strip('"<>')
                    dependencies.add(include_path)
        except Exception:
            continue
    project_info["dependencies"] = list(dependencies)
    return project_info


def get_enhanced_issue_analysis(issue: CppcheckIssue, project_info: Optional[dict] = None) -> Tuple[CodeContext, dict]:
    """Produce an enhanced issue analysis consisting of code context and relevance analysis."""
    primary = issue.locations[0] if issue.locations else None
    if not primary:
        return CodeContext(file_path=Path("unknown")), {"relevance_score": 0, "is_likely_real": False, "analysis_details": [], "confidence": 0}

    # Analyze the code context
    project_root = project_info.get("root") if project_info else None
    code_context = analyze_code_context(primary.file_path, primary.line, project_root)
    # Analyze issue relevance
    relevance_analysis = analyze_issue_relevance(issue, code_context)

    # If project information is available, go one level deeper
    if project_info:
        # Check whether the file belongs to the project
        if primary.file_path in project_info.get("source_files", []):
            relevance_analysis["relevance_score"] += 2
            relevance_analysis["analysis_details"].append("File is a project source file")
        # Check whether project headers are used
        project_includes = set()
        for include_dir in project_info.get("include_dirs", []):
            for header_file in include_dir.glob("*.h"):
                project_includes.add(header_file.name)
        for include_file in code_context.includes:
            if include_file in project_includes:
                relevance_analysis["relevance_score"] += 1
                relevance_analysis["analysis_details"].append(f"Uses project header: {include_file}")
                break
        # Recompute confidence
        relevance_analysis["confidence"] = min(100, max(0, relevance_analysis["relevance_score"] * 10))
        relevance_analysis["is_likely_real"] = relevance_analysis["relevance_score"] >= 5

    return code_context, relevance_analysis


def extract_issue_context_from_source(issue: CppcheckIssue, project_root: Optional[Path] = None) -> dict:
    """Extract the real code context related to the issue from the original project sources."""
    print(f"Extracting issue context: {issue.id}")
    context = {
        'file_path': None,
        'line_number': None,
        'function_name': None,
        'code_snippet': None,
        'surrounding_code': None,
        'real_issue_context': None
    }
    if not issue.locations:
        print("No location information")
        return context

    primary_location = issue.locations[0]
    context['file_path'] = primary_location.file_path
    context['line_number'] = primary_location.line

    # Try to read the real code from the original project
    source_file = None
    if project_root:
        # Build the full path (handles absolute and relative issue paths)
        if primary_location.file_path.is_absolute():
            source_file = primary_location.file_path
        else:
            source_file = project_root / primary_location.file_path
        # If the file does not exist, search the project root for a file with the same name
        if not source_file.exists():
            filename = primary_location.file_path.name
            print(f"Searching for file: {filename}")
            potential_files = list(project_root.glob(f"**/{filename}"))
            if potential_files:
                source_file = potential_files[0]
                print(f"Found matching file: {source_file}")
            else:
                # Still not found: fall back to searching all .cpp files
                cpp_files = list(project_root.glob("**/*.cpp"))
                if cpp_files:
                    # Use the first .cpp file found as a sample
                    source_file = cpp_files[0]
                    print(f"Using sample file: {source_file}")
                else:
                    print("No .cpp files found")
    else:
        source_file = primary_location.file_path

    if source_file and source_file.exists():
        try:
            print(f"Reading source file: {source_file}")
            # Read the code around the issue line
            from .parsers import read_code_snippet
            code_snippet = read_code_snippet(source_file, primary_location.line, context=20)
            context['code_snippet'] = code_snippet
            context['surrounding_code'] = code_snippet
            print(f"Code snippet read successfully, length: {len(code_snippet)} characters")

            # Improved function-name extraction
            lines = code_snippet.split('\n')
            for line in lines:
                line = line.strip()
                # Look for a function-definition pattern
                if re.match(r'^\w+.*\s+\w+\s*\([^)]*\)\s*\{?\s*$', line):
                    # Extract the function name
                    match = re.search(r'(\w+)\s*\([^)]*\)', line)
                    if match:
                        context['function_name'] = match.group(1)
                    break

            # Build the real issue context
            context['real_issue_context'] = f"""
// Real problem code from the original project
// File: {primary_location.file_path}
// Line: {primary_location.line}
// Issue: {issue.message}
// Original code snippet:
{code_snippet}
"""
        except Exception as e:
            print(f"Warning: cannot read source file {source_file}: {e}")

    return context


def filter_and_clean_issues(issues: List[CppcheckIssue], project_info: Optional[dict] = None) -> List[CppcheckIssue]:
    """Filter and clean the issues, removing unreliable ones."""
    print("Filtering and cleaning issues...")
    cleaned_issues = []
    filtered_count = 0
    for issue in issues:
        # Run the enhanced analysis
        code_context, relevance_analysis = get_enhanced_issue_analysis(issue, project_info)

        # Decide whether to keep the issue based on the analysis results
        should_keep = False
        # 1. Check the relevance score
        if relevance_analysis["relevance_score"] >= 5:
            should_keep = True
        # 2. Check the issue type and drop obvious false positives
        issue_id = issue.id.lower()
        if issue_id in ["missinginclude", "missingincludesystem", "toomanyconfigs",
                        "normalchecklevelmaxbranches", "checklevelnormal", "unknown"]:
            should_keep = False
        # 3. Check the severity: prefer keeping errors and warnings
        if issue.severity.lower() in ["error", "warning"]:
            should_keep = True
        elif issue.severity.lower() in ["information", "note"]:
            # Information and note entries require a higher relevance score
            if relevance_analysis["relevance_score"] >= 7:
                should_keep = True
        # 4. Check whether there is code context
        if code_context.function_name or code_context.class_name:
            should_keep = True

        if should_keep:
            cleaned_issues.append(issue)
        else:
            filtered_count += 1
            print(f"  Filtered issue: {issue.id} - {issue.message[:50]}... (relevance score: {relevance_analysis['relevance_score']})")

    print(f"Issue filtering finished: kept {len(cleaned_issues)} issues, filtered out {filtered_count} unreliable ones")
    return cleaned_issues


def write_cleaned_report(issues: List[CppcheckIssue], output_path: Path) -> None:
    """Write the cleaned issues to a new report file."""
    print(f"Generating cleaned report: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        for issue in issues:
            for location in issue.locations:
                f.write(f"{location.file_path}:{location.line}:0: {issue.severity}: {issue.message} [{issue.id}]\n")
    print(f"Cleaned report saved: {output_path}")


def prioritize_issues(issues: List[CppcheckIssue]) -> List[CppcheckIssue]:
    """Sort issues by priority to improve the effectiveness of smart selection."""
    def get_priority(issue: CppcheckIssue) -> tuple:
        # Severity priority: error > warning > information > note
        severity_priority = {"error": 0, "warning": 1, "information": 2, "note": 3}
        severity_score = severity_priority.get(issue.severity.lower(), 4)
        # Rule-ID priority: common important rules come first
        important_rules = {
            "nullPointer", "uninitvar", "arrayIndexOutOfBounds", "memleak",
            "resourceLeak", "useAfterFree", "doubleFree", "bufferAccessOutOfBounds",
            "unusedVariable", "unusedFunction", "deadcode", "unreachableCode"
        }
        rule_score = 0 if issue.id in important_rules else 1
        # File diversity: prefer issues from different files
        file_name = str(issue.locations[0].file_path) if issue.locations else ""
        file_score = hash(file_name) % 1000  # simple file-name hash used to spread selections across files
        return (severity_score, rule_score, file_score)

    return sorted(issues, key=get_priority)
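
# Typical chaining of the helpers in this module (sketch; `issues` and `root` are
# assumed to come from the cppcheck report parser and CLI arguments elsewhere in
# the package, and the slice size 20 is an arbitrary example):
#     project_info = analyze_project_structure(root)
#     cleaned = filter_and_clean_issues(issues, project_info)
#     top_issues = prioritize_issues(cleaned)[:20]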


def analyze_issues_with_context(issues: List[CppcheckIssue]) -> List[Tuple[CppcheckIssue, dict]]:
    """Analyze the context relevance of all issues."""
    print("Analyzing issue context relevance...")
    analyzed_issues = []
    for i, issue in enumerate(issues):
        print(f"Analyzing issue {i+1}/{len(issues)}: {issue.id}")
        primary = issue.locations[0] if issue.locations else None
        if not primary:
            continue
        # Analyze the code context
        code_context = analyze_code_context(primary.file_path, primary.line)
        # Analyze issue relevance
        relevance_analysis = analyze_issue_relevance(issue, code_context)
        analyzed_issues.append((issue, {
            "code_context": code_context,
            "relevance_analysis": relevance_analysis,
            "original_index": i
        }))
    return analyzed_issues