#!/usr/bin/env python3
"""
Adapter that converts cppcheck_test_generator outputs into the unified Issue schema.
Typical usage:
python3 cppcheck_adapter.py \
--report /path/to/cppcheck_report.xml \
--issues-dir /path/to/cppcheck_tests \
--output unified_report/cppcheck_issues.json \
--verification /path/to/cppcheck_tests/verification_results.json
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
# Ensure we can import the cppcheck_test_generator package: prepend-style path
# fix so the script works regardless of the caller's working directory.
# parents[1] is the repository root (one level above this script's directory).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
# noqa: E402 — these project imports must come after the sys.path manipulation above.
from cppcheck_test_generator.models import CppcheckIssue # type: ignore # noqa: E402
from cppcheck_test_generator.parsers import parse_cppcheck_xml # type: ignore # noqa: E402
# cppcheck severity string -> unified severity level. Unmapped values fall
# back to the upper-cased original (see _normalize_severity).
SEVERITY_MAP = {
    "error": "HIGH",
    "warning": "MEDIUM",
    "style": "LOW",
    "performance": "LOW",
    "portability": "LOW",
    "information": "INFO",
    "note": "INFO",
}
# Baseline CVSS score assigned per unified severity level (unknown -> 0.0).
CVSS_BASE = {
    "CRITICAL": 9.0,
    "HIGH": 8.0,
    "MEDIUM": 6.5,
    "LOW": 3.5,
    "INFO": 0.0,
}
# Lower-cased cppcheck rule id -> coarse unified category ("logic_bug" default).
CATEGORY_MAP = {
    "memleak": "resource_management",
    "nullpointer": "memory_safety",
    "arrayindexoutofbounds": "memory_safety",
    "doublefree": "memory_safety",
    "useafterfree": "memory_safety",
    "uninitvar": "logic_bug",
    "zerodiv": "logic_bug",
    "mismatchallocdealloc": "resource_management",
}
# Per-rule impact description (Chinese, user-facing) used when the markdown
# analysis has no impact section. These are runtime strings — do not translate.
IMPACT_HINT = {
    "memleak": "持续泄漏会耗尽资源,导致服务性能下降或崩溃。",
    "nullpointer": "空指针解引用可能导致进程崩溃,可被用作拒绝服务。",
    "arrayindexoutofbounds": "数组越界访问可能破坏内存,造成未定义行为或远程代码执行。",
    "doublefree": "重复释放可能被利用进行堆喷射,从而执行任意代码。",
    "useafterfree": "释放后继续使用指针可能导致信息泄露或执行任意代码。",
    "uninitvar": "使用未初始化变量会导致不可预测行为或安全边界被绕过。",
    "zerodiv": "除零错误可能导致服务崩溃。",
}
# Per-rule CWE reference URLs used when the markdown provides no references.
REFERENCE_MAP = {
    "memleak": ["https://cwe.mitre.org/data/definitions/401.html"],
    "nullpointer": ["https://cwe.mitre.org/data/definitions/476.html"],
    "arrayindexoutofbounds": ["https://cwe.mitre.org/data/definitions/119.html"],
    "doublefree": ["https://cwe.mitre.org/data/definitions/415.html"],
    "useafterfree": ["https://cwe.mitre.org/data/definitions/416.html"],
    "uninitvar": ["https://cwe.mitre.org/data/definitions/457.html"],
    "zerodiv": ["https://cwe.mitre.org/data/definitions/369.html"],
}
@dataclass
class Issue:
    """One unified issue record; serialized to JSON via dataclasses.asdict."""

    id: str                       # e.g. "CPPC-<rule>-<seq>" (see _build_issue)
    source: Dict[str, Any]        # engine/tool provenance and raw report path
    basic: Dict[str, Any]         # title, type, cwe, category
    location: Dict[str, Any]      # file, line, optional snippet
    severity: Dict[str, Any]      # level, cvss, cvss_vector
    status: Dict[str, Any]        # state plus what confirmed it
    description: Dict[str, Any]   # summary and details
    reproduce: Dict[str, Any]     # steps, inputs, artifacts
    root_cause: Dict[str, Any]    # short statement and technical details
    impact: Dict[str, Any]        # technical and business impact
    fix: Dict[str, Any]           # recommendation, patch hint, references
def _normalize_severity(raw: str) -> str:
    """Map a cppcheck severity string to the unified scale.

    Empty input yields "INFO"; unrecognized severities pass through upper-cased.
    """
    if raw:
        lowered = raw.lower()
        return SEVERITY_MAP[lowered] if lowered in SEVERITY_MAP else raw.upper()
    return "INFO"
def _cvss_for(severity: str) -> float:
    """Return the baseline CVSS score for a unified severity level (0.0 if unknown)."""
    try:
        return CVSS_BASE[severity]
    except KeyError:
        return 0.0
def _category_for(issue: CppcheckIssue) -> str:
    """Classify a cppcheck rule id into a coarse category ("logic_bug" default)."""
    return CATEGORY_MAP.get(issue.id.lower(), "logic_bug")
def _impact_for(issue: CppcheckIssue) -> str:
    """Return the canned impact description for the rule, or a generic fallback."""
    hint = IMPACT_HINT.get(issue.id.lower())
    return hint if hint is not None else "可能影响系统稳定性与可用性。"
def _references_for(issue: CppcheckIssue) -> List[str]:
    """Return CWE reference links for the rule id (empty list when unmapped)."""
    key = issue.id.lower()
    if key in REFERENCE_MAP:
        return REFERENCE_MAP[key]
    return []
def _parse_markdown_sections(text: str) -> Dict[str, str]:
sections: Dict[str, str] = {}
current = "body"
buffer: List[str] = []
heading_pattern = re.compile(r"^(#{1,6})\s+(.*)")
def flush():
if buffer:
sections[current] = "\n".join(buffer).strip()
elif current not in sections:
sections[current] = ""
for line in text.splitlines():
match = heading_pattern.match(line.strip())
if match:
flush()
current = match.group(2).strip().lower()
buffer = []
else:
buffer.append(line)
flush()
return sections
def _extract_section(sections: Dict[str, str], keywords: Iterable[str]) -> Optional[str]:
for key in keywords:
key_lower = key.lower()
for section_key, value in sections.items():
if key_lower in section_key:
return value.strip()
return None
def _extract_list(text: Optional[str]) -> List[str]:
if not text:
return []
items = []
for line in text.splitlines():
stripped = line.strip()
if not stripped:
continue
if stripped[0] in ("-", "*"):
stripped = stripped[1:].strip()
items.append(stripped)
return items
def _load_verification_map(verification_path: Optional[Path], issues_dir: Path) -> Dict[str, Dict[str, Any]]:
mapping: Dict[str, Dict[str, Any]] = {}
def record(entry: Dict[str, Any]) -> None:
file_name = entry.get("file")
if not file_name:
return
key = Path(file_name).stem # e.g., issue_001_memleak
mapping[key] = entry
if verification_path and verification_path.exists():
try:
data = json.loads(verification_path.read_text(encoding="utf-8"))
for entry in data.get("results", []):
record(entry)
except Exception:
pass
# Also load per-issue verification JSON files if present
for json_file in issues_dir.glob("verification_*.json"):
try:
entry = json.loads(json_file.read_text(encoding="utf-8"))
record(entry)
except Exception:
continue
return mapping
def _match_issue_files(issues_dir: Path) -> Dict[str, Tuple[Path, Optional[Path]]]:
mapping: Dict[str, Tuple[Path, Optional[Path]]] = {}
for md_file in sorted(issues_dir.glob("issue_*_*.md")):
base = md_file.stem # issue_001_rule
parts = base.split("_", 2)
if len(parts) < 3:
continue
issue_id = parts[2]
cpp_path = md_file.with_suffix(".cpp")
mapping[issue_id.lower()] = (md_file, cpp_path if cpp_path.exists() else None)
return mapping
def _build_issue(
    cpp_issue: CppcheckIssue,
    md_path: Path,
    cpp_path: Optional[Path],
    verification_info: Optional[Dict[str, Any]],
    report_path: Path,
) -> Issue:
    """Assemble one unified Issue from a cppcheck finding and its artifacts.

    Args:
        cpp_issue: Parsed cppcheck finding from the XML report.
        md_path: Markdown analysis file generated for this finding.
        cpp_path: Generated C++ test case, or None if absent.
        verification_info: Optional verification entry for this finding.
        report_path: Original cppcheck XML report path (recorded as provenance).

    Returns:
        Fully populated Issue dataclass instance.
    """
    severity_level = _normalize_severity(cpp_issue.severity)
    # ID embeds the rule and the sequence number taken from the markdown
    # filename (stem "issue_001_<rule>" -> "001").
    issue_id = f"CPPC-{cpp_issue.id}-{md_path.stem.split('_')[1]}"
    # Only the first reported location is carried into the unified record.
    location = cpp_issue.locations[0] if cpp_issue.locations else None
    sections = _parse_markdown_sections(md_path.read_text(encoding="utf-8"))
    # Pull named sections from the markdown; headings may be Chinese or English.
    description_section = _extract_section(sections, ["漏洞描述", "问题描述", "description"])
    repro_section = _extract_section(sections, ["复现步骤", "重现步骤", "reproduction"])
    root_cause_section = _extract_section(sections, ["根本原因", "原因分析", "root cause"])
    impact_section = _extract_section(sections, ["潜在影响", "影响", "impact"])
    fix_section = _extract_section(sections, ["修复建议", "修复方案", "mitigation"])
    reference_section = _extract_section(sections, ["参考链接", "references"])
    confirmed = False
    confirmed_by: List[str] = []
    if verification_info:
        # Either flag counts as confirmation by the generated test.
        confirmed = bool(
            verification_info.get("vulnerability_confirmed")
            or verification_info.get("triggers_cppcheck")
        )
        if confirmed:
            confirmed_by.append("generated_test")
    # Fall back to generic reproduction instructions when the markdown has none.
    reproduce_steps = _extract_list(repro_section) or [
        f"参阅 {md_path.name} 中的复现说明。",
        "编译并运行对应的测试用例以验证漏洞。",
    ]
    if cpp_path:
        reproduce_steps.append(f"测试用例: {cpp_path}")
    artifacts: Dict[str, Any] = {"analysis_markdown": str(md_path)}
    if cpp_path:
        artifacts["generated_test"] = str(cpp_path)
    if verification_info:
        artifacts["verification"] = verification_info
    return Issue(
        id=issue_id,
        source={
            "engine": "cppcheck_ai",
            "sub_tool": "cppcheck",
            "raw_ids": [cpp_issue.id],
            "report_path": str(report_path),
        },
        basic={
            "title": f"{cpp_issue.id} - {location.file_path if location else '未知文件'}",
            "type": cpp_issue.id,
            "cwe": None,  # not derivable from the cppcheck report; left unset
            "category": _category_for(cpp_issue),
        },
        location={
            "file": str(location.file_path) if location else None,
            "function": None,
            "line": location.line if location else None,
            "column": None,
            "snippet": description_section or cpp_issue.message,
        },
        severity={
            "level": severity_level,
            "cvss": _cvss_for(severity_level),
            "cvss_vector": None,
        },
        status={
            "state": "confirmed" if confirmed else "new",
            "confirmed_by": confirmed_by,
            "first_seen": None,
            "last_seen": None,
        },
        description={
            "summary": cpp_issue.message,
            # Prefer the markdown description; fall back to the pre-heading
            # "body" text, then to the raw cppcheck message.
            "details": description_section or sections.get("body", cpp_issue.message),
        },
        reproduce={
            "steps": reproduce_steps,
            "inputs": {},
            "artifacts": artifacts,
        },
        root_cause={
            "short": root_cause_section or cpp_issue.message,
            "technical_details": root_cause_section or "",
        },
        impact={
            "technical": impact_section or _impact_for(cpp_issue),
            "business": "可能影响系统稳定性与可用性。",
        },
        fix={
            "recommendation": _extract_list(fix_section) or ["参考安全开发规范修复该漏洞。"],
            "code_patch_hint": fix_section or "",
            "references": _extract_list(reference_section) or _references_for(cpp_issue),
        },
    )
def convert(report_path: Path, issues_dir: Path, output_path: Path, verification_path: Optional[Path]) -> None:
    """Run the full conversion and write the unified issues JSON.

    Parses the cppcheck XML report, pairs findings with their generated
    markdown/cpp artifacts and verification results, builds unified Issue
    records, and dumps them (UTF-8, non-ASCII preserved) to output_path.
    """
    report_issues = {item.id.lower(): item for item in parse_cppcheck_xml(report_path)}
    verification_map = _load_verification_map(verification_path, issues_dir)

    def find_report_issue(rule_key: str):
        # Exact id match first; otherwise retry with non-alphanumerics removed
        # from both sides (e.g. "use_after_free" vs "useAfterFree").
        candidate = report_issues.get(rule_key)
        if not candidate:
            squashed = re.sub(r"[^a-z0-9]", "", rule_key)
            candidate = next(
                (
                    item
                    for key, item in report_issues.items()
                    if re.sub(r"[^a-z0-9]", "", key) == squashed
                ),
                None,
            )
        return candidate

    converted: List[Issue] = []
    for rule_key, (md_path, cpp_path) in _match_issue_files(issues_dir).items():
        report_issue = find_report_issue(rule_key)
        if not report_issue:
            print(f"[cppcheck_adapter] 跳过 {md_path.name}: 在报告中找不到对应的 issue id")
            continue
        converted.append(
            _build_issue(
                report_issue,
                md_path,
                cpp_path,
                verification_map.get(md_path.stem.replace(".md", "")),
                report_path,
            )
        )
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as handle:
        json.dump([asdict(entry) for entry in converted], handle, ensure_ascii=False, indent=2)
    print(f"[cppcheck_adapter] Converted {len(converted)} issues -> {output_path}")
def main() -> None:
    """CLI entry point: parse the adapter's arguments and run the conversion."""
    cli = argparse.ArgumentParser(description="Convert cppcheck_test_generator outputs to unified issues.")
    cli.add_argument("--report", type=Path, required=True, help="Path to cppcheck XML report.")
    cli.add_argument("--issues-dir", type=Path, required=True, help="Directory containing generated issue markdown/cpp files.")
    cli.add_argument("--output", type=Path, required=True, help="Path to write unified issues JSON.")
    cli.add_argument("--verification", type=Path, help="Optional verification_results.json path.")
    options = cli.parse_args()
    convert(options.report, options.issues_dir, options.output, options.verification)
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()