diff --git a/.claude/settings.json b/.claude/settings.json index f6981373..114ebe8d 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -14,13 +14,19 @@ "type": "command", "command": "bash .claude/hooks/block-destructive.sh", "timeout": 5000 + }, + { + "matcher": "Bash(git commit *)", + "type": "prompt", + "prompt": "在提交之前,请确认优化记录是否已更新。如果本次会话做了任何有效的编译器优化(指令数减少、运行时间降低、功能改进),必须在 优化记录.md 中添加或更新条目。条目格式见该文件顶部。如未做优化或本次提交不涉及编译器改动,可忽略。", + "timeout": 10000 } ], "Stop": [ { "matcher": "", "type": "prompt", - "prompt": "在结束本次会话之前,请确认以下事项:\n1. 快门禁是否通过?(./2026test.sh -c functional -x && ./2026test.sh -c h_functional -x)\n2. 指令数是否有退化?(如有优化改动,跑 ./count_asm.sh)\n3. 是否有未提交的改动需要处理?\n4. 如有重要经验教训,是否已写入 memory?\n\n请简要回答每个问题(是/否/不适用),然后正常结束。", + "prompt": "在结束本次会话之前,请确认以下事项:\n1. 快门禁是否通过?(./2026test.sh -c functional -x && ./2026test.sh -c h_functional -x)\n2. 指令数是否有退化?(如有优化改动,跑 ./count_asm.sh)\n3. 本次有效的优化是否已记录到 优化记录.md?\n4. 是否有未提交的改动需要处理?\n5. 如有重要经验教训,是否已写入 memory?\n\n请简要回答每个问题(是/否/不适用),然后正常结束。", "timeout": 15000 } ] diff --git a/src/include/mir/MIR.h b/src/include/mir/MIR.h index 134c67a4..41b96e4b 100644 --- a/src/include/mir/MIR.h +++ b/src/include/mir/MIR.h @@ -147,6 +147,8 @@ namespace mir StoreMem, AddRR, SubRR, + AddImm, + SubImm, MulRR, DivRR, ModRR, diff --git a/src/mir/AsmPrinter.cpp b/src/mir/AsmPrinter.cpp index 550b0261..827b053f 100644 --- a/src/mir/AsmPrinter.cpp +++ b/src/mir/AsmPrinter.cpp @@ -42,8 +42,10 @@ namespace mir case Opcode::StoreStack: return "stur"; case Opcode::AddRR: + case Opcode::AddImm: return "add"; case Opcode::SubRR: + case Opcode::SubImm: return "sub"; case Opcode::MulRR: return "mul"; diff --git a/src/mir/Lowering.cpp b/src/mir/Lowering.cpp index 1252c6be..33b96b6f 100644 --- a/src/mir/Lowering.cpp +++ b/src/mir/Lowering.cpp @@ -695,10 +695,30 @@ namespace mir } } - block.Append(opcode, - {Operand::VReg(dst, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::VReg(rhs, VRegClass::Int)}); + // Add/Sub 常量折叠到立即数操作码 + int rhs_imm_val; + bool rhs_is_imm = false; + if ((opcode == Opcode::AddRR || opcode == Opcode::SubRR) && + bin->GetRhs() && TryGetConstantInt(bin->GetRhs(), rhs_imm_val) && + rhs_imm_val >= 0 && rhs_imm_val <= 4095) + { + rhs_is_imm = true; + if (opcode == Opcode::AddRR) + opcode = Opcode::AddImm; + else + opcode = Opcode::SubImm; + block.Append(opcode, + {Operand::VReg(dst, VRegClass::Int), + Operand::VReg(lhs, VRegClass::Int), + Operand::Imm(rhs_imm_val)}); + } + else + { + block.Append(opcode, + {Operand::VReg(dst, VRegClass::Int), + Operand::VReg(lhs, VRegClass::Int), + Operand::VReg(rhs, VRegClass::Int)}); + } value_vregs[value] = dst; return dst; } diff --git a/src/mir/RegAlloc.cpp b/src/mir/RegAlloc.cpp index a8c85d5f..0c9a55c0 100644 --- a/src/mir/RegAlloc.cpp +++ b/src/mir/RegAlloc.cpp @@ -119,6 +119,8 @@ namespace mir case Opcode::AddRR: case Opcode::SubRR: + case Opcode::AddImm: + case Opcode::SubImm: case Opcode::MulRR: case Opcode::DivRR: case Opcode::ModRR: diff --git a/优化记录.md b/优化记录.md new file mode 100644 index 00000000..384e987a --- /dev/null +++ b/优化记录.md @@ -0,0 +1,36 @@ +# 优化记录 + +本文档追踪编译器的所有有效优化,用于答辩展示和技术积累。 + +## 记录格式 + +每条优化记录包含:日期、优化名称、类型(IR/MIR/后端)、假设、实现摘要、指令数效果、QEMU 时间效果、已知局限。 + +--- + +## 2026-05-25 | CmpImm 常量折叠 + +- **类型**:后端(MIR 降级) +- **假设**:ICmp 降级时,操作数为常量(0-4095)直接用 CmpImm,消除冗余 MovImm +- **实现**:Lowering.cpp 两个 ICmp 降级路径中,检查操作数是否为常量。RHS 常量 → CmpImm;LHS 常量 → CmpImm + SwapCondCode +- **新增代码**:SwapCondCode 辅助函数(18 行),两个降级路径各约 30 行 +- **指令数效果**(20 个代表性用例):减少 91 条(-1.1%),matmul -15(-3.8%)、huffman -25(-3.1%)、crypto -23(-1.2%) +- **退化**:h-5 +1(+0.3%),由寄存器分配差异导致,在容忍范围内 +- **功能测试**:100/100 functional 通过,39/40 h_functional 通过(1 个预存故障 30_many_dimensions) +- **已知局限**:仅处理 0-4095 范围的立即数;浮点比较未覆盖 + +--- + +## 2026-05-25 | AddImm/SubImm 立即数折叠 + +- **类型**:后端(MIR 降级 + 新操作码) +- **假设**:AArch64 add/sub 支持 12 位立即数,但 MIR 只有 AddRR/SubRR,导致 `mov #imm; add/sub dst, src, tmp` 浪费 1 条指令。添加 AddImm/SubImm 操作码消除冗余 MovImm +- **实现**: + - MIR.h:新增 AddImm、SubImm 操作码 + - Lowering.cpp:Add/Sub 降级时 RHS 为 0-4095 常量 → AddImm/SubImm + - RegAlloc.cpp:AddImm/SubImm 加入 AddRR/SubRR 同一处理分支 + - AsmPrinter.cpp:通用三操作数打印机自动处理 Imm 操作数(`#value`) +- **指令数效果**(全部 60 个性能用例):减少 55 条,sl1-3 -14(-5.4%)、huffman-01-03 -2(-0.3%)、h-5-01-03 -3(-0.9%) +- **退化**:无 +- **功能测试**:87/88 functional 通过(1 个预存故障 87_many_params)、30/31 h_functional 通过(1 个预存故障 30_many_dimensions) +- **已知局限**:仅处理 IR 中直接常量操作数;经 vreg 传递的常量需 ConstProp 配合才能折叠;仅 0-4095 范围 diff --git a/指令数基线.md b/指令数基线.md index 29f24025..e2640a91 100644 --- a/指令数基线.md +++ b/指令数基线.md @@ -47,18 +47,18 @@ | performance/h-4-01 | 163 | | performance/h-4-02 | 163 | | performance/h-4-03 | 163 | -| performance/h-5-01 | 341 | -| performance/h-5-02 | 341 | -| performance/h-5-03 | 341 | +| performance/h-5-01 | 338 | +| performance/h-5-02 | 338 | +| performance/h-5-03 | 338 | | performance/h-8-01 | 411 | | performance/h-8-02 | 411 | | performance/h-8-03 | 411 | | performance/h-9-01 | 224 | | performance/h-9-02 | 224 | | performance/h-9-03 | 224 | -| performance/huffman-01 | 792 | -| performance/huffman-02 | 792 | -| performance/huffman-03 | 792 | +| performance/huffman-01 | 790 | +| performance/huffman-02 | 790 | +| performance/huffman-03 | 790 | | performance/knapsack_naive-1 | 167 | | performance/knapsack_naive-2 | 167 | | performance/knapsack_naive-3 | 167 | @@ -74,9 +74,9 @@ | performance/shuffle0 | 471 | | performance/shuffle1 | 471 | | performance/shuffle2 | 471 | -| performance/sl1 | 261 | -| performance/sl2 | 261 | -| performance/sl3 | 261 | +| performance/sl1 | 247 | +| performance/sl2 | 247 | +| performance/sl3 | 247 | | performance/transpose0 | 204 | | performance/transpose1 | 204 | | performance/transpose2 | 204 |