diff --git a/src/include/mir/MIR.h b/src/include/mir/MIR.h
index 41b96e4b..8256978a 100644
--- a/src/include/mir/MIR.h
+++ b/src/include/mir/MIR.h
@@ -333,6 +333,9 @@ namespace mir
         int GetFrameSize() const { return frame_size_; }
         void SetFrameSize(int size) { frame_size_ = size; }
 
+        bool HasCall() const { return has_call_; }
+        void SetHasCall(bool v = true) { has_call_ = v; }
+
         int CreateVReg(VRegClass vreg_class);
         VRegClass GetVRegClass(int vreg_id) const;
         int GetNumVRegs() const { return static_cast(vreg_classes_.size()); }
@@ -347,6 +350,7 @@ namespace mir
 
         std::vector frame_slots_;
         int frame_size_ = 0;
+        bool has_call_ = false;
         int next_label_id_ = 0;
 
         std::vector vreg_classes_;
diff --git a/src/mir/AsmPrinter.cpp b/src/mir/AsmPrinter.cpp
index 596fafef..664cb82f 100644
--- a/src/mir/AsmPrinter.cpp
+++ b/src/mir/AsmPrinter.cpp
@@ -456,12 +456,31 @@ namespace mir
         case Opcode::Prologue:
         {
             const auto &cs_regs = function.GetCalleeSavedRegs();
-            os << " stp x29, x30, [sp, #-16]!\n";
+            const bool is_leaf = !function.HasCall();
+            const bool no_frame = (function.GetFrameSize() == 0 && cs_regs.empty());
+
+            // Leaf function with no frame and no callee-saved registers: skip frame setup entirely
+            if (is_leaf && no_frame)
+            {
+                return;
+            }
+
+            // Leaf functions store only x29 (LR is never clobbered) but still reserve a 16-byte slot so SP stays 16-byte aligned; non-leaf functions save x29+x30
+            if (is_leaf)
+            {
+                os << " str x29, [sp, #-16]!\n";
+            }
+            else
+            {
+                os << " stp x29, x30, [sp, #-16]!\n";
+            }
             os << " mov x29, sp\n";
+
             if (function.GetFrameSize() > 0)
             {
                 EmitStackAdjust("sub", function.GetFrameSize(), os);
             }
+
             // X(64-bit) 和 S(32-bit) 分两组配对 stp
             std::vector x_regs, s_regs;
             for (auto r : cs_regs)
@@ -505,6 +524,17 @@ namespace mir
         case Opcode::Epilogue:
         {
             const auto &cs_regs = function.GetCalleeSavedRegs();
+            const bool is_leaf = !function.HasCall();
+            const bool no_frame = (function.GetFrameSize() == 0 && cs_regs.empty());
+
+            // Leaf function with no frame and no callee-saved registers: nothing to undo, return directly
+            if (is_leaf && no_frame)
+            {
+                os << " ret\n";
+                return;
+            }
+
+            // Restore callee-saved registers (leaf functions need this too: they belong to the caller)
             std::vector x_regs, s_regs;
             for (auto r : cs_regs)
             {
@@ -513,7 +543,7 @@ namespace mir
                 else if (r >= PhysReg::S0 && r <= PhysReg::S31)
                     s_regs.push_back(r);
                 else
-                    x_regs.push_back(r); // 兜底:非 X 非 S 按 X 处理
+                    x_regs.push_back(r); // Fallback: a register that is neither X nor S is handled as X
             }
             int cs_offset = 0;
             for (size_t i = 0; i + 1 < x_regs.size(); i += 2)
@@ -541,11 +571,20 @@ namespace mir
                 os << " ldr " << PhysRegName(s_regs.back())
                    << ", [sp, #" << cs_offset << "]\n";
             }
+
             if (function.GetFrameSize() > 0)
             {
                 EmitStackAdjust("add", function.GetFrameSize(), os);
             }
-            os << " ldp x29, x30, [sp], #16\n";
+
+            if (is_leaf)
+            {
+                os << " ldr x29, [sp], #16\n"; // pops the 16-byte slot reserved by the leaf prologue
+            }
+            else
+            {
+                os << " ldp x29, x30, [sp], #16\n";
+            }
             os << " ret\n";
             return;
         }
diff --git a/src/mir/Lowering.cpp b/src/mir/Lowering.cpp
index 33b96b6f..806221f3 100644
--- a/src/mir/Lowering.cpp
+++ b/src/mir/Lowering.cpp
@@ -1764,6 +1764,7 @@ namespace mir
         }
 
         block.Append(Opcode::Call, {Operand::Symbol(callee->GetName())});
+        function.SetHasCall();
 
         if (aligned_stack_arg_bytes > 0)
         {
diff --git a/优化记录.md b/优化记录.md
index 0e46a4c4..f0da551d 100644
--- a/优化记录.md
+++ b/优化记录.md
@@ -69,3 +69,18 @@
 - **退化**:无
 - **功能测试**:100/100 functional 通过,30/31 h_functional 通过(1 个预存故障 30_many_dimensions)
 - **已知局限**:仅缓存 x13 上的 ADRP;LoadGlobalAddr 使用其他寄存器时不参与缓存;同一基本块内优化最有效
+
+---
+
+## 2026-05-25 | 叶函数帧设置优化
+
+- **类型**:后端(AsmPrinter + Lowering)
+- **假设**:叶函数(无 Call 指令)不需要保存/恢复 x30(LR 不会被修改)。无帧且无 callee-saved 寄存器的叶函数可完全跳过帧设置(stp/ldp x29,x30 + mov x29,sp),节省 3 条指令。有帧叶函数改用 str/ldr x29 替代 stp/ldp x29,x30(仍预留 16 字节槽位以保持 SP 16 字节对齐),省去 x30 的访存
+- **实现**:
+  - MIR.h:MachineFunction 新增 has_call_ 字段 + HasCall()/SetHasCall()
+  - Lowering.cpp:每次发射 Call 指令时标记 function.SetHasCall()
+  - AsmPrinter.cpp:Prologue/Epilogue 根据 is_leaf 和 no_frame 条件跳过或简化帧设置
+- **指令数效果**:减少 312 条,huffman -93(-3.9%)、crypto -54(-2.8%)、conv2d -45(-2.3%)、crc -27(-3.2%)、h-9 -27(-4.1%)、03_sort -18(-0.9%)、opt_scheduling -18(-5.2%)、h-4 -12(-2.5%)、fft -9(-0.5%)、shuffle -9(-0.7%)
+- **退化**:无
+- **功能测试**:100/100 functional 通过,30/31 h_functional 通过(1 个预存故障 30_many_dimensions)
+- **已知局限**:仅对无帧且无 callee-saved 寄存器的叶函数完全跳过帧设置;有 callee-saved 的叶函数仍需保存它们(属于调用者)
diff --git a/指令数基线.md b/指令数基线.md
index babfd964..b13169d5 100644
--- a/指令数基线.md
+++ b/指令数基线.md
@@ -23,42 +23,42 @@
 | performance/01_mm1 | 309 |
 | performance/01_mm2 | 309 |
 | performance/01_mm3 | 309 |
-| performance/03_sort1 | 639 |
-| performance/03_sort2 | 639 |
-| performance/03_sort3 | 639 |
-| performance/conv2d-1 | 649 |
-| performance/conv2d-2 | 649 |
-| performance/conv2d-3 | 649 |
-| performance/crc1 | 279 |
-| performance/crc2 | 279 |
-| performance/crc3 | 279 |
-| performance/crypto-1 | 1910 |
-| performance/crypto-2 | 1910 |
-| performance/crypto-3 | 1910 |
-| performance/fft0 | 591 |
-| performance/fft1 | 591 |
-| performance/fft2 | 591 |
+| performance/03_sort1 | 633 |
+| performance/03_sort2 | 633 |
+| performance/03_sort3 | 633 |
+| performance/conv2d-1 | 634 |
+| performance/conv2d-2 | 634 |
+| performance/conv2d-3 | 634 |
+| performance/crc1 | 270 |
+| performance/crc2 | 270 |
+| performance/crc3 | 270 |
+| performance/crypto-1 | 1892 |
+| performance/crypto-2 | 1892 |
+| performance/crypto-3 | 1892 |
+| performance/fft0 | 588 |
+| performance/fft1 | 588 |
+| performance/fft2 | 588 |
 | performance/h-1-01 | 157 |
 | performance/h-1-02 | 157 |
 | performance/h-1-03 | 157 |
 | performance/h-10-01 | 327 |
 | performance/h-10-02 | 327 |
 | performance/h-10-03 | 327 |
-| performance/h-4-01 | 162 |
-| performance/h-4-02 | 162 |
-| performance/h-4-03 | 162 |
+| performance/h-4-01 | 158 |
+| performance/h-4-02 | 158 |
+| performance/h-4-03 | 158 |
 | performance/h-5-01 | 338 |
 | performance/h-5-02 | 338 |
 | performance/h-5-03 | 338 |
 | performance/h-8-01 | 410 |
 | performance/h-8-02 | 410 |
 | performance/h-8-03 | 410 |
-| performance/h-9-01 | 221 |
-| performance/h-9-02 | 221 |
-| performance/h-9-03 | 221 |
-| performance/huffman-01 | 787 |
-| performance/huffman-02 | 787 |
-| performance/huffman-03 | 787 |
+| performance/h-9-01 | 212 |
+| performance/h-9-02 | 212 |
+| performance/h-9-03 | 212 |
+| performance/huffman-01 | 756 |
+| performance/huffman-02 | 756 |
+| performance/huffman-03 | 756 |
 | performance/knapsack_naive-1 | 167 |
 | performance/knapsack_naive-2 | 167 |
 | performance/knapsack_naive-3 | 167 |
@@ -68,12 +68,12 @@
 | performance/matmul1 | 379 |
 | performance/matmul2 | 379 |
 | performance/matmul3 | 379 |
-| performance/optimization_scheduling1 | 116 |
-| performance/optimization_scheduling2 | 116 |
-| performance/optimization_scheduling3 | 116 |
-| performance/shuffle0 | 455 |
-| performance/shuffle1 | 455 |
-| performance/shuffle2 | 455 |
+| performance/optimization_scheduling1 | 110 |
+| performance/optimization_scheduling2 | 110 |
+| performance/optimization_scheduling3 | 110 |
+| performance/shuffle0 | 452 |
+| performance/shuffle1 | 452 |
+| performance/shuffle2 | 452 |
 | performance/sl1 | 247 |
 | performance/sl2 | 247 |
 | performance/sl3 | 247 |