perf(backend): 叶函数跳过帧设置,节省 x29/x30 保存/恢复

MachineFunction 添加 HasCall 标记,Lowering 在发射 Call 时设置。
无帧且无 callee-saved 寄存器的叶函数完全跳过 prologue/epilogue;
有帧叶函数改用 str/ldr x29 替代 stp/ldp x29,x30。

huffman -93, crypto -54, conv2d -45, crc -27, h-9 -27,
03_sort -18, opt_scheduling -18, h-4 -12, fft -9, shuffle -9。
总计 -312 条,零退化。
lzk
lzkk 6 days ago
parent 854168fb4e
commit befdca6451

@ -333,6 +333,9 @@ namespace mir
int GetFrameSize() const { return frame_size_; }
void SetFrameSize(int size) { frame_size_ = size; }
bool HasCall() const { return has_call_; }
void SetHasCall(bool v = true) { has_call_ = v; }
int CreateVReg(VRegClass vreg_class);
VRegClass GetVRegClass(int vreg_id) const;
int GetNumVRegs() const { return static_cast<int>(vreg_classes_.size()); }
@ -347,6 +350,7 @@ namespace mir
std::vector<FrameSlot> frame_slots_;
int frame_size_ = 0;
bool has_call_ = false;
int next_label_id_ = 0;
std::vector<VRegClass> vreg_classes_;

@ -456,12 +456,31 @@ namespace mir
case Opcode::Prologue:
{
const auto &cs_regs = function.GetCalleeSavedRegs();
os << " stp x29, x30, [sp, #-16]!\n";
const bool is_leaf = !function.HasCall();
const bool no_frame = (function.GetFrameSize() == 0 && cs_regs.empty());
// 叶函数无帧且无 callee-saved 寄存器:完全跳过帧设置
if (is_leaf && no_frame)
{
return;
}
// 叶函数仅保存 x29LR 不会被修改),非叶函数保存 x29+x30
if (is_leaf)
{
os << " str x29, [sp, #-8]!\n";
}
else
{
os << " stp x29, x30, [sp, #-16]!\n";
}
os << " mov x29, sp\n";
if (function.GetFrameSize() > 0)
{
EmitStackAdjust("sub", function.GetFrameSize(), os);
}
// X(64-bit) 和 S(32-bit) 分两组配对 stp
std::vector<PhysReg> x_regs, s_regs;
for (auto r : cs_regs)
@ -505,6 +524,17 @@ namespace mir
case Opcode::Epilogue:
{
const auto &cs_regs = function.GetCalleeSavedRegs();
const bool is_leaf = !function.HasCall();
const bool no_frame = (function.GetFrameSize() == 0 && cs_regs.empty());
// 叶函数无帧且无 callee-saved 寄存器——直接返回
if (is_leaf && no_frame)
{
os << " ret\n";
return;
}
// 恢复 callee-saved 寄存器(叶函数也需要——它们属于调用者)
std::vector<PhysReg> x_regs, s_regs;
for (auto r : cs_regs)
{
@ -513,7 +543,7 @@ namespace mir
else if (r >= PhysReg::S0 && r <= PhysReg::S31)
s_regs.push_back(r);
else
x_regs.push_back(r); // 兜底:非 X 非 S 按 X 处理
x_regs.push_back(r);
}
int cs_offset = 0;
for (size_t i = 0; i + 1 < x_regs.size(); i += 2)
@ -541,11 +571,20 @@ namespace mir
os << " ldr " << PhysRegName(s_regs.back())
<< ", [sp, #" << cs_offset << "]\n";
}
if (function.GetFrameSize() > 0)
{
EmitStackAdjust("add", function.GetFrameSize(), os);
}
os << " ldp x29, x30, [sp], #16\n";
if (is_leaf)
{
os << " ldr x29, [sp], #8\n";
}
else
{
os << " ldp x29, x30, [sp], #16\n";
}
os << " ret\n";
return;
}

@ -1764,6 +1764,7 @@ namespace mir
}
block.Append(Opcode::Call, {Operand::Symbol(callee->GetName())});
function.SetHasCall();
if (aligned_stack_arg_bytes > 0)
{

@ -69,3 +69,18 @@
- **退化**:无
- **功能测试**:100/100 functional 通过;30/31 h_functional 通过(1 个预存故障:30_many_dimensions)
- **已知局限**:仅缓存 x13 上的 ADRP;LoadGlobalAddr 使用其他寄存器时不参与缓存;同一基本块内优化最有效
---
## 2026-05-25 | 叶函数帧设置优化
- **类型**:后端(AsmPrinter + Lowering)
- **假设**:叶函数(无 Call 指令)不需要保存/恢复 x30(LR 不会被修改)。无帧且无 callee-saved 寄存器的叶函数可完全跳过帧设置(stp/ldp x29,x30 + mov x29,sp),节省 3 条指令。有帧叶函数改用 str/ldr x29 替代 stp/ldp x29,x30,节省栈空间。
- **实现**:
- MIR.h:MachineFunction 新增 has_call_ 字段 + HasCall()/SetHasCall()
- Lowering.cpp:每次发射 Call 指令时标记 function.SetHasCall()
- AsmPrinter.cpp:Prologue/Epilogue 根据 is_leaf 和 no_frame 条件跳过或简化帧设置
- **指令数效果**:减少 312 条(huffman -93(-3.9%)、crypto -54(-2.8%)、conv2d -45(-2.3%)、crc -27(-3.2%)、h-9 -27(-4.1%)、03_sort -18(-0.9%)、opt_scheduling -18(-5.2%)、h-4 -12(-2.5%)、fft -9(-0.5%)、shuffle -9(-0.7%))
- **退化**:无
- **功能测试**:100/100 functional 通过;30/31 h_functional 通过(1 个预存故障:30_many_dimensions)
- **已知局限**:仅对无帧且无 callee-saved 寄存器的叶函数完全跳过帧设置;有 callee-saved 的叶函数仍需保存它们(属于调用者)

@ -23,42 +23,42 @@
| performance/01_mm1 | 309 |
| performance/01_mm2 | 309 |
| performance/01_mm3 | 309 |
| performance/03_sort1 | 639 |
| performance/03_sort2 | 639 |
| performance/03_sort3 | 639 |
| performance/conv2d-1 | 649 |
| performance/conv2d-2 | 649 |
| performance/conv2d-3 | 649 |
| performance/crc1 | 279 |
| performance/crc2 | 279 |
| performance/crc3 | 279 |
| performance/crypto-1 | 1910 |
| performance/crypto-2 | 1910 |
| performance/crypto-3 | 1910 |
| performance/fft0 | 591 |
| performance/fft1 | 591 |
| performance/fft2 | 591 |
| performance/03_sort1 | 633 |
| performance/03_sort2 | 633 |
| performance/03_sort3 | 633 |
| performance/conv2d-1 | 634 |
| performance/conv2d-2 | 634 |
| performance/conv2d-3 | 634 |
| performance/crc1 | 270 |
| performance/crc2 | 270 |
| performance/crc3 | 270 |
| performance/crypto-1 | 1892 |
| performance/crypto-2 | 1892 |
| performance/crypto-3 | 1892 |
| performance/fft0 | 588 |
| performance/fft1 | 588 |
| performance/fft2 | 588 |
| performance/h-1-01 | 157 |
| performance/h-1-02 | 157 |
| performance/h-1-03 | 157 |
| performance/h-10-01 | 327 |
| performance/h-10-02 | 327 |
| performance/h-10-03 | 327 |
| performance/h-4-01 | 162 |
| performance/h-4-02 | 162 |
| performance/h-4-03 | 162 |
| performance/h-4-01 | 158 |
| performance/h-4-02 | 158 |
| performance/h-4-03 | 158 |
| performance/h-5-01 | 338 |
| performance/h-5-02 | 338 |
| performance/h-5-03 | 338 |
| performance/h-8-01 | 410 |
| performance/h-8-02 | 410 |
| performance/h-8-03 | 410 |
| performance/h-9-01 | 221 |
| performance/h-9-02 | 221 |
| performance/h-9-03 | 221 |
| performance/huffman-01 | 787 |
| performance/huffman-02 | 787 |
| performance/huffman-03 | 787 |
| performance/h-9-01 | 212 |
| performance/h-9-02 | 212 |
| performance/h-9-03 | 212 |
| performance/huffman-01 | 756 |
| performance/huffman-02 | 756 |
| performance/huffman-03 | 756 |
| performance/knapsack_naive-1 | 167 |
| performance/knapsack_naive-2 | 167 |
| performance/knapsack_naive-3 | 167 |
@ -68,12 +68,12 @@
| performance/matmul1 | 379 |
| performance/matmul2 | 379 |
| performance/matmul3 | 379 |
| performance/optimization_scheduling1 | 116 |
| performance/optimization_scheduling2 | 116 |
| performance/optimization_scheduling3 | 116 |
| performance/shuffle0 | 455 |
| performance/shuffle1 | 455 |
| performance/shuffle2 | 455 |
| performance/optimization_scheduling1 | 110 |
| performance/optimization_scheduling2 | 110 |
| performance/optimization_scheduling3 | 110 |
| performance/shuffle0 | 452 |
| performance/shuffle1 | 452 |
| performance/shuffle2 | 452 |
| performance/sl1 | 247 |
| performance/sl2 | 247 |
| performance/sl3 | 247 |

Loading…
Cancel
Save