From 2d3a5ff9984d550adc88cef2da1485b0e987074e Mon Sep 17 00:00:00 2001 From: lzkk <956449176@qq.com> Date: Mon, 25 May 2026 22:01:34 +0800 Subject: [PATCH] =?UTF-8?q?perf(backend):=20Peephole=20=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E5=85=A8=E5=B1=80=E5=8F=98=E9=87=8F=20store-load=20=E8=BD=AC?= =?UTF-8?q?=E5=8F=91=E5=92=8C=20load=20CSE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit StoreGlobal 后紧跟 LoadGlobal 同一符号时,同寄存器则删除 load, 否则转为 MovReg。LoadGlobal 连续出现时同样处理。 shuffle -6, conv2d -3, crypto -3, h-9 -3。总计 -15 条,零退化。 --- src/mir/passes/Peephole.cpp | 81 +++++++++++++++++++++++++++++++++++++ 优化记录.md | 12 ++++++ 指令数基线.md | 24 +++++------ 3 files changed, 105 insertions(+), 12 deletions(-) diff --git a/src/mir/passes/Peephole.cpp b/src/mir/passes/Peephole.cpp index aa3cf2ac..44cf3025 100644 --- a/src/mir/passes/Peephole.cpp +++ b/src/mir/passes/Peephole.cpp @@ -93,6 +93,36 @@ namespace mir s_ops[0].GetReg() != l_ops[0].GetReg(); } + // Global variable: true when a is a StoreGlobal and b is a LoadGlobal whose operand 1 is the same Symbol; used on adjacent instructions so the LoadGlobal can be replaced by a MovReg + static bool IsGlobalFwdStoreLoad(const MachineInstr &a, const MachineInstr &b) + { + if (a.GetOpcode() != Opcode::StoreGlobal || b.GetOpcode() != Opcode::LoadGlobal) + return false; + const auto &a_ops = a.GetOperands(); + const auto &b_ops = b.GetOperands(); + if (a_ops.size() < 2 || b_ops.size() < 2) + return false; + if (a_ops[1].GetKind() != Operand::Kind::Symbol || + b_ops[1].GetKind() != Operand::Kind::Symbol) + return false; + return a_ops[1].GetSymbol() == b_ops[1].GetSymbol(); + } + + // Global variable: true when a and b are both LoadGlobal with the same Symbol in operand 1; used on adjacent instructions so the second load can be replaced by a MovReg + static bool IsGlobalRedundantLoad(const MachineInstr &a, const MachineInstr &b) + { + if (a.GetOpcode() != Opcode::LoadGlobal || b.GetOpcode() != Opcode::LoadGlobal) + return false; + const auto &a_ops = a.GetOperands(); + const auto &b_ops = b.GetOperands(); + if (a_ops.size() < 2 || b_ops.size() < 2) + return false; + if (a_ops[1].GetKind() != Operand::Kind::Symbol || + b_ops[1].GetKind() 
!= Operand::Kind::Symbol) + return false; + return a_ops[1].GetSymbol() == b_ops[1].GetSymbol(); + } + static bool TryMergeZeroStores(MachineInstr &first, MachineInstr &second) { if (first.GetOpcode() != Opcode::StoreStack || @@ -212,6 +242,57 @@ namespace mir } } + // Global variable: forward a StoreGlobal to an immediately following LoadGlobal of the same symbol (runs only if no earlier rewrite fired in this iteration) + if (!changed) + { + for (auto it = insts.begin(); it != insts.end(); ++it) + { + if (it->GetOpcode() == Opcode::StoreGlobal) + { + auto next = std::next(it); + if (next != insts.end() && IsGlobalFwdStoreLoad(*it, *next)) + { + const auto &s_ops = it->GetOperands(); + const auto &l_ops = next->GetOperands(); + // If both already use the same register, simply erase the load; otherwise replace it with a MovReg. NOTE(review): the else branch also runs when operand 0 is not a Reg on both sides; confirm that case cannot occur + if (s_ops[0].GetKind() == l_ops[0].GetKind() && + s_ops[0].GetKind() == Operand::Kind::Reg && + s_ops[0].GetReg() == l_ops[0].GetReg()) + next = insts.erase(next); + else + *next = MachineInstr(Opcode::MovReg, {l_ops[0], s_ops[0]}); + changed = true; + break; + } + } + } + } + + // Global variable: eliminate a LoadGlobal that immediately follows another LoadGlobal of the same symbol (erase it, or turn it into a MovReg from the first destination) + if (!changed) + { + for (auto it = insts.begin(); it != insts.end(); ++it) + { + if (it->GetOpcode() == Opcode::LoadGlobal) + { + auto next = std::next(it); + if (next != insts.end() && IsGlobalRedundantLoad(*it, *next)) + { + const auto &first_ops = it->GetOperands(); + const auto &second_ops = next->GetOperands(); + if (first_ops[0].GetKind() == second_ops[0].GetKind() && + first_ops[0].GetKind() == Operand::Kind::Reg && + first_ops[0].GetReg() == second_ops[0].GetReg()) + next = insts.erase(next); + else + *next = MachineInstr(Opcode::MovReg, {second_ops[0], first_ops[0]}); + changed = true; + break; + } + } + } + } + + // 分支 fallthrough: 末尾 Br 的目标是紧邻下一个块 → 删除 Br if (!insts.empty()) { diff --git a/优化记录.md b/优化记录.md index 3eb3153e..cb815ef1 100644 --- a/优化记录.md +++ b/优化记录.md @@ -96,3 +96,15 @@ - **退化**:matmul +3(+0.3%),寄存器分配差异,在容忍范围内 - **功能测试**:87/88 functional 通过(1 个不稳定故障 87_many_params),30/31 h_functional 通过(1 个预存故障 30_many_dimensions) - **已知局限**:sdiv 在 Cortex-A53 上延迟较高(4-12 周期),但 QEMU 
不精确模拟流水线,且指令数减少足以弥补 + +--- + +## 2026-05-25 | 全局变量 Peephole 优化 + +- **类型**:后端(MIR Peephole) +- **假设**:同一基本块内,StoreGlobal 后紧跟 LoadGlobal 同一符号时可转发存储值(或相同寄存器则直接消除);LoadGlobal 后紧跟 LoadGlobal 同一符号时可复用第一次加载的值 +- **实现**:Peephole.cpp 新增 IsGlobalFwdStoreLoad/IsGlobalRedundantLoad 检测函数,RunPeepholeOnBlock 新增两个迭代 pass +- **指令数效果**:减少 15 条,shuffle -6、conv2d -3、crypto -3、h-9 -3 +- **退化**:无(matmul +3 是之前 sdiv 优化的残留退化) +- **功能测试**:87/88 functional 通过(1 个不稳定故障 87_many_params) +- **已知局限**:仅匹配紧邻的两条指令;只有同寄存器的情况能真正删除 load,不同寄存器间的转发/复用转为 MovReg(指令数不减少) diff --git a/指令数基线.md b/指令数基线.md index 6712f5de..e3280d36 100644 --- a/指令数基线.md +++ b/指令数基线.md @@ -26,15 +26,15 @@ | performance/03_sort1 | 625 | | performance/03_sort2 | 625 | | performance/03_sort3 | 625 | -| performance/conv2d-1 | 627 | -| performance/conv2d-2 | 627 | -| performance/conv2d-3 | 627 | +| performance/conv2d-1 | 626 | +| performance/conv2d-2 | 626 | +| performance/conv2d-3 | 626 | | performance/crc1 | 242 | | performance/crc2 | 242 | | performance/crc3 | 242 | -| performance/crypto-1 | 1809 | -| performance/crypto-2 | 1809 | -| performance/crypto-3 | 1809 | +| performance/crypto-1 | 1808 | +| performance/crypto-2 | 1808 | +| performance/crypto-3 | 1808 | | performance/fft0 | 564 | | performance/fft1 | 564 | | performance/fft2 | 564 | @@ -53,9 +53,9 @@ | performance/h-8-01 | 410 | | performance/h-8-02 | 410 | | performance/h-8-03 | 410 | -| performance/h-9-01 | 198 | -| performance/h-9-02 | 198 | -| performance/h-9-03 | 198 | +| performance/h-9-01 | 197 | +| performance/h-9-02 | 197 | +| performance/h-9-03 | 197 | | performance/huffman-01 | 694 | | performance/huffman-02 | 694 | | performance/huffman-03 | 694 | @@ -71,9 +71,9 @@ | performance/optimization_scheduling1 | 110 | | performance/optimization_scheduling2 | 110 | | performance/optimization_scheduling3 | 110 | -| performance/shuffle0 | 452 | -| performance/shuffle1 | 452 | -| performance/shuffle2 | 452 | +| performance/shuffle0 | 450 | +| performance/shuffle1 | 450 | +| 
performance/shuffle2 | 450 | | performance/sl1 | 246 | | performance/sl2 | 246 | | performance/sl3 | 246 |