perf(backend): Peephole 新增全局变量 store-load 转发和 load CSE

StoreGlobal 后紧跟 LoadGlobal 同一符号时,同寄存器则删除 load,
否则转为 MovReg。LoadGlobal 连续出现时同样处理。

shuffle -6, conv2d -3, crypto -3, h-9 -3。总计 -15 条,零退化。
lzk
lzkk 5 days ago
parent b2b7210f11
commit 2d3a5ff998

@ -93,6 +93,36 @@ namespace mir
s_ops[0].GetReg() != l_ops[0].GetReg();
}
// Global-variable store-to-load forwarding predicate.
// Returns true when `a` is a StoreGlobal, `b` is a LoadGlobal, both carry at
// least two operands, and operand 1 of each is a Symbol naming the same global.
// The caller uses a positive answer to replace the LoadGlobal with a MovReg.
// Operand 0 is not inspected here.
static bool IsGlobalFwdStoreLoad(const MachineInstr &a, const MachineInstr &b)
{
    const bool is_store_then_load = a.GetOpcode() == Opcode::StoreGlobal &&
                                    b.GetOpcode() == Opcode::LoadGlobal;
    if (!is_store_then_load)
        return false;

    const auto &store_ops = a.GetOperands();
    const auto &load_ops = b.GetOperands();
    // Operand 1 is read below, so both instructions need at least two operands.
    if (store_ops.size() < 2 || load_ops.size() < 2)
        return false;

    const auto &store_sym = store_ops[1];
    const auto &load_sym = load_ops[1];
    const bool both_are_symbols = store_sym.GetKind() == Operand::Kind::Symbol &&
                                  load_sym.GetKind() == Operand::Kind::Symbol;
    // GetSymbol() is only evaluated once both operands are known to be symbols.
    return both_are_symbols && store_sym.GetSymbol() == load_sym.GetSymbol();
}
// Global-variable redundant-load predicate.
// Returns true when `a` and `b` are both LoadGlobal, both carry at least two
// operands, and operand 1 of each is a Symbol naming the same global.
// The caller uses a positive answer to replace the second load with a MovReg.
// Operand 0 is not inspected here.
static bool IsGlobalRedundantLoad(const MachineInstr &a, const MachineInstr &b)
{
    const bool both_are_loads = a.GetOpcode() == Opcode::LoadGlobal &&
                                b.GetOpcode() == Opcode::LoadGlobal;
    if (!both_are_loads)
        return false;

    const auto &first_ops = a.GetOperands();
    const auto &second_ops = b.GetOperands();
    // Operand 1 is read below, so both instructions need at least two operands.
    if (first_ops.size() < 2 || second_ops.size() < 2)
        return false;

    const auto &first_sym = first_ops[1];
    const auto &second_sym = second_ops[1];
    const bool both_are_symbols = first_sym.GetKind() == Operand::Kind::Symbol &&
                                  second_sym.GetKind() == Operand::Kind::Symbol;
    // GetSymbol() is only evaluated once both operands are known to be symbols.
    return both_are_symbols && first_sym.GetSymbol() == second_sym.GetSymbol();
}
static bool TryMergeZeroStores(MachineInstr &first, MachineInstr &second)
{
if (first.GetOpcode() != Opcode::StoreStack ||
@ -212,6 +242,57 @@ namespace mir
}
}
// Global store-to-load forwarding: a StoreGlobal immediately followed by a
// LoadGlobal of the same symbol makes the load redundant, because the value
// just stored is still held in the store's operand 0.
//   - same register on both sides      -> the load is erased
//   - different registers              -> the load becomes MovReg {load dst, store src}
// At most one rewrite is performed per pass; `changed` is set so the
// surrounding logic can observe that the block was modified.
if (!changed)
{
    for (auto it = insts.begin(); it != insts.end(); ++it)
    {
        if (it->GetOpcode() != Opcode::StoreGlobal)
            continue;

        auto next = std::next(it);
        if (next == insts.end() || !IsGlobalFwdStoreLoad(*it, *next))
            continue;

        // Copy operand 0 of each instruction up front: `next` is erased or
        // overwritten below, which would invalidate references into it.
        const auto stored_value = it->GetOperands()[0];
        const auto load_dest = next->GetOperands()[0];

        // A MovReg may only be synthesised between two registers. Previously a
        // non-register operand 0 fell through to the MovReg branch; now such a
        // pair is left untouched.
        if (stored_value.GetKind() != Operand::Kind::Reg ||
            load_dest.GetKind() != Operand::Kind::Reg)
            continue;

        if (stored_value.GetReg() == load_dest.GetReg())
            insts.erase(next); // value is already in the destination register
        else
            *next = MachineInstr(Opcode::MovReg, {load_dest, stored_value});

        changed = true;
        break;
    }
}
// Global redundant-load elimination: two back-to-back LoadGlobal instructions
// of the same symbol read the same value, so the second one can reuse the
// result of the first.
//   - same destination register        -> the second load is erased
//   - different destination registers  -> the second load becomes
//                                         MovReg {second dst, first dst}
// At most one rewrite is performed per pass; `changed` is set so the
// surrounding logic can observe that the block was modified.
if (!changed)
{
    for (auto it = insts.begin(); it != insts.end(); ++it)
    {
        if (it->GetOpcode() != Opcode::LoadGlobal)
            continue;

        auto next = std::next(it);
        if (next == insts.end() || !IsGlobalRedundantLoad(*it, *next))
            continue;

        // Copy operand 0 of each instruction up front: `next` is erased or
        // overwritten below, which would invalidate references into it.
        const auto first_dest = it->GetOperands()[0];
        const auto second_dest = next->GetOperands()[0];

        // A MovReg may only be synthesised between two registers. Previously a
        // non-register operand 0 fell through to the MovReg branch; now such a
        // pair is left untouched.
        if (first_dest.GetKind() != Operand::Kind::Reg ||
            second_dest.GetKind() != Operand::Kind::Reg)
            continue;

        if (first_dest.GetReg() == second_dest.GetReg())
            insts.erase(next); // identical reload into the same register
        else
            *next = MachineInstr(Opcode::MovReg, {second_dest, first_dest});

        changed = true;
        break;
    }
}
// 分支 fallthrough: 末尾 Br 的目标是紧邻下一个块 → 删除 Br
if (!insts.empty())
{

@ -96,3 +96,15 @@
- **退化**:matmul +3(+0.3%),寄存器分配差异,在容忍范围内
- **功能测试**:87/88 functional 通过,1 个不稳定故障 87_many_params;30/31 h_functional 通过,1 个预存故障 30_many_dimensions
- **已知局限**:sdiv 在 Cortex-A53 上延迟较高(4-12 周期),但 QEMU 不精确模拟流水线,且指令数减少足以弥补
---
## 2026-05-25 | 全局变量 Peephole 优化
- **类型**:后端,MIR Peephole
- **假设**:同一基本块内,StoreGlobal 后紧跟 LoadGlobal 同一符号时,可转发存储值(若为相同寄存器则直接消除该 load);LoadGlobal 后紧跟 LoadGlobal 同一符号时,可复用第一次加载的值
- **实现**:Peephole.cpp 新增 IsGlobalFwdStoreLoad/IsGlobalRedundantLoad 检测函数;RunPeepholeOnBlock 新增两个迭代 pass
- **指令数效果**:减少 15 条(shuffle -6、conv2d -3、crypto -3、h-9 -3)
- **退化**:无(matmul +3 是之前 sdiv 优化的残留退化)
- **功能测试**:87/88 functional 通过,1 个不稳定故障 87_many_params
- **已知局限**:仅同寄存器复用的特例能真正删除指令;不同寄存器间的转发/复用转为 MovReg,指令数不减少

@ -26,15 +26,15 @@
| performance/03_sort1 | 625 |
| performance/03_sort2 | 625 |
| performance/03_sort3 | 625 |
| performance/conv2d-1 | 627 |
| performance/conv2d-2 | 627 |
| performance/conv2d-3 | 627 |
| performance/conv2d-1 | 626 |
| performance/conv2d-2 | 626 |
| performance/conv2d-3 | 626 |
| performance/crc1 | 242 |
| performance/crc2 | 242 |
| performance/crc3 | 242 |
| performance/crypto-1 | 1809 |
| performance/crypto-2 | 1809 |
| performance/crypto-3 | 1809 |
| performance/crypto-1 | 1808 |
| performance/crypto-2 | 1808 |
| performance/crypto-3 | 1808 |
| performance/fft0 | 564 |
| performance/fft1 | 564 |
| performance/fft2 | 564 |
@ -53,9 +53,9 @@
| performance/h-8-01 | 410 |
| performance/h-8-02 | 410 |
| performance/h-8-03 | 410 |
| performance/h-9-01 | 198 |
| performance/h-9-02 | 198 |
| performance/h-9-03 | 198 |
| performance/h-9-01 | 197 |
| performance/h-9-02 | 197 |
| performance/h-9-03 | 197 |
| performance/huffman-01 | 694 |
| performance/huffman-02 | 694 |
| performance/huffman-03 | 694 |
@ -71,9 +71,9 @@
| performance/optimization_scheduling1 | 110 |
| performance/optimization_scheduling2 | 110 |
| performance/optimization_scheduling3 | 110 |
| performance/shuffle0 | 452 |
| performance/shuffle1 | 452 |
| performance/shuffle2 | 452 |
| performance/shuffle0 | 450 |
| performance/shuffle1 | 450 |
| performance/shuffle2 | 450 |
| performance/sl1 | 246 |
| performance/sl2 | 246 |
| performance/sl3 | 246 |

Loading…
Cancel
Save