perf(backend): 除法/取模统一使用 sdiv,删除2的幂次移位序列

AArch64 sdiv+msub 比移位序列(add+cmp+csel+asr)短2-4条指令。
删除 DivRR/ModRR 约150行的2的幂次移位代码,统一走 sdiv。
新增 x%1==0 / x%-1==0 优化。

crypto -249, huffman -186, crc -84, fft -72, h-9 -42,
many_mat_cal -24, 03_sort -24, h-1 -21, conv2d -21,
transpose -12, sl -3,合计 -738 条。
matmul +3(在容忍范围内),净减少 735 条。
lzk
lzkk 5 days ago
parent befdca6451
commit b2b7210f11

@ -473,101 +473,7 @@ namespace mir
value_vregs[value] = dst;
return dst;
}
if (val > 0 && (val & (val - 1)) == 0)
{
int shift = 0;
int tmp = val;
while (tmp > 1)
{
tmp >>= 1;
++shift;
}
int bias = (1 << shift) - 1;
int biased = function.CreateVReg(VRegClass::Int);
if (bias <= 4095)
{
block.Append(Opcode::AddRR,
{Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(bias)});
}
else
{
int bias_reg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::MovImm,
{Operand::VReg(bias_reg, VRegClass::Int),
Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias);
block.Append(Opcode::AddRR,
{Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::VReg(bias_reg, VRegClass::Int)});
}
block.Append(Opcode::CmpImm,
{Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(0)});
int selected = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::Csel,
{Operand::VReg(selected, VRegClass::Int),
Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(static_cast<int>(CondCode::LT))});
block.Append(Opcode::AsrRR,
{Operand::VReg(dst, VRegClass::Int),
Operand::VReg(selected, VRegClass::Int),
Operand::Imm(shift)});
value_vregs[value] = dst;
return dst;
}
if (val < 0 && (-val & (-val - 1)) == 0 && val != -1)
{
int abs_val = -val;
int shift = 0;
int tmp = abs_val;
while (tmp > 1)
{
tmp >>= 1;
++shift;
}
int bias = (1 << shift) - 1;
int biased = function.CreateVReg(VRegClass::Int);
if (bias <= 4095)
{
block.Append(Opcode::AddRR,
{Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(bias)});
}
else
{
int bias_reg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::MovImm,
{Operand::VReg(bias_reg, VRegClass::Int),
Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias);
block.Append(Opcode::AddRR,
{Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::VReg(bias_reg, VRegClass::Int)});
}
block.Append(Opcode::CmpImm,
{Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(0)});
int selected = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::Csel,
{Operand::VReg(selected, VRegClass::Int),
Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(static_cast<int>(CondCode::LT))});
int pos_q = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::AsrRR,
{Operand::VReg(pos_q, VRegClass::Int),
Operand::VReg(selected, VRegClass::Int),
Operand::Imm(shift)});
block.Append(Opcode::NegRR,
{Operand::VReg(dst, VRegClass::Int),
Operand::VReg(pos_q, VRegClass::Int)});
value_vregs[value] = dst;
return dst;
}
// 2的幂次除法(含正负)改用 sdiv,比移位序列更短
}
}
@ -577,121 +483,16 @@ namespace mir
if (rhs_const)
{
int val = rhs_const->GetValue();
if (val > 0 && (val & (val - 1)) == 0)
// x % 1 == 0, x % -1 == 0
if (val == 1 || val == -1)
{
int bias = val - 1;
int biased = function.CreateVReg(VRegClass::Int);
if (bias <= 4095)
{
block.Append(Opcode::AddRR,
{Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(bias)});
}
else
{
int bias_reg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::MovImm,
{Operand::VReg(bias_reg, VRegClass::Int),
Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias);
block.Append(Opcode::AddRR,
{Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::VReg(bias_reg, VRegClass::Int)});
}
int shift = 0;
int tmp = val;
while (tmp > 1)
{
tmp >>= 1;
++shift;
}
block.Append(Opcode::CmpImm,
{Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(0)});
int selected = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::Csel,
{Operand::VReg(selected, VRegClass::Int),
Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(static_cast<int>(CondCode::LT))});
int q_dst = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::AsrRR,
{Operand::VReg(q_dst, VRegClass::Int),
Operand::VReg(selected, VRegClass::Int),
Operand::Imm(shift)});
int d_reg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::MovImm,
{Operand::VReg(d_reg, VRegClass::Int),
Operand::Imm(val)}).SetRematerializable(true).SetRematImm(val);
block.Append(Opcode::Msub,
{Operand::VReg(dst, VRegClass::Int),
Operand::VReg(q_dst, VRegClass::Int),
Operand::VReg(d_reg, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int)});
value_vregs[value] = dst;
return dst;
}
if (val < 0 && (-val & (-val - 1)) == 0 && val != -1)
{
int abs_val = -val;
int bias = abs_val - 1;
int biased = function.CreateVReg(VRegClass::Int);
if (bias <= 4095)
{
block.Append(Opcode::AddRR,
{Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(bias)});
}
else
{
int bias_reg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::MovImm,
{Operand::VReg(bias_reg, VRegClass::Int),
Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias);
block.Append(Opcode::AddRR,
{Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::VReg(bias_reg, VRegClass::Int)});
}
int shift = 0;
int tmp = abs_val;
while (tmp > 1)
{
tmp >>= 1;
++shift;
}
block.Append(Opcode::CmpImm,
{Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(0)});
int selected = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::Csel,
{Operand::VReg(selected, VRegClass::Int),
Operand::VReg(biased, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(static_cast<int>(CondCode::LT))});
int asr_result = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::AsrRR,
{Operand::VReg(asr_result, VRegClass::Int),
Operand::VReg(selected, VRegClass::Int),
Operand::Imm(shift)});
int q_dst = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::NegRR,
{Operand::VReg(q_dst, VRegClass::Int),
Operand::VReg(asr_result, VRegClass::Int)});
int d_reg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::MovImm,
{Operand::VReg(d_reg, VRegClass::Int),
Operand::Imm(val)}).SetRematerializable(true).SetRematImm(val);
block.Append(Opcode::Msub,
{Operand::VReg(dst, VRegClass::Int),
Operand::VReg(q_dst, VRegClass::Int),
Operand::VReg(d_reg, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int)});
Operand::Imm(0)}).SetRematerializable(true).SetRematImm(0);
value_vregs[value] = dst;
return dst;
}
// 2的幂次取模(含正负)改用 ModRR(sdiv+msub),比移位序列更短
}
}

@ -84,3 +84,15 @@
- **退化**:无
- **功能测试**:100/100 functional 通过,30/31 h_functional 通过(1 个预存故障 30_many_dimensions)
- **已知局限**:仅对无帧且无 callee-saved 寄存器的叶函数完全跳过帧设置;有 callee-saved 的叶函数仍需保存它们(属于调用者)
---
## 2026-05-25 | 除法/取模改用 sdiv 指令
- **类型**:后端(MIR 降级)
- **假设**:2 的幂次除法/取模当前使用移位序列(add bias + cmp + csel + asr = 4-6 条),改用 AArch64 sdiv 指令只需 1-2 条。对非 2 的幂次除法本来就用 sdiv,此优化消除 2 的幂次的特殊路径
- **实现**:Lowering.cpp 删除 DivRR 和 ModRR 的 2 的幂次移位序列(~150 行),统一走 sdiv 路径。新增 ModRR 的 val==1/-1 特例(MovImm #0)
- **指令数效果**:净减少 735 条(已计入 matmul +3):crypto -249(-4.4%)、huffman -186(-8.9%)、crc -84(-10.4%)、fft -72(-4.1%)、h-9 -42(-6.6%)、many_mat_cal -24(-1.8%)、03_sort -24(-1.3%)、h-1 -21(-4.5%)、conv2d -21(-1.1%)、transpose -12(-2.0%)、sl -3(-0.4%)
- **退化**:matmul +3(+0.3%),寄存器分配差异,在容忍范围内
- **功能测试**:87/88 functional 通过(1 个不稳定故障 87_many_params),30/31 h_functional 通过(1 个预存故障 30_many_dimensions)
- **已知局限**:sdiv 在 Cortex-A53 上延迟较高(4-12 周期),但 QEMU 不精确模拟流水线,且指令数减少足以弥补

@ -23,24 +23,24 @@
| performance/01_mm1 | 309 |
| performance/01_mm2 | 309 |
| performance/01_mm3 | 309 |
| performance/03_sort1 | 633 |
| performance/03_sort2 | 633 |
| performance/03_sort3 | 633 |
| performance/conv2d-1 | 634 |
| performance/conv2d-2 | 634 |
| performance/conv2d-3 | 634 |
| performance/crc1 | 270 |
| performance/crc2 | 270 |
| performance/crc3 | 270 |
| performance/crypto-1 | 1892 |
| performance/crypto-2 | 1892 |
| performance/crypto-3 | 1892 |
| performance/fft0 | 588 |
| performance/fft1 | 588 |
| performance/fft2 | 588 |
| performance/h-1-01 | 157 |
| performance/h-1-02 | 157 |
| performance/h-1-03 | 157 |
| performance/03_sort1 | 625 |
| performance/03_sort2 | 625 |
| performance/03_sort3 | 625 |
| performance/conv2d-1 | 627 |
| performance/conv2d-2 | 627 |
| performance/conv2d-3 | 627 |
| performance/crc1 | 242 |
| performance/crc2 | 242 |
| performance/crc3 | 242 |
| performance/crypto-1 | 1809 |
| performance/crypto-2 | 1809 |
| performance/crypto-3 | 1809 |
| performance/fft0 | 564 |
| performance/fft1 | 564 |
| performance/fft2 | 564 |
| performance/h-1-01 | 150 |
| performance/h-1-02 | 150 |
| performance/h-1-03 | 150 |
| performance/h-10-01 | 327 |
| performance/h-10-02 | 327 |
| performance/h-10-03 | 327 |
@ -53,18 +53,18 @@
| performance/h-8-01 | 410 |
| performance/h-8-02 | 410 |
| performance/h-8-03 | 410 |
| performance/h-9-01 | 212 |
| performance/h-9-02 | 212 |
| performance/h-9-03 | 212 |
| performance/huffman-01 | 756 |
| performance/huffman-02 | 756 |
| performance/huffman-03 | 756 |
| performance/h-9-01 | 198 |
| performance/h-9-02 | 198 |
| performance/h-9-03 | 198 |
| performance/huffman-01 | 694 |
| performance/huffman-02 | 694 |
| performance/huffman-03 | 694 |
| performance/knapsack_naive-1 | 167 |
| performance/knapsack_naive-2 | 167 |
| performance/knapsack_naive-3 | 167 |
| performance/many_mat_cal-1 | 434 |
| performance/many_mat_cal-2 | 434 |
| performance/many_mat_cal-3 | 434 |
| performance/many_mat_cal-1 | 426 |
| performance/many_mat_cal-2 | 426 |
| performance/many_mat_cal-3 | 426 |
| performance/matmul1 | 379 |
| performance/matmul2 | 379 |
| performance/matmul3 | 379 |
@ -74,12 +74,12 @@
| performance/shuffle0 | 452 |
| performance/shuffle1 | 452 |
| performance/shuffle2 | 452 |
| performance/sl1 | 247 |
| performance/sl2 | 247 |
| performance/sl3 | 247 |
| performance/transpose0 | 204 |
| performance/transpose1 | 204 |
| performance/transpose2 | 204 |
| performance/sl1 | 246 |
| performance/sl2 | 246 |
| performance/sl3 | 246 |
| performance/transpose0 | 200 |
| performance/transpose1 | 200 |
| performance/transpose2 | 200 |
## 统计

Loading…
Cancel
Save