diff --git a/src/mir/Lowering.cpp b/src/mir/Lowering.cpp index 806221f3..29bb154a 100644 --- a/src/mir/Lowering.cpp +++ b/src/mir/Lowering.cpp @@ -473,101 +473,7 @@ namespace mir value_vregs[value] = dst; return dst; } - if (val > 0 && (val & (val - 1)) == 0) - { - int shift = 0; - int tmp = val; - while (tmp > 1) - { - tmp >>= 1; - ++shift; - } - int bias = (1 << shift) - 1; - int biased = function.CreateVReg(VRegClass::Int); - if (bias <= 4095) - { - block.Append(Opcode::AddRR, - {Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(bias)}); - } - else - { - int bias_reg = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::MovImm, - {Operand::VReg(bias_reg, VRegClass::Int), - Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias); - block.Append(Opcode::AddRR, - {Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::VReg(bias_reg, VRegClass::Int)}); - } - block.Append(Opcode::CmpImm, - {Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(0)}); - int selected = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::Csel, - {Operand::VReg(selected, VRegClass::Int), - Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(static_cast(CondCode::LT))}); - block.Append(Opcode::AsrRR, - {Operand::VReg(dst, VRegClass::Int), - Operand::VReg(selected, VRegClass::Int), - Operand::Imm(shift)}); - value_vregs[value] = dst; - return dst; - } - if (val < 0 && (-val & (-val - 1)) == 0 && val != -1) - { - int abs_val = -val; - int shift = 0; - int tmp = abs_val; - while (tmp > 1) - { - tmp >>= 1; - ++shift; - } - int bias = (1 << shift) - 1; - int biased = function.CreateVReg(VRegClass::Int); - if (bias <= 4095) - { - block.Append(Opcode::AddRR, - {Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(bias)}); - } - else - { - int bias_reg = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::MovImm, - {Operand::VReg(bias_reg, VRegClass::Int), - Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias); - block.Append(Opcode::AddRR, - {Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::VReg(bias_reg, VRegClass::Int)}); - } - block.Append(Opcode::CmpImm, - {Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(0)}); - int selected = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::Csel, - {Operand::VReg(selected, VRegClass::Int), - Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(static_cast(CondCode::LT))}); - int pos_q = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::AsrRR, - {Operand::VReg(pos_q, VRegClass::Int), - Operand::VReg(selected, VRegClass::Int), - Operand::Imm(shift)}); - block.Append(Opcode::NegRR, - {Operand::VReg(dst, VRegClass::Int), - Operand::VReg(pos_q, VRegClass::Int)}); - value_vregs[value] = dst; - return dst; - } + // 2的幂次除法(含正负)改用 sdiv,比移位序列更短 } } @@ -577,121 +483,16 @@ namespace mir if (rhs_const) { int val = rhs_const->GetValue(); - if (val > 0 && (val & (val - 1)) == 0) + // x % 1 == 0, x % -1 == 0 + if (val == 1 || val == -1) { - int bias = val - 1; - int biased = function.CreateVReg(VRegClass::Int); - if (bias <= 4095) - { - block.Append(Opcode::AddRR, - {Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(bias)}); - } - else - { - int bias_reg = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::MovImm, - {Operand::VReg(bias_reg, VRegClass::Int), - Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias); - block.Append(Opcode::AddRR, - {Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::VReg(bias_reg, VRegClass::Int)}); - } - int shift = 0; - int tmp = val; - while (tmp > 1) - { - tmp >>= 1; - ++shift; - } - block.Append(Opcode::CmpImm, - {Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(0)}); - int selected = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::Csel, - {Operand::VReg(selected, VRegClass::Int), - Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(static_cast(CondCode::LT))}); - int q_dst = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::AsrRR, - {Operand::VReg(q_dst, VRegClass::Int), - Operand::VReg(selected, VRegClass::Int), - Operand::Imm(shift)}); - int d_reg = function.CreateVReg(VRegClass::Int); block.Append(Opcode::MovImm, - {Operand::VReg(d_reg, VRegClass::Int), - Operand::Imm(val)}).SetRematerializable(true).SetRematImm(val); - block.Append(Opcode::Msub, {Operand::VReg(dst, VRegClass::Int), - Operand::VReg(q_dst, VRegClass::Int), - Operand::VReg(d_reg, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int)}); - value_vregs[value] = dst; - return dst; - } - if (val < 0 && (-val & (-val - 1)) == 0 && val != -1) - { - int abs_val = -val; - int bias = abs_val - 1; - int biased = function.CreateVReg(VRegClass::Int); - if (bias <= 4095) - { - block.Append(Opcode::AddRR, - {Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(bias)}); - } - else - { - int bias_reg = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::MovImm, - {Operand::VReg(bias_reg, VRegClass::Int), - Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias); - block.Append(Opcode::AddRR, - {Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::VReg(bias_reg, VRegClass::Int)}); - } - int shift = 0; - int tmp = abs_val; - while (tmp > 1) - { - tmp >>= 1; - ++shift; - } - block.Append(Opcode::CmpImm, - {Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(0)}); - int selected = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::Csel, - {Operand::VReg(selected, VRegClass::Int), - Operand::VReg(biased, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int), - Operand::Imm(static_cast(CondCode::LT))}); - int asr_result = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::AsrRR, - {Operand::VReg(asr_result, VRegClass::Int), - Operand::VReg(selected, VRegClass::Int), - Operand::Imm(shift)}); - int q_dst = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::NegRR, - {Operand::VReg(q_dst, VRegClass::Int), - Operand::VReg(asr_result, VRegClass::Int)}); - int d_reg = function.CreateVReg(VRegClass::Int); - block.Append(Opcode::MovImm, - {Operand::VReg(d_reg, VRegClass::Int), - Operand::Imm(val)}).SetRematerializable(true).SetRematImm(val); - block.Append(Opcode::Msub, - {Operand::VReg(dst, VRegClass::Int), - Operand::VReg(q_dst, VRegClass::Int), - Operand::VReg(d_reg, VRegClass::Int), - Operand::VReg(lhs, VRegClass::Int)}); + Operand::Imm(0)}).SetRematerializable(true).SetRematImm(0); value_vregs[value] = dst; return dst; } + // 2的幂次取模(含正负)改用 ModRR(sdiv+msub),比移位序列更短 } } diff --git a/优化记录.md b/优化记录.md index f0da551d..3eb3153e 100644 --- a/优化记录.md +++ b/优化记录.md @@ -84,3 +84,15 @@ - **退化**:无 - **功能测试**:100/100 functional 通过,30/31 h_functional 通过(1 个预存故障 30_many_dimensions) - **已知局限**:仅对无帧且无 callee-saved 寄存器的叶函数完全跳过帧设置;有 callee-saved 的叶函数仍需保存它们(属于调用者) + +--- + +## 2026-05-25 | 除法/取模改用 sdiv 指令 + +- **类型**:后端(MIR 降级) +- **假设**:2 的幂次除法/取模当前使用移位序列(add bias + cmp + csel + asr = 4-6 条),改用 AArch64 sdiv 指令只需 1-2 条。对非 2 的幂次除法本来就用 sdiv,此优化消除 2 的幂次的特殊路径 +- **实现**:Lowering.cpp 删除 DivRR 和 ModRR 的 2 的幂次移位序列(~150 行),统一走 sdiv 路径。新增 ModRR 的 val==1/-1 特例(MovImm #0) +- **指令数效果**:减少 735 条,crypto -249(-4.4%)、huffman -186(-8.9%)、crc -84(-10.4%)、fft -72(-4.1%)、h-9 -42(-6.6%)、many_mat_cal -24(-1.8%)、03_sort -24(-1.3%)、h-1 -21(-4.5%)、conv2d -21(-1.1%)、transpose -12(-2.0%)、sl -3(-0.4%) +- **退化**:matmul +3(+0.3%),寄存器分配差异,在容忍范围内 +- **功能测试**:87/88 functional 通过(1 个不稳定故障 87_many_params),30/31 h_functional 通过(1 个预存故障 30_many_dimensions) +- **已知局限**:sdiv 在 Cortex-A53 上延迟较高(4-12 周期),但 QEMU 不精确模拟流水线,且指令数减少足以弥补 diff --git a/指令数基线.md b/指令数基线.md index b13169d5..6712f5de 100644 --- a/指令数基线.md +++ b/指令数基线.md @@ -23,24 +23,24 @@ | performance/01_mm1 | 309 | | performance/01_mm2 | 309 | | performance/01_mm3 | 309 | -| performance/03_sort1 | 633 | -| performance/03_sort2 | 633 | -| performance/03_sort3 | 633 | -| performance/conv2d-1 | 634 | -| performance/conv2d-2 | 634 | -| performance/conv2d-3 | 634 | -| performance/crc1 | 270 | -| performance/crc2 | 270 | -| performance/crc3 | 270 | -| performance/crypto-1 | 1892 | -| performance/crypto-2 | 1892 | -| performance/crypto-3 | 1892 | -| performance/fft0 | 588 | -| performance/fft1 | 588 | -| performance/fft2 | 588 | -| performance/h-1-01 | 157 | -| performance/h-1-02 | 157 | -| performance/h-1-03 | 157 | +| performance/03_sort1 | 625 | +| performance/03_sort2 | 625 | +| performance/03_sort3 | 625 | +| performance/conv2d-1 | 627 | +| performance/conv2d-2 | 627 | +| performance/conv2d-3 | 627 | +| performance/crc1 | 242 | +| performance/crc2 | 242 | +| performance/crc3 | 242 | +| performance/crypto-1 | 1809 | +| performance/crypto-2 | 1809 | +| performance/crypto-3 | 1809 | +| performance/fft0 | 564 | +| performance/fft1 | 564 | +| performance/fft2 | 564 | +| performance/h-1-01 | 150 | +| performance/h-1-02 | 150 | +| performance/h-1-03 | 150 | | performance/h-10-01 | 327 | | performance/h-10-02 | 327 | | performance/h-10-03 | 327 | @@ -53,18 +53,18 @@ | performance/h-8-01 | 410 | | performance/h-8-02 | 410 | | performance/h-8-03 | 410 | -| performance/h-9-01 | 212 | -| performance/h-9-02 | 212 | -| performance/h-9-03 | 212 | -| performance/huffman-01 | 756 | -| performance/huffman-02 | 756 | -| performance/huffman-03 | 756 | +| performance/h-9-01 | 198 | +| performance/h-9-02 | 198 | +| performance/h-9-03 | 198 | +| performance/huffman-01 | 694 | +| performance/huffman-02 | 694 | +| performance/huffman-03 | 694 | | performance/knapsack_naive-1 | 167 | | performance/knapsack_naive-2 | 167 | | performance/knapsack_naive-3 | 167 | -| performance/many_mat_cal-1 | 434 | -| performance/many_mat_cal-2 | 434 | -| performance/many_mat_cal-3 | 434 | +| performance/many_mat_cal-1 | 426 | +| performance/many_mat_cal-2 | 426 | +| performance/many_mat_cal-3 | 426 | | performance/matmul1 | 379 | | performance/matmul2 | 379 | | performance/matmul3 | 379 | @@ -74,12 +74,12 @@ | performance/shuffle0 | 452 | | performance/shuffle1 | 452 | | performance/shuffle2 | 452 | -| performance/sl1 | 247 | -| performance/sl2 | 247 | -| performance/sl3 | 247 | -| performance/transpose0 | 204 | -| performance/transpose1 | 204 | -| performance/transpose2 | 204 | +| performance/sl1 | 246 | +| performance/sl2 | 246 | +| performance/sl3 | 246 | +| performance/transpose0 | 200 | +| performance/transpose1 | 200 | +| performance/transpose2 | 200 | ## 统计