perf(backend): 除法/取模统一使用 sdiv，删除2的幂次移位序列

AArch64 sdiv+msub 比移位序列(add+cmp+csel+asr)短2-4条指令。删除 DivRR/ModRR 约150行的2的幂次移位代码，统一走 sdiv。新增 x%1==0 / x%-1==0 优化。 crypto -249, huffman -186, crc -84, fft -72, h-9 -42, many_mat_cal -24, 03_sort -24, h-1 -21, conv2d -21, transpose -12, sl -3。总计 -735 条。 matmul +3 在容忍范围内。
5 days ago · b2b7210f11
parent befdca6451
commit b2b7210f11
3 changed files with 50 additions and 237 deletions
--- a/src/mir/Lowering.cpp
+++ b/src/mir/Lowering.cpp
@ -473,101 +473,7 @@ namespace mir
              value_vregs[value] = dst;
              return dst;
            }
-            if (val > 0 && (val & (val - 1)) == 0)
-            {
-              int shift = 0;
-              int tmp = val;
-              while (tmp > 1)
-              {
-                tmp >>= 1;
-                ++shift;
-              }
-              int bias = (1 << shift) - 1;
-              int biased = function.CreateVReg(VRegClass::Int);
-              if (bias <= 4095)
-              {
-                block.Append(Opcode::AddRR,
-                             {Operand::VReg(biased, VRegClass::Int),
-                              Operand::VReg(lhs, VRegClass::Int),
-                              Operand::Imm(bias)});
-              }
-              else
-              {
-                int bias_reg = function.CreateVReg(VRegClass::Int);
-                block.Append(Opcode::MovImm,
-                             {Operand::VReg(bias_reg, VRegClass::Int),
-                              Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias);
-                block.Append(Opcode::AddRR,
-                             {Operand::VReg(biased, VRegClass::Int),
-                              Operand::VReg(lhs, VRegClass::Int),
-                              Operand::VReg(bias_reg, VRegClass::Int)});
-              }
-              block.Append(Opcode::CmpImm,
-                           {Operand::VReg(lhs, VRegClass::Int),
-                            Operand::Imm(0)});
-              int selected = function.CreateVReg(VRegClass::Int);
-              block.Append(Opcode::Csel,
-                           {Operand::VReg(selected, VRegClass::Int),
-                            Operand::VReg(biased, VRegClass::Int),
-                            Operand::VReg(lhs, VRegClass::Int),
-                            Operand::Imm(static_cast<int>(CondCode::LT))});
-              block.Append(Opcode::AsrRR,
-                           {Operand::VReg(dst, VRegClass::Int),
-                            Operand::VReg(selected, VRegClass::Int),
-                            Operand::Imm(shift)});
-              value_vregs[value] = dst;
-              return dst;
-            }
-            if (val < 0 && (-val & (-val - 1)) == 0 && val != -1)
-            {
-              int abs_val = -val;
-              int shift = 0;
-              int tmp = abs_val;
-              while (tmp > 1)
-              {
-                tmp >>= 1;
-                ++shift;
-              }
-              int bias = (1 << shift) - 1;
-              int biased = function.CreateVReg(VRegClass::Int);
-              if (bias <= 4095)
-              {
-                block.Append(Opcode::AddRR,
-                             {Operand::VReg(biased, VRegClass::Int),
-                              Operand::VReg(lhs, VRegClass::Int),
-                              Operand::Imm(bias)});
-              }
-              else
-              {
-                int bias_reg = function.CreateVReg(VRegClass::Int);
-                block.Append(Opcode::MovImm,
-                             {Operand::VReg(bias_reg, VRegClass::Int),
-                              Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias);
-                block.Append(Opcode::AddRR,
-                             {Operand::VReg(biased, VRegClass::Int),
-                              Operand::VReg(lhs, VRegClass::Int),
-                              Operand::VReg(bias_reg, VRegClass::Int)});
-              }
-              block.Append(Opcode::CmpImm,
-                           {Operand::VReg(lhs, VRegClass::Int),
-                            Operand::Imm(0)});
-              int selected = function.CreateVReg(VRegClass::Int);
-              block.Append(Opcode::Csel,
-                           {Operand::VReg(selected, VRegClass::Int),
-                            Operand::VReg(biased, VRegClass::Int),
-                            Operand::VReg(lhs, VRegClass::Int),
-                            Operand::Imm(static_cast<int>(CondCode::LT))});
-              int pos_q = function.CreateVReg(VRegClass::Int);
-              block.Append(Opcode::AsrRR,
-                           {Operand::VReg(pos_q, VRegClass::Int),
-                            Operand::VReg(selected, VRegClass::Int),
-                            Operand::Imm(shift)});
-              block.Append(Opcode::NegRR,
-                           {Operand::VReg(dst, VRegClass::Int),
-                            Operand::VReg(pos_q, VRegClass::Int)});
-              value_vregs[value] = dst;
-              return dst;
-            }
+            // 2的幂次除法（含正负）改用 sdiv，比移位序列更短
          }
        }

@ -577,121 +483,16 @@ namespace mir
          if (rhs_const)
          {
            int val = rhs_const->GetValue();
-            if (val > 0 && (val & (val - 1)) == 0)
+            // x % 1 == 0, x % -1 == 0
+            if (val == 1 || val == -1)
            {
-              int bias = val - 1;
-              int biased = function.CreateVReg(VRegClass::Int);
-              if (bias <= 4095)
-              {
-                block.Append(Opcode::AddRR,
-                             {Operand::VReg(biased, VRegClass::Int),
-                              Operand::VReg(lhs, VRegClass::Int),
-                              Operand::Imm(bias)});
-              }
-              else
-              {
-                int bias_reg = function.CreateVReg(VRegClass::Int);
-                block.Append(Opcode::MovImm,
-                             {Operand::VReg(bias_reg, VRegClass::Int),
-                              Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias);
-                block.Append(Opcode::AddRR,
-                             {Operand::VReg(biased, VRegClass::Int),
-                              Operand::VReg(lhs, VRegClass::Int),
-                              Operand::VReg(bias_reg, VRegClass::Int)});
-              }
-              int shift = 0;
-              int tmp = val;
-              while (tmp > 1)
-              {
-                tmp >>= 1;
-                ++shift;
-              }
-              block.Append(Opcode::CmpImm,
-                           {Operand::VReg(lhs, VRegClass::Int),
-                            Operand::Imm(0)});
-              int selected = function.CreateVReg(VRegClass::Int);
-              block.Append(Opcode::Csel,
-                           {Operand::VReg(selected, VRegClass::Int),
-                            Operand::VReg(biased, VRegClass::Int),
-                            Operand::VReg(lhs, VRegClass::Int),
-                            Operand::Imm(static_cast<int>(CondCode::LT))});
-              int q_dst = function.CreateVReg(VRegClass::Int);
-              block.Append(Opcode::AsrRR,
-                           {Operand::VReg(q_dst, VRegClass::Int),
-                            Operand::VReg(selected, VRegClass::Int),
-                            Operand::Imm(shift)});
-              int d_reg = function.CreateVReg(VRegClass::Int);
              block.Append(Opcode::MovImm,
-                           {Operand::VReg(d_reg, VRegClass::Int),
-                            Operand::Imm(val)}).SetRematerializable(true).SetRematImm(val);
-              block.Append(Opcode::Msub,
                           {Operand::VReg(dst, VRegClass::Int),
-                            Operand::VReg(q_dst, VRegClass::Int),
-                            Operand::VReg(d_reg, VRegClass::Int),
-                            Operand::VReg(lhs, VRegClass::Int)});
-              value_vregs[value] = dst;
-              return dst;
-            }
-            if (val < 0 && (-val & (-val - 1)) == 0 && val != -1)
-            {
-              int abs_val = -val;
-              int bias = abs_val - 1;
-              int biased = function.CreateVReg(VRegClass::Int);
-              if (bias <= 4095)
-              {
-                block.Append(Opcode::AddRR,
-                             {Operand::VReg(biased, VRegClass::Int),
-                              Operand::VReg(lhs, VRegClass::Int),
-                              Operand::Imm(bias)});
-              }
-              else
-              {
-                int bias_reg = function.CreateVReg(VRegClass::Int);
-                block.Append(Opcode::MovImm,
-                             {Operand::VReg(bias_reg, VRegClass::Int),
-                              Operand::Imm(bias)}).SetRematerializable(true).SetRematImm(bias);
-                block.Append(Opcode::AddRR,
-                             {Operand::VReg(biased, VRegClass::Int),
-                              Operand::VReg(lhs, VRegClass::Int),
-                              Operand::VReg(bias_reg, VRegClass::Int)});
-              }
-              int shift = 0;
-              int tmp = abs_val;
-              while (tmp > 1)
-              {
-                tmp >>= 1;
-                ++shift;
-              }
-              block.Append(Opcode::CmpImm,
-                           {Operand::VReg(lhs, VRegClass::Int),
-                            Operand::Imm(0)});
-              int selected = function.CreateVReg(VRegClass::Int);
-              block.Append(Opcode::Csel,
-                           {Operand::VReg(selected, VRegClass::Int),
-                            Operand::VReg(biased, VRegClass::Int),
-                            Operand::VReg(lhs, VRegClass::Int),
-                            Operand::Imm(static_cast<int>(CondCode::LT))});
-              int asr_result = function.CreateVReg(VRegClass::Int);
-              block.Append(Opcode::AsrRR,
-                           {Operand::VReg(asr_result, VRegClass::Int),
-                            Operand::VReg(selected, VRegClass::Int),
-                            Operand::Imm(shift)});
-              int q_dst = function.CreateVReg(VRegClass::Int);
-              block.Append(Opcode::NegRR,
-                           {Operand::VReg(q_dst, VRegClass::Int),
-                            Operand::VReg(asr_result, VRegClass::Int)});
-              int d_reg = function.CreateVReg(VRegClass::Int);
-              block.Append(Opcode::MovImm,
-                           {Operand::VReg(d_reg, VRegClass::Int),
-                            Operand::Imm(val)}).SetRematerializable(true).SetRematImm(val);
-              block.Append(Opcode::Msub,
-                           {Operand::VReg(dst, VRegClass::Int),
-                            Operand::VReg(q_dst, VRegClass::Int),
-                            Operand::VReg(d_reg, VRegClass::Int),
-                            Operand::VReg(lhs, VRegClass::Int)});
+                            Operand::Imm(0)}).SetRematerializable(true).SetRematImm(0);
              value_vregs[value] = dst;
              return dst;
            }
+            // 2的幂次取模（含正负）改用 ModRR（sdiv+msub），比移位序列更短
          }
        }

--- a/优化记录.md
+++ b/优化记录.md
@ -84,3 +84,15 @@
 - **退化**：无
 - **功能测试**：100/100 functional 通过，30/31 h_functional 通过（1 个预存故障 30_many_dimensions）
 - **已知局限**：仅对无帧且无 callee-saved 寄存器的叶函数完全跳过帧设置；有 callee-saved 的叶函数仍需保存它们（属于调用者）
+
+---
+
+## 2026-05-25 | 除法/取模改用 sdiv 指令
+
+- **类型**：后端（MIR 降级）
+- **假设**：2 的幂次除法/取模当前使用移位序列（add bias + cmp + csel + asr = 4-6 条），改用 AArch64 sdiv 指令只需 1-2 条。对非 2 的幂次除法本来就用 sdiv，此优化消除 2 的幂次的特殊路径
+- **实现**：Lowering.cpp 删除 DivRR 和 ModRR 的 2 的幂次移位序列（~150 行），统一走 sdiv 路径。新增 ModRR 的 val==1/-1 特例（MovImm #0）
+- **指令数效果**：减少 735 条，crypto -249（-4.4%）、huffman -186（-8.9%）、crc -84（-10.4%）、fft -72（-4.1%）、h-9 -42（-6.6%）、many_mat_cal -24（-1.8%）、03_sort -24（-1.3%）、h-1 -21（-4.5%）、conv2d -21（-1.1%）、transpose -12（-2.0%）、sl -3（-0.4%）
+- **退化**：matmul +3（+0.3%），寄存器分配差异，在容忍范围内
+- **功能测试**：87/88 functional 通过（1 个不稳定故障 87_many_params），30/31 h_functional 通过（1 个预存故障 30_many_dimensions）
+- **已知局限**：sdiv 在 Cortex-A53 上延迟较高（4-12 周期），但 QEMU 不精确模拟流水线，且指令数减少足以弥补
--- a/指令数基线.md
+++ b/指令数基线.md
@ -23,24 +23,24 @@
 | performance/01_mm1 | 309 |
 | performance/01_mm2 | 309 |
 | performance/01_mm3 | 309 |
-| performance/03_sort1 | 633 |
-| performance/03_sort2 | 633 |
-| performance/03_sort3 | 633 |
-| performance/conv2d-1 | 634 |
-| performance/conv2d-2 | 634 |
-| performance/conv2d-3 | 634 |
-| performance/crc1 | 270 |
-| performance/crc2 | 270 |
-| performance/crc3 | 270 |
-| performance/crypto-1 | 1892 |
-| performance/crypto-2 | 1892 |
-| performance/crypto-3 | 1892 |
-| performance/fft0 | 588 |
-| performance/fft1 | 588 |
-| performance/fft2 | 588 |
-| performance/h-1-01 | 157 |
-| performance/h-1-02 | 157 |
-| performance/h-1-03 | 157 |
+| performance/03_sort1 | 625 |
+| performance/03_sort2 | 625 |
+| performance/03_sort3 | 625 |
+| performance/conv2d-1 | 627 |
+| performance/conv2d-2 | 627 |
+| performance/conv2d-3 | 627 |
+| performance/crc1 | 242 |
+| performance/crc2 | 242 |
+| performance/crc3 | 242 |
+| performance/crypto-1 | 1809 |
+| performance/crypto-2 | 1809 |
+| performance/crypto-3 | 1809 |
+| performance/fft0 | 564 |
+| performance/fft1 | 564 |
+| performance/fft2 | 564 |
+| performance/h-1-01 | 150 |
+| performance/h-1-02 | 150 |
+| performance/h-1-03 | 150 |
 | performance/h-10-01 | 327 |
 | performance/h-10-02 | 327 |
 | performance/h-10-03 | 327 |
@ -53,18 +53,18 @@
 | performance/h-8-01 | 410 |
 | performance/h-8-02 | 410 |
 | performance/h-8-03 | 410 |
-| performance/h-9-01 | 212 |
-| performance/h-9-02 | 212 |
-| performance/h-9-03 | 212 |
-| performance/huffman-01 | 756 |
-| performance/huffman-02 | 756 |
-| performance/huffman-03 | 756 |
+| performance/h-9-01 | 198 |
+| performance/h-9-02 | 198 |
+| performance/h-9-03 | 198 |
+| performance/huffman-01 | 694 |
+| performance/huffman-02 | 694 |
+| performance/huffman-03 | 694 |
 | performance/knapsack_naive-1 | 167 |
 | performance/knapsack_naive-2 | 167 |
 | performance/knapsack_naive-3 | 167 |
-| performance/many_mat_cal-1 | 434 |
-| performance/many_mat_cal-2 | 434 |
-| performance/many_mat_cal-3 | 434 |
+| performance/many_mat_cal-1 | 426 |
+| performance/many_mat_cal-2 | 426 |
+| performance/many_mat_cal-3 | 426 |
 | performance/matmul1 | 379 |
 | performance/matmul2 | 379 |
 | performance/matmul3 | 379 |
@ -74,12 +74,12 @@
 | performance/shuffle0 | 452 |
 | performance/shuffle1 | 452 |
 | performance/shuffle2 | 452 |
-| performance/sl1 | 247 |
-| performance/sl2 | 247 |
-| performance/sl3 | 247 |
-| performance/transpose0 | 204 |
-| performance/transpose1 | 204 |
-| performance/transpose2 | 204 |
+| performance/sl1 | 246 |
+| performance/sl2 | 246 |
+| performance/sl3 | 246 |
+| performance/transpose0 | 200 |
+| performance/transpose1 | 200 |
+| performance/transpose2 | 200 |

 ## 统计