perf(mir): 常量除法魔法数优化——smull+asr 替代 sdiv

实现 Hacker's Delight 有符号除法魔法数算法,将 x/C 替换为乘法逆元序列。
32/60 性能测试受益,常量除法的 sdiv 全部消除(仅剩除数为变量的 sdiv,无法用该方法优化)。

- M 为正(MSB=0):smull + asr #(32+s) + sub 符号修正
- M 为负(MSB=1):smull + lsr #32 + add + asr #s + sub 修正
- ModRR 同样受益:q = magic_div(x,C); r = x - q*C (msub)
- 添加 Umull 操作码(与 Smull 对称,为后续优化预留)
- 性能分 33→55(+65%),几何平均 0.55→0.55(因 gcc ref 更多成功运行)
lzk
lzkk 5 days ago
parent ea790dd05d
commit 4403dc08b8

@ -169,6 +169,7 @@ namespace mir
Csel,
Csneg,
Smull,
Umull,
Msub,
NegRR,
FAddRR,

@ -744,6 +744,22 @@ namespace mir
}
return;
case Opcode::Umull:
if (operands.size() >= 3)
{
os << " umull ";
if (operands[0].GetKind() == Operand::Kind::Reg && IsWReg(operands[0].GetReg()))
os << PhysRegName(static_cast<PhysReg>(static_cast<int>(operands[0].GetReg()) + 31));
else
PrintOperand(operands[0], os);
os << ", ";
PrintOperand(operands[1], os);
os << ", ";
PrintOperand(operands[2], os);
os << "\n";
}
return;
case Opcode::Msub:
case Opcode::Madd:
if (operands.size() >= 4)

@ -253,6 +253,140 @@ namespace mir
return false;
}
// Result of the magic-number computation for signed division by a constant.
struct SignedDivMagic
{
    unsigned M; // magic multiplier, stored as a 32-bit unsigned bit pattern
    int s;      // shift amount to apply after the high-half multiply
};

// Computes the magic number for the signed 32-bit division x / d, following
// the "magic" routine from Hacker's Delight, chapter 10.
//
// Precondition: d must not be 0, 1 or -1 (d == 0 divides by zero below, and
// the routine does not produce a usable multiplier for |d| == 1). Per the
// original note on this function, powers of two are filtered out by the
// caller before it gets here. d == -2^31 is accepted.
//
// Returns {M, s}: M is the 32-bit multiplier (its top bit may be set, i.e. it
// is negative when reinterpreted as a signed value) and s is the extra right
// shift applied to the high 32 bits of the 64-bit product.
static SignedDivMagic ComputeSignedDivMagic(int d)
{
    static_assert(sizeof(unsigned) == 4, "magic-number arithmetic assumes a 32-bit unsigned");

    const unsigned two31 = 0x80000000u;
    const unsigned ud = static_cast<unsigned>(d);
    // |d| computed in unsigned arithmetic: `-d` on an int overflows (UB) for
    // d == -2^31, whereas the modular negation yields the intended 2^31.
    const unsigned ad = d < 0 ? 0u - ud : ud;
    const unsigned t = two31 + (ud >> 31);
    const unsigned anc = t - 1 - t % ad; // absolute value of nc

    int p = 31;
    unsigned q1 = two31 / anc;      // q1 = 2^p / |nc|
    unsigned r1 = two31 - q1 * anc; // r1 = rem(2^p, |nc|)
    unsigned q2 = two31 / ad;       // q2 = 2^p / |d|
    unsigned r2 = two31 - q2 * ad;  // r2 = rem(2^p, |d|)
    unsigned delta = 0;
    do
    {
        ++p;
        // Advance both quotient/remainder pairs from 2^(p-1) to 2^p.
        // All comparisons here must be unsigned.
        q1 = 2 * q1;
        r1 = 2 * r1;
        if (r1 >= anc)
        {
            ++q1;
            r1 -= anc;
        }
        q2 = 2 * q2;
        r2 = 2 * r2;
        if (r2 >= ad)
        {
            ++q2;
            r2 -= ad;
        }
        delta = ad - r2;
    } while (q1 < delta || (q1 == delta && r1 == 0));

    SignedDivMagic mag;
    mag.M = q2 + 1;
    if (d < 0)
        mag.M = 0u - mag.M; // two's-complement negation, kept in unsigned arithmetic
    mag.s = p - 32;
    return mag;
}
// 发射魔法数除法序列x / dd 为常量d != 1/-1/2^n
// 返回 dst vreg
static void EmitMagicDiv(int dst, int lhs, int d, MachineFunction &function,
MachineBasicBlock &block)
{
auto mag = ComputeSignedDivMagic(d);
int magic_vreg = function.CreateVReg(VRegClass::Int);
int sign_vreg = function.CreateVReg(VRegClass::Int);
// 加载魔法数常量
block.Append(Opcode::MovImm,
{Operand::VReg(magic_vreg, VRegClass::Int),
Operand::Imm(static_cast<int>(mag.M))})
.SetRematerializable(true).SetRematImm(static_cast<int>(mag.M));
if ((mag.M & 0x80000000) == 0)
{
// Case A: M 为正signed 32-bit 正数)
// smull xh, w_n, w_M → 64-bit 有符号乘积
int xh_vreg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::Smull,
{Operand::VReg(xh_vreg, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::VReg(magic_vreg, VRegClass::Int)});
// asr/lsr xh, xh, #(32+s)
int total_shift = 32 + mag.s;
if (mag.s == 0)
block.Append(Opcode::Lsr64RR,
{Operand::VReg(xh_vreg, VRegClass::Int),
Operand::VReg(xh_vreg, VRegClass::Int),
Operand::Imm(total_shift)});
else
block.Append(Opcode::Asr64RR,
{Operand::VReg(xh_vreg, VRegClass::Int),
Operand::VReg(xh_vreg, VRegClass::Int),
Operand::Imm(total_shift)});
// sub w_dst, w_h, w_n, asr #31符号修正
block.Append(Opcode::AsrRR,
{Operand::VReg(sign_vreg, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(31)});
block.Append(Opcode::SubRR,
{Operand::VReg(dst, VRegClass::Int),
Operand::VReg(xh_vreg, VRegClass::Int),
Operand::VReg(sign_vreg, VRegClass::Int)});
}
else
{
// Case B: M 的 MSB 为 1signed 32-bit 中为负数)
// smull xh, w_n, w_M → 有符号 64 位乘积M 解释为有符号负数)
int xh_vreg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::Smull,
{Operand::VReg(xh_vreg, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::VReg(magic_vreg, VRegClass::Int)});
// lsr xh, xh, #32 → 取高 32 位(无符号移位)
block.Append(Opcode::Lsr64RR,
{Operand::VReg(xh_vreg, VRegClass::Int),
Operand::VReg(xh_vreg, VRegClass::Int),
Operand::Imm(32)});
// add w_tmp, w_n, w_h
int tmp_vreg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::AddRR,
{Operand::VReg(tmp_vreg, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::VReg(xh_vreg, VRegClass::Int)});
// asr w_tmp, w_tmp, #s
if (mag.s > 0)
block.Append(Opcode::AsrRR,
{Operand::VReg(tmp_vreg, VRegClass::Int),
Operand::VReg(tmp_vreg, VRegClass::Int),
Operand::Imm(mag.s)});
// sub w_dst, w_tmp, w_n, asr #31符号修正
block.Append(Opcode::AsrRR,
{Operand::VReg(sign_vreg, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int),
Operand::Imm(31)});
block.Append(Opcode::SubRR,
{Operand::VReg(dst, VRegClass::Int),
Operand::VReg(tmp_vreg, VRegClass::Int),
Operand::VReg(sign_vreg, VRegClass::Int)});
}
}
// Forward declaration of EmitIntValue; its definition is outside this hunk.
// NOTE(review): presumably returns the vreg that holds `value` after lowering
// it into `block` — confirm against the definition.
static int EmitIntValue(const ir::Value *value, MachineFunction &function,
ValueVRegMap &value_vregs, const LocalScalarMap &scalar_slots,
const LocalArrayMap &array_slots, MachineBasicBlock &block);
@ -636,6 +770,25 @@ namespace mir
value_vregs[value] = dst;
return dst;
}
// Magic-number division: x / C, where C is not 1, -1 or a power of two
if (val > 2 && (val & (val - 1)) != 0)
{
EmitMagicDiv(dst, lhs, val, function, block);
value_vregs[value] = dst;
return dst;
}
// Division by a negative constant: x / (-C) → -(x / C)
if (val < -1 && ((-val) & ((-val) - 1)) != 0)
{
int pos_dst = function.CreateVReg(VRegClass::Int);
EmitMagicDiv(pos_dst, lhs, -val, function, block);
block.Append(Opcode::NegRR,
{Operand::VReg(dst, VRegClass::Int),
Operand::VReg(pos_dst, VRegClass::Int)});
value_vregs[value] = dst;
return dst;
}
}
}
@ -727,10 +880,45 @@ namespace mir
value_vregs[value] = dst;
return dst;
}
// Magic-number remainder: x % C = x - (x / C) * C
if (val > 2 && (val & (val - 1)) != 0)
{
int q_vreg = function.CreateVReg(VRegClass::Int);
EmitMagicDiv(q_vreg, lhs, val, function, block);
int d_vreg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::MovImm,
{Operand::VReg(d_vreg, VRegClass::Int),
Operand::Imm(val)})
.SetRematerializable(true).SetRematImm(val);
block.Append(Opcode::Msub,
{Operand::VReg(dst, VRegClass::Int),
Operand::VReg(q_vreg, VRegClass::Int),
Operand::VReg(d_vreg, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int)});
value_vregs[value] = dst;
return dst;
}
if (val < -1 && ((-val) & ((-val) - 1)) != 0)
{
int pos_val = -val;
int q_vreg = function.CreateVReg(VRegClass::Int);
EmitMagicDiv(q_vreg, lhs, pos_val, function, block);
int d_vreg = function.CreateVReg(VRegClass::Int);
block.Append(Opcode::MovImm,
{Operand::VReg(d_vreg, VRegClass::Int),
Operand::Imm(pos_val)})
.SetRematerializable(true).SetRematImm(pos_val);
block.Append(Opcode::Msub,
{Operand::VReg(dst, VRegClass::Int),
Operand::VReg(q_vreg, VRegClass::Int),
Operand::VReg(d_vreg, VRegClass::Int),
Operand::VReg(lhs, VRegClass::Int)});
value_vregs[value] = dst;
return dst;
}
}
}
// Immediate folding: if operand 2 of AddRR/SubRR is a constant <= 4095, use an Imm operand directly
int imm_val = 0;
if ((opcode == Opcode::AddRR || opcode == Opcode::SubRR) &&
TryGetConstantInt(bin->GetRhs(), imm_val))

@ -234,6 +234,7 @@ namespace mir
break;
case Opcode::Smull:
case Opcode::Umull:
if (ops.size() >= 3)
{
if (ops[0].GetKind() == Operand::Kind::VReg)

Loading…
Cancel
Save