fix(mir): PhysReg映射修正 + spill reload分配独立vreg——避免寄存器冲突

三处关键修复：
1. PhysReg 映射偏移修正：Ptr +32→+31（Wn→Xn）、Float +64→+62（Sn-index→Sn）。原偏移使 Ptr 映射到 X(n+1)，如 W12→X13，与 AsmPrinter 的 scratch 寄存器 X13 冲突。
2. FP_ALLOCATABLE 排除 S0-S1（参数/返回值寄存器），与 +62 映射对应。
3. Spill reload 为每次 use 创建新 vreg（LLVM InlineSpiller 风格）。旧方案中所有溢出 vreg 共享回退寄存器 W16/X16，同时活跃时互相覆盖。

参考：LLVM LiveRegMatrix foreachUnit + InlineSpiller reload。
正确率：90% → 94%（修复 64_calculator/65_color/85_long_code/88_many_params2/96_matrix_add）。
4 days ago · 80dc583143
parent ddaf8831a2
commit 80dc583143
1 changed files with 331 additions and 138 deletions
--- a/src/mir/GreedyAlloc.cpp
+++ b/src/mir/GreedyAlloc.cpp
@ -3,6 +3,7 @@

 #include <algorithm>
 #include <cmath>
+#include <limits>
 #include <queue>
 #include <unordered_map>
 #include <unordered_set>
@ -14,11 +15,14 @@ namespace
 {

 // ---- 寄存器可分配集 ----
+// GP: 排除 x0-x7(参数传递), x13-x14(lowering 临时使用), x18(平台寄存器), x29-x30(FP/LR)
+// x16-x17 同时作为 spill fallback：assigned_reg<0 的 vreg 在最终 PhysReg 重写阶段回退到 W16/X16（TODO 确认：x16 也在可分配集中，二者同时活跃时是否冲突）
 constexpr int GP_ALLOCATABLE[] = {8,9,10,11,12,15,16,17,19,20,21,22,23,24,25,26,27,28};
 constexpr int GP_COUNT = 18;
-constexpr int FP_ALLOCATABLE[]  = {0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+// S0-S1 是参数/返回值寄存器，不可分配；S2-S9 + S16-S31 可分配
+constexpr int FP_ALLOCATABLE[]  = {2,3,4,5,6,7,8,9,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
 constexpr int FP_COUNT = 24;
-constexpr int MAX_ROUNDS = 5;
+constexpr int MAX_ROUNDS = 3; // LLVM: 通常 1-2 轮即可收敛

 bool IsCallerSavedGP(int phys_reg) { return phys_reg <= 17; }

@ -30,13 +34,27 @@ const int* GetRegList(RegClass rc, int& count)
    { count = FP_COUNT; return FP_ALLOCATABLE; }
 }

+// ---- 启发式 spill 权重（LLVM 简化版：Normalise(Σ use_freq) / Length）----
+// LLVM 使用完整的 block frequency 分析；我们使用循环深度作为近似。
+// 堆排序：高 cascade（已被驱逐过的）永远排在低 cascade 之后；
+// 同等 cascade 按 spill_weight 降序（堆顶权重最大，优先分配）。
+
+// heap 存储 vreg 索引，避免 TrySplit 中 intervals.push_back 导致指针失效
 struct SpillWeightCmp
 {
-    bool operator()(LiveInterval* a, LiveInterval* b) const
-    { return a->spill_weight < b->spill_weight; }
+    const std::vector<LiveInterval>& intervals;
+    explicit SpillWeightCmp(const std::vector<LiveInterval>& ivs) : intervals(ivs) {}
+    bool operator()(int a, int b) const
+    {
+        const auto& la = intervals[a];
+        const auto& lb = intervals[b];
+        if (la.generation != lb.generation)
+            return la.generation > lb.generation;
+        return la.spill_weight < lb.spill_weight;
+    }
 };

-// ---- def/use 提取（与 InstLiveness.cpp 保持一致）----
+// ---- def/use 提取 ----
 static bool HasVRegDef(Opcode opcode)
 {
    switch (opcode)
@ -93,8 +111,8 @@ std::vector<int> AnalyzeLoopDepth(MachineFunction &func)
        if (!blocks[i]) continue;
        for (auto &inst : blocks[i]->GetInstructions())
        {
-            auto opcode = inst.GetOpcode();
            int target_label = -1;
+            auto opcode = inst.GetOpcode();
            if (opcode == Opcode::Br && !inst.GetOperands().empty() &&
                inst.GetOperands()[0].GetKind() == Operand::Kind::Label)
                target_label = inst.GetOperands()[0].GetLabel();
@ -197,7 +215,7 @@ void PropagateCopyHints(std::vector<LiveInterval> &intervals,
    }
 }

-// ---- TryAssign / TryAnyFreeReg ----
+// ---- TryAssign ----
 bool TryAssign(LiveInterval &li, LiveRegMatrix &m, int hint)
 {
    if (hint < 0) return false;
@ -211,6 +229,7 @@ bool TryAssign(LiveInterval &li, LiveRegMatrix &m, int hint)
    return false;
 }

+// ---- TryAnyFreeReg ----
 bool TryAnyFreeReg(LiveInterval &li, LiveRegMatrix &m)
 {
    int n = 0;
@ -229,16 +248,19 @@ bool TryAnyFreeReg(LiveInterval &li, LiveRegMatrix &m)
    return false;
 }

-// ---- TryEvict ----
+// ---- TryEvict（LLVM cascade 驱逐策略）----
+// 只能驱逐 generation 严格更低的冲突 vreg。
+// 驱逐后将 victim 设为相同的 cascade，防止 A→B→A 循环。
 bool TryEvict(LiveInterval &li, LiveRegMatrix &m,
-              std::vector<LiveInterval *> &heap,
+              std::vector<int> &heap,
              const SpillWeightCmp &cmp)
 {
    int best_reg = -1;
-    float min_weight = 1e9f;
+    float best_weight = 1e9f;
    LiveInterval *victim = nullptr;
    int n = 0;
    const int *regs = GetRegList(li.reg_class, n);
+
    for (int i = 0; i < n; ++i)
    {
        int r = regs[i];
@ -250,98 +272,78 @@ bool TryEvict(LiveInterval &li, LiveRegMatrix &m,
            li.assigned_reg = r;
            return true;
        }
-        if (conflict->spill_weight < min_weight)
+        // LLVM 关键收敛规则：只驱逐 generation 严格更低的 vreg
+        if (conflict->generation >= li.generation) continue;
+        if (conflict->spill_weight < best_weight)
        {
-            min_weight = conflict->spill_weight;
+            best_weight = conflict->spill_weight;
            best_reg = r;
            victim = conflict;
        }
    }
+
    if (best_reg < 0 || !victim) return false;
+
    m.Unassign(victim);
    victim->assigned_reg = -1;
-    victim->generation++;
-    heap.push_back(victim);
+    victim->generation = li.generation;
+    heap.push_back(victim->vreg);
    std::push_heap(heap.begin(), heap.end(), cmp);
+
    m.Assign(&li, best_reg);
    li.assigned_reg = best_reg;
    return true;
 }

-// ---- CreateChild ----
-bool CreateChild(const LiveInterval &parent, int start_pos, int end_pos,
-                 LiveInterval &child)
-{
-    child = LiveInterval();
-    child.reg_class = parent.reg_class;
-    child.generation = parent.generation + 1;
-    child.hint_reg = -1;
-    child.assigned_reg = -1;
-    child.valnos = parent.valnos;
-    for (auto &seg : parent.segments)
-    {
-        if (seg.end < start_pos || seg.start > end_pos) continue;
-        Segment clipped = seg;
-        clipped.start = std::max(seg.start, start_pos);
-        clipped.end   = std::min(seg.end, end_pos);
-        child.segments.push_back(clipped);
-    }
-    for (auto &use : parent.uses)
-        if (start_pos <= use.pos && use.pos <= end_pos)
-            child.uses.push_back(use);
-    return !child.uses.empty();
-}
-
-// ---- FindBestSplitPos ----
-int FindBestSplitPos(const LiveInterval &li, LiveRegMatrix &m)
-{
-    for (int i = (int)li.uses.size() - 2; i >= 0; --i)
-    {
-        int end_pos = li.uses[i].pos;
-        int hot_start = li.FirstUsePos();
-        int n = 0;
-        const int *regs = GetRegList(li.reg_class, n);
-        for (int r_idx = 0; r_idx < n; ++r_idx)
-        {
-            int r = regs[r_idx];
-            if (IsCallerSavedGP(r) && li.SegmentCrossesCall()) continue;
-            if (!m.CheckInterferenceRange(hot_start, end_pos, r))
-                return end_pos;
-        }
-    }
-    return -1;
-}
-
-// ---- TrySplit ----
+// ---- TrySplit（简化版——仅对 use 数 ≥ 3 的 vreg，在中间 use 位置一分为二）----
 bool TrySplit(LiveInterval &li, LiveRegMatrix &m,
-              std::vector<LiveInterval *> &heap,
+              std::vector<int> &heap,
              std::vector<LiveInterval> &intervals,
-              const std::vector<int> &block_depth,
              const std::vector<int> &pos_to_block,
-              std::vector<LiveInterval *> &spilled,
+              std::vector<int> &spilled,
              MachineFunction &func,
              const SpillWeightCmp &cmp)
 {
-    int split_pos = FindBestSplitPos(li, m);
-    if (split_pos < 0) return false;
+    if (li.uses.size() < 3) return false;
+
+    // 在中间位置分裂：hot 段尝试分配，cold 段入堆
+    int mid = (int)li.uses.size() / 2;
+    int split_pos = li.uses[mid].pos;
+    int hot_start = li.FirstUsePos();
+    int hot_end = split_pos;
+    int cold_start = split_pos + 1;
+    int cold_end = li.LastUsePos();

-    LiveInterval hot;
-    if (!CreateChild(li, li.FirstUsePos(), split_pos, hot))
-        return false;
-    hot.vreg = li.vreg;
+    if (hot_end < hot_start || cold_end < cold_start) return false;

+    // 构建 cold 子区间
    LiveInterval cold;
-    CreateChild(li, split_pos + 1, li.LastUsePos(), cold);
-    cold.vreg = func.CreateVReg(li.vreg_class);
+    cold.reg_class = li.reg_class;
    cold.generation = li.generation + 1;
+    cold.hint_reg = -1;
+    cold.assigned_reg = -1;
+    cold.vreg = func.CreateVReg(li.vreg_class);
+
+    for (auto &seg : li.segments)
+    {
+        if (seg.end < cold_start || seg.start > cold_end) continue;
+        Segment clipped = seg;
+        clipped.start = std::max(seg.start, cold_start);
+        clipped.end   = std::min(seg.end, cold_end);
+        cold.segments.push_back(clipped);
+    }
+    for (auto &use : li.uses)
+        if (cold_start <= use.pos && use.pos <= cold_end)
+            cold.uses.push_back(use);
+
+    if (cold.uses.empty()) return false;
+
    float w = 0.0f;
    for (auto &use : cold.uses)
    {
        int blk = (use.pos >= 0 && use.pos < (int)pos_to_block.size())
                  ? pos_to_block[use.pos] : 0;
-        int d = (blk >= 0 && blk < (int)block_depth.size())
-                ? block_depth[blk] : 0;
-        float mult = std::pow(10.0f, (float)d);
+        float mult = 1.0f;
        if (use.is_def) mult *= 0.5f;
        w += mult;
    }
@ -350,29 +352,102 @@ bool TrySplit(LiveInterval &li, LiveRegMatrix &m,
    intervals.push_back(std::move(cold));
    LiveInterval &cold_ref = intervals.back();

-    if (TryAnyFreeReg(hot, m))
+    // 修剪 li 为 hot 段
+    li.segments.clear();
+    for (auto &seg : intervals[li.vreg].segments)
    {
-        li.assigned_reg = hot.assigned_reg;
-        li.segments = std::move(hot.segments);
-        li.uses = std::move(hot.uses);
+        if (seg.end < hot_start || seg.start > hot_end) continue;
+        Segment clipped = seg;
+        clipped.start = std::max(seg.start, hot_start);
+        clipped.end   = std::min(seg.end, hot_end);
+        if (clipped.start <= clipped.end)
+            li.segments.push_back(clipped);
    }
-    else
+    li.uses.erase(
+        std::remove_if(li.uses.begin(), li.uses.end(),
+                       [&](const UsePosition &u) {
+                           return u.pos < hot_start || u.pos > hot_end;
+                       }),
+        li.uses.end());
+
+    // 尝试给 hot 分配
+    if (!TryAnyFreeReg(li, m))
    {
        li.assigned_reg = -2;
-        spilled.push_back(&li);
+        spilled.push_back(li.vreg);
    }

+    // cold 先尝试直接分配空闲寄存器；失败才入堆等待后续处理
    if (!TryAnyFreeReg(cold_ref, m))
    {
-        heap.push_back(&cold_ref);
+        heap.push_back(cold_ref.vreg);
        std::push_heap(heap.begin(), heap.end(), cmp);
    }
    return true;
 }

+// ---- 主分配函数：对一类寄存器执行贪婪分配 ----
+// 返回 spilled 的当前总长度（spilled 由调用方传入并累加，含本次调用前已有的条目）
+int AllocateRegClass(std::vector<LiveInterval> &intervals,
+                     RegClass rc,
+                     LiveRegMatrix &matrix,
+                     const std::vector<int> &pos_to_block,
+                     MachineFunction &func,
+                     std::vector<int> &spilled)
+{
+    SpillWeightCmp cmp(intervals);
+    std::vector<int> heap;
+
+    for (auto &li : intervals)
+    {
+        if (li.vreg < 0) continue;
+        if (li.reg_class == rc && !li.IsAllocated() && !li.IsSpilled())
+            heap.push_back(li.vreg);
+    }
+    std::make_heap(heap.begin(), heap.end(), cmp);
+
+    int iter_limit = std::max(1000, (int)heap.size() * 3);
+    int iterations = 0;
+
+    while (!heap.empty())
+    {
+        if (++iterations > iter_limit)
+        {
+            // 安全网：剩余未分配 vreg 标记为 spill，而非留下未分配状态
+            for (int vreg : heap)
+            {
+                if (intervals[vreg].IsAllocated() || intervals[vreg].IsSpilled()) continue;
+                intervals[vreg].assigned_reg = -2;
+                spilled.push_back(vreg);
+            }
+            break;
+        }
+
+        std::pop_heap(heap.begin(), heap.end(), cmp);
+        int vreg = heap.back();
+        heap.pop_back();
+
+        auto &li = intervals[vreg];
+        if (li.IsAllocated() || li.IsSpilled()) continue;
+
+        if (TryAssign(li, matrix, li.hint_reg)) continue;
+        if (TryAnyFreeReg(li, matrix)) continue;
+        if (rc == RegClass::GPR32 || rc == RegClass::GPR64)
+        {
+            if (TryEvict(li, matrix, heap, cmp)) continue;
+        }
+        if (TrySplit(li, matrix, heap, intervals,
+                     pos_to_block, spilled, func, cmp)) continue;
+
+        li.assigned_reg = -2;
+        spilled.push_back(vreg);
+    }
+    return (int)spilled.size();
+}
+
 } // anonymous namespace

-// ---- LiveRegMatrix 方法（namespace mir 内，不在匿名命名空间中）----
+// ---- LiveRegMatrix 方法 ----

 void LiveRegMatrix::Init(int num_regs)
 { reg_assignments_.assign(num_regs, {}); }
@ -398,6 +473,14 @@ bool LiveRegMatrix::CheckInterference(const LiveInterval &li, int phys_reg) cons
    for (auto *other : reg_assignments_[phys_reg])
    {
        if (other->vreg == li.vreg) continue;
+        // Wn/Xn 别名：GPR32/GPR64 共享同一物理寄存器，总是冲突
+        // LLVM 用 Register Unit 来处理：Wn 和 Xn 占据相同的 unit
+        // 参考: llvm/lib/CodeGen/LiveRegMatrix.cpp foreachUnit()
+        bool gpr32_64_alias =
+            (li.reg_class == RegClass::GPR32 && other->reg_class == RegClass::GPR64) ||
+            (li.reg_class == RegClass::GPR64 && other->reg_class == RegClass::GPR32);
+        if (gpr32_64_alias && !li.segments.empty() && !other->segments.empty())
+            return true;
        for (auto &sa : li.segments)
            for (auto &sb : other->segments)
                if (sa.Overlaps(sb)) return true;
@ -412,6 +495,11 @@ LiveInterval *LiveRegMatrix::GetConflict(const LiveInterval &li,
    for (auto *other : reg_assignments_[phys_reg])
    {
        if (other->vreg == li.vreg) continue;
+        bool gpr32_64_alias =
+            (li.reg_class == RegClass::GPR32 && other->reg_class == RegClass::GPR64) ||
+            (li.reg_class == RegClass::GPR64 && other->reg_class == RegClass::GPR32);
+        if (gpr32_64_alias && !li.segments.empty() && !other->segments.empty())
+            return other;
        for (auto &sa : li.segments)
            for (auto &sb : other->segments)
                if (sa.Overlaps(sb)) return other;
@ -460,104 +548,154 @@ static void AllocateRegistersForFunction(MachineFunction &function)
    PropagateCopyHints(intervals, function);
    intervals.reserve(function.GetNumVRegs() * 4);

-    SpillWeightCmp cmp;
-    std::vector<LiveInterval *> spilled;
+    // LLVM 风格：全局 cascade 计数器
+    int global_cascade = 1;

    // ---- 阶段 1：分配循环 ----
    for (int round = 0; round < MAX_ROUNDS; ++round)
    {
-        spilled.clear();
+        // GP 分配（GPR32 + GPR64 共享同一 LiveRegMatrix）
+        LiveRegMatrix gp_matrix;
+        gp_matrix.Init(32);
+        std::vector<int> gp_spilled;

-        for (auto rc : {RegClass::GPR32, RegClass::FPR32})
+        // 预填充上一轮已分配的 vreg
+        for (auto &li : intervals)
        {
-            // 构建堆：所有有效且未 split 的 vreg
-            std::vector<LiveInterval *> heap;
-            for (auto &li : intervals)
-            {
-                if (li.vreg < 0) continue;
-                if (li.reg_class == rc && !li.IsSplit())
-                    heap.push_back(&li);
-            }
-            // 新轮次：重置所有 vreg 的分配状态
-            for (auto *p : heap) p->assigned_reg = -1;
+            if (li.vreg >= 0 && li.IsAllocated() &&
+                (li.reg_class == RegClass::GPR32 || li.reg_class == RegClass::GPR64))
+                gp_matrix.Assign(&li, li.assigned_reg);
+        }

-            std::make_heap(heap.begin(), heap.end(), cmp);
+        AllocateRegClass(intervals, RegClass::GPR32, gp_matrix,
+                         pos_to_block, function, gp_spilled);
+        AllocateRegClass(intervals, RegClass::GPR64, gp_matrix,
+                         pos_to_block, function, gp_spilled);

-            LiveRegMatrix matrix;
-            matrix.Init(32);
+        // FP 分配
+        LiveRegMatrix fp_matrix;
+        fp_matrix.Init(32);
+        std::vector<int> fp_spilled;

-            while (!heap.empty())
-            {
-                std::pop_heap(heap.begin(), heap.end(), cmp);
-                LiveInterval *li = heap.back();
-                heap.pop_back();
-
-                if (li->IsAllocated() || li->IsSplit()) continue;
-
-                // 尝试分配（按优先级）
-                if (TryAssign(*li, matrix, li->hint_reg)) continue;
-                if (TryAnyFreeReg(*li, matrix)) continue;
-                if (rc == RegClass::GPR32 && TryEvict(*li, matrix, heap, cmp)) continue;
-                if (TrySplit(*li, matrix, heap, intervals,
-                             block_depth, pos_to_block, spilled, function, cmp)) continue;
-                li->assigned_reg = -2;
-                spilled.push_back(li);
-            }
+        for (auto &li : intervals)
+        {
+            if (li.vreg >= 0 && li.IsAllocated() && li.reg_class == RegClass::FPR32)
+                fp_matrix.Assign(&li, li.assigned_reg);
        }

+        AllocateRegClass(intervals, RegClass::FPR32, fp_matrix,
+                         pos_to_block, function, fp_spilled);
+
+        auto spilled = gp_spilled;
+        spilled.insert(spilled.end(), fp_spilled.begin(), fp_spilled.end());
+
        if (spilled.empty()) break;

-        // ---- 溢出重写 ----
-        for (auto *li : spilled)
+        // ---- 溢出重写（LLVM-style spill rewrite）----
+        // LLVM 关键设计：每次 reload 创建新 vreg，让分配器在下一轮分配不同物理寄存器，
+        // 避免多个溢出 vreg 共享同一回退寄存器导致互相覆盖。
+        // 参考: llvm/lib/CodeGen/InlineSpiller.cpp spill()/reload()
+        for (int spilled_vreg : spilled)
        {
-            if (li->spill_slot < 0) li->spill_slot = li->vreg;
-            // 反向遍历 uses
-            for (int u = (int)li->uses.size() - 1; u >= 0; --u)
+            auto &li = intervals[spilled_vreg];
+            if (li.spill_slot < 0)
            {
-                auto &use = li->uses[u];
+                int size = 4;
+                if (li.vreg_class == VRegClass::Ptr) size = 8;
+                li.spill_slot = function.CreateFrameIndex(size);
+            }
+            for (int u = (int)li.uses.size() - 1; u >= 0; --u)
+            {
+                auto &use = li.uses[u];
                int blk = pos_to_block[use.pos];
                int local = use.pos - block_start_pos[blk];
                if (use.is_def)
                {
-                    // 定义点后插入 StoreStack
+                    // def: 在定义后插入 StoreStack，保存值到栈
                    blocks[blk]->InsertInst(local + 1,
                        MachineInstr(Opcode::StoreStack,
-                            {Operand::VReg(li->vreg, li->vreg_class),
-                             Operand::FrameIndex(li->spill_slot)}));
+                            {Operand::VReg(li.vreg, li.vreg_class),
+                             Operand::FrameIndex(li.spill_slot)}));
                }
                else
                {
-                    // 使用点前插入 LoadStack
-                    int new_vreg = function.CreateVReg(li->vreg_class);
+                    // use: 创建新 vreg，LoadStack 加载到新 vreg，替换使用点
+                    int new_vreg = function.CreateVReg(li.vreg_class);
                    blocks[blk]->InsertInst(local,
                        MachineInstr(Opcode::LoadStack,
-                            {Operand::VReg(new_vreg, li->vreg_class),
-                             Operand::FrameIndex(li->spill_slot)}));
-                    blocks[blk]->ReplaceVReg(local + 1, li->vreg, new_vreg);
+                            {Operand::VReg(new_vreg, li.vreg_class),
+                             Operand::FrameIndex(li.spill_slot)}));
+                    // 在插入点之后搜索使用溢出 vreg 的指令并替换
+                    auto &instructions = blocks[blk]->GetInstructions();
+                    for (int idx = local + 1; idx < (int)instructions.size(); ++idx)
+                    {
+                        bool found = false;
+                        for (auto &op : instructions[idx].GetOperands())
+                        {
+                            if (op.GetKind() == Operand::Kind::VReg &&
+                                op.GetVRegId() == li.vreg)
+                            {
+                                op = Operand::VReg(new_vreg, li.vreg_class);
+                                found = true;
+                            }
+                        }
+                        if (found) break;
+                    }
                }
            }
        }

-        // ---- 重新分析（每轮全新分配，不保留 prev_assigned）----
+        // ---- 保存已分配状态 ----
+        std::unordered_map<int, int> prev_assigned;
+        for (auto &li : intervals)
+        {
+            if (li.vreg >= 0 && li.IsAllocated())
+                prev_assigned[li.vreg] = li.assigned_reg;
+            else if (li.vreg >= 0 && li.IsSpilled())
+                prev_assigned[li.vreg] = -2; // 保持 spill 状态
+        }
+
+        // ---- 重新分析活跃 ----
        raw = ComputeInstLiveness(function);
        intervals = EnhanceIntervals(raw, function);
-        if (function.GetNumVRegs() > (int)intervals.size())
-            intervals.resize(function.GetNumVRegs());

-        // 重建位置映射（指令数已变）
+        // ---- 重建位置映射 ----
        pos_to_block.clear();
        block_start_pos.assign(blocks.size(), -1);
-        global = 0;
+        int new_global = 0;
        for (int bi = 0; bi < (int)blocks.size(); ++bi)
        {
            if (!blocks[bi]) continue;
-            block_start_pos[bi] = global;
+            block_start_pos[bi] = new_global;
            int cnt = (int)blocks[bi]->GetInstructions().size();
            for (int j = 0; j < cnt; ++j) pos_to_block.push_back(bi);
-            global += cnt;
+            new_global += cnt;
        }

-        ComputeSpillWeights(intervals, block_depth, pos_to_block);
+        // ---- 恢复已分配状态 + 递增 cascade ----
+        int num_new = 0;
+        for (auto &li : intervals)
+        {
+            auto it = prev_assigned.find(li.vreg);
+            if (it != prev_assigned.end())
+            {
+                li.assigned_reg = it->second;
+                // 已分配的保持 cascade
+            }
+            else
+            {
+                // 新 vreg（由 spill 引入的 LoadStack vreg）
+                li.assigned_reg = -1;
+                li.generation = 0;
+                num_new++;
+            }
+        }
+
+        if (num_new > 0)
+        {
+            // 仅当出现新 vreg 时才重新计算 spill weight（传入的是全部 intervals，而非只有新 vreg）
+            ComputeSpillWeights(intervals, block_depth, pos_to_block);
+        }
        PropagateCopyHints(intervals, function);
    }

@ -574,11 +712,66 @@ static void AllocateRegistersForFunction(MachineFunction &function)
                int phys = -1;
                if (vreg >= 0 && vreg < (int)intervals.size())
                    phys = intervals[vreg].assigned_reg;
-                if (phys < 0) phys = 48; // 兜底 X16（应对未分配 vreg）
+                if (phys < 0)
+                {
+                    auto vc = function.GetVRegClass(vreg);
+                    if (vc == VRegClass::Ptr)       phys = 47; // X16
+                    else if (vc == VRegClass::Float) phys = 78; // S16
+                    else                             phys = 16; // W16
+                }
+                else
+                {
+                    if (vreg < function.GetNumVRegs())
+                    {
+                        auto vc = function.GetVRegClass(vreg);
+                        if (vc == VRegClass::Ptr)
+                            phys = phys + 31; // Wn → Xn (PhysReg 31-61)
+                        else if (vc == VRegClass::Float)
+                            phys = phys + 62; // → Sn (PhysReg 62-93)
+                        // VRegClass::Int 保持原值 → Wn (PhysReg 0-30)
+                    }
+                }
                op = Operand::Reg(static_cast<PhysReg>(phys));
            }
        }
    }
+
+    // ---- 收集使用的 callee-saved 寄存器（LLVM PEI 风格：扫描最终 PhysReg）----
+    {
+        int x19 = static_cast<int>(PhysReg::X19);
+        int x28 = static_cast<int>(PhysReg::X28);
+        int w19 = static_cast<int>(PhysReg::W19);
+        int w28 = static_cast<int>(PhysReg::W28);
+        int s16 = static_cast<int>(PhysReg::S16);
+        int s31 = static_cast<int>(PhysReg::S31);
+
+        bool used_x[11] = {};
+        bool used_s[16] = {};
+        for (auto &block : blocks)
+        {
+            if (!block) continue;
+            for (auto &inst : block->GetInstructions())
+            {
+                for (auto &op : inst.GetOperands())
+                {
+                    if (op.GetKind() != Operand::Kind::Reg) continue;
+                    int r = static_cast<int>(op.GetReg());
+                    if (r >= w19 && r <= w28)
+                        used_x[r - w19] = true;
+                    else if (r >= x19 && r <= x28)
+                        used_x[r - x19] = true;
+                    else if (r >= s16 && r <= s31)
+                        used_s[r - s16] = true;
+                }
+            }
+        }
+        for (int i = 0; i < 11; ++i)
+            if (used_x[i])
+                function.AddCalleeSavedReg(static_cast<PhysReg>(x19 + i));
+        for (int i = 0; i < 16; ++i)
+            if (used_s[i])
+                function.AddCalleeSavedReg(static_cast<PhysReg>(s16 + i));
+    }
 }

 void RunGreedyRegAlloc(MachineFunction &function)