From d2882fb69a4c6403ba005350b525ded7a6b84ac7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=AE=89=E5=B3=BB=E9=82=91?= <2294450067@qq.com>
Date: Tue, 2 Jun 2026 23:56:30 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=BA=86=E5=BE=AA=E7=8E=AF?=
 =?UTF-8?q?=E4=BA=A4=E6=8D=A2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 2026test.sh                                   |  639 ++++++
 2026test/functional/00_main.sy                |    0
 2026test/functional/01_var_defn2.sy           |    0
 2026test/functional/02_var_defn3.sy           |    0
 2026test/functional/03_arr_defn2.sy           |    0
 2026test/functional/04_arr_defn3.sy           |    0
 2026test/functional/05_arr_defn4.sy           |    0
 2026test/functional/06_const_var_defn2.sy     |    0
 2026test/functional/07_const_var_defn3.sy     |    0
 2026test/functional/08_const_array_defn.sy    |    0
 2026test/functional/09_func_defn.sy           |    0
 2026test/functional/10_var_defn_func.sy       |    0
 2026test/functional/11_add2.sy                |    0
 2026test/functional/12_addc.sy                |    0
 2026test/functional/13_sub2.sy                |    0
 2026test/functional/14_subc.sy                |    0
 2026test/functional/15_mul.sy                 |    0
 2026test/functional/16_mulc.sy                |    0
 2026test/functional/17_div.sy                 |    0
 2026test/functional/18_divc.sy                |    0
 2026test/functional/19_mod.sy                 |    0
 2026test/functional/20_rem.sy                 |    0
 2026test/functional/21_if_test2.sy            |    0
 2026test/functional/22_if_test3.sy            |    0
 2026test/functional/23_if_test4.sy            |    0
 2026test/functional/24_if_test5.sy            |    0
 2026test/functional/25_while_if.sy            |    0
 2026test/functional/26_while_test1.sy         |    0
 2026test/functional/27_while_test2.sy         |    0
 2026test/functional/28_while_test3.sy         |    0
 2026test/functional/29_break.sy               |    0
 2026test/functional/30_continue.sy            |    0
 2026test/functional/31_while_if_test1.sy      |    0
 2026test/functional/32_while_if_test2.sy      |    0
 2026test/functional/33_while_if_test3.sy      |    0
 2026test/functional/34_arr_expr_len.sy        |    0
 2026test/functional/35_op_priority1.sy        |    0
 2026test/functional/36_op_priority2.sy        |    0
 2026test/functional/37_op_priority3.sy        |    0
 2026test/functional/38_op_priority4.in        |    0
 2026test/functional/38_op_priority4.sy        |    0
 2026test/functional/39_op_priority5.sy        |    0
 2026test/functional/40_unary_op.sy            |    0
 2026test/functional/41_unary_op2.sy           |    0
 2026test/functional/42_empty_stmt.sy          |    0
 2026test/functional/43_logi_assign.in         |    0
 2026test/functional/43_logi_assign.sy         |    0
 2026test/functional/44_stmt_expr.sy           |    0
 2026test/functional/45_comment1.sy            |    0
 2026test/functional/46_hex_defn.sy            |    0
 2026test/functional/47_hex_oct_add.sy         |    0
 2026test/functional/48_assign_complex_expr.sy |    0
 2026test/functional/49_if_complex_expr.sy     |    0
 2026test/functional/50_short_circuit.in       |    0
 2026test/functional/50_short_circuit.sy       |    0
 2026test/functional/51_short_circuit3.sy      |    0
 2026test/functional/52_scope.sy               |    0
 2026test/functional/53_scope2.sy              |    0
 2026test/functional/54_hidden_var.sy          |    0
 2026test/functional/55_sort_test1.sy          |    0
 2026test/functional/56_sort_test2.sy          |    0
 2026test/functional/57_sort_test3.sy          |    0
 2026test/functional/58_sort_test4.sy          |    0
 2026test/functional/59_sort_test5.sy          |    0
 2026test/functional/60_sort_test6.sy          |    0
 2026test/functional/61_sort_test7.in          |    0
 2026test/functional/61_sort_test7.sy          |    0
 2026test/functional/62_percolation.in         |    0
 2026test/functional/62_percolation.sy         |    0
 2026test/functional/63_big_int_mul.sy         |    0
 2026test/functional/64_calculator.in          |    0
 2026test/functional/64_calculator.sy          |    0
 2026test/functional/65_color.in               |    0
 2026test/functional/65_color.sy               |    0
 2026test/functional/66_exgcd.sy               |    0
 2026test/functional/67_reverse_output.in      |    0
 2026test/functional/67_reverse_output.sy      |    0
 2026test/functional/68_brainfk.in             |    0
 2026test/functional/68_brainfk.sy             |    0
 2026test/functional/69_expr_eval.in           |    0
 2026test/functional/69_expr_eval.sy           |    0
 2026test/functional/70_dijkstra.in            |    0
 2026test/functional/70_dijkstra.sy            |    0
 2026test/functional/71_full_conn.in           |    0
 2026test/functional/71_full_conn.sy           |    0
 2026test/functional/72_hanoi.in               |    0
 2026test/functional/72_hanoi.sy               |    0
 2026test/functional/73_int_io.in              |    0
 2026test/functional/73_int_io.sy              |    0
 2026test/functional/74_kmp.in                 |    0
 2026test/functional/74_kmp.sy                 |    0
 2026test/functional/75_max_flow.in            |    0
 2026test/functional/75_max_flow.sy            |    0
 2026test/functional/76_n_queens.in            |    0
 2026test/functional/76_n_queens.sy            |    0
 2026test/functional/77_substr.sy              |    0
 2026test/functional/78_side_effect.sy         |    0
 2026test/functional/79_var_name.sy            |    0
 2026test/functional/80_chaos_token.sy         |    0
 2026test/functional/81_skip_spaces.in         |    0
 2026test/functional/81_skip_spaces.sy         |    0
 2026test/functional/82_long_func.sy           |    0
 2026test/functional/83_long_array.sy          |    0
 2026test/functional/84_long_array2.sy         |    0
 2026test/functional/85_long_code.sy           |    0
 2026test/functional/86_long_code2.sy          |    0
 2026test/functional/87_many_params.in         |    0
 2026test/functional/87_many_params.sy         |    0
 2026test/functional/88_many_params2.sy        |    0
 2026test/functional/89_many_globals.sy        |    0
 2026test/functional/90_many_locals.sy         |    0
 2026test/functional/91_many_locals2.in        |    0
 2026test/functional/91_many_locals2.sy        |    0
 2026test/functional/92_register_alloc.in      |    0
 2026test/functional/92_register_alloc.sy      |    0
 2026test/functional/93_nested_calls.in        |    0
 2026test/functional/93_nested_calls.sy        |    0
 2026test/functional/94_nested_loops.in        |    0
 2026test/functional/94_nested_loops.sy        |    0
 2026test/functional/95_float.in               |    0
 2026test/functional/95_float.sy               |    0
 2026test/functional/96_matrix_add.sy          |    0
 2026test/functional/97_matrix_sub.sy          |    0
 2026test/functional/98_matrix_mul.sy          |    0
 2026test/functional/99_matrix_tran.sy         |    0
 2026test/h_functional/00_comment2.sy          |    0
 2026test/h_functional/01_multiple_returns.sy  |    0
 2026test/h_functional/02_ret_in_block.sy      |    0
 2026test/h_functional/03_branch.sy            |    0
 2026test/h_functional/04_break_continue.sy    |    0
 2026test/h_functional/05_param_name.sy        |    0
 2026test/h_functional/06_func_name.sy         |    0
 2026test/h_functional/07_arr_init_nd.sy       |    0
 2026test/h_functional/08_global_arr_init.sy   |    0
 2026test/h_functional/09_BFS.in               |    0
 2026test/h_functional/09_BFS.sy               |    0
 2026test/h_functional/10_DFS.in               |    0
 2026test/h_functional/10_DFS.sy               |    0
 2026test/h_functional/11_BST.in               |    0
 2026test/h_functional/11_BST.sy               |    0
 2026test/h_functional/12_DSU.in               |    0
 2026test/h_functional/12_DSU.sy               |    0
 2026test/h_functional/13_LCA.in               |    0
 2026test/h_functional/13_LCA.sy               |    0
 2026test/h_functional/14_dp.in                |    0
 2026test/h_functional/14_dp.sy                |    0
 2026test/h_functional/15_graph_coloring.sy    |    0
 2026test/h_functional/16_k_smallest.in        |    0
 2026test/h_functional/16_k_smallest.sy        |    0
 2026test/h_functional/17_maximal_clique.in    |    0
 2026test/h_functional/17_maximal_clique.sy    |    0
 2026test/h_functional/18_prim.in              |    0
 2026test/h_functional/18_prim.sy              |    0
 2026test/h_functional/19_search.in            |    0
 2026test/h_functional/19_search.sy            |    0
 2026test/h_functional/20_sort.in              |    0
 2026test/h_functional/20_sort.sy              |    0
 2026test/h_functional/21_union_find.in        |    0
 2026test/h_functional/21_union_find.sy        |    0
 2026test/h_functional/22_matrix_multiply.in   |    0
 2026test/h_functional/22_matrix_multiply.sy   |    0
 2026test/h_functional/23_json.in              |    0
 2026test/h_functional/23_json.sy              |    0
 2026test/h_functional/24_array_only.in        |    0
 2026test/h_functional/24_array_only.sy        |    0
 2026test/h_functional/25_scope3.sy            |    0
 2026test/h_functional/26_scope4.sy            |    0
 2026test/h_functional/27_scope5.sy            |    0
 2026test/h_functional/28_side_effect2.sy      |    0
 2026test/h_functional/29_long_line.sy         |    0
 2026test/h_functional/30_many_dimensions.sy   |    0
 2026test/h_functional/31_many_indirections.sy |    0
 2026test/h_functional/32_many_params3.sy      |    0
 2026test/h_functional/33_multi_branch.in      |    0
 2026test/h_functional/33_multi_branch.sy      |    0
 2026test/h_functional/34_multi_loop.sy        |    0
 2026test/h_functional/35_math.in              |    0
 2026test/h_functional/35_math.sy              |    0
 2026test/h_functional/36_rotate.in            |    0
 2026test/h_functional/36_rotate.sy            |    0
 2026test/h_functional/37_dct.in               |    0
 2026test/h_functional/37_dct.sy               |    0
 2026test/h_functional/38_light2d.sy           |    0
 2026test/h_functional/39_fp_params.in         |    0
 2026test/h_functional/39_fp_params.sy         |    0
 2026test/performance/01_mm1.in                |    0
 2026test/performance/01_mm1.sy                |    0
 2026test/performance/01_mm2.in                |    0
 2026test/performance/01_mm2.sy                |    0
 2026test/performance/01_mm3.in                |    0
 2026test/performance/01_mm3.sy                |    0
 2026test/performance/03_sort1.in              |    0
 2026test/performance/03_sort1.sy              |    0
 2026test/performance/03_sort2.in              |    0
 2026test/performance/03_sort2.sy              |    0
 2026test/performance/03_sort3.in              |    0
 2026test/performance/03_sort3.sy              |    0
 2026test/performance/2025-LYY-59.in           |    0
 2026test/performance/2025-QMJ-23.in           |    0
 2026test/performance/2025-SPR-60.in           |    0
 2026test/performance/conv2d-1.in              |    0
 2026test/performance/conv2d-1.sy              |    0
 2026test/performance/conv2d-2.in              |    0
 2026test/performance/conv2d-2.sy              |    0
 2026test/performance/conv2d-3.in              |    0
 2026test/performance/conv2d-3.sy              |    0
 2026test/performance/crc1.in                  |    0
 2026test/performance/crc1.sy                  |    0
 2026test/performance/crc2.in                  |    0
 2026test/performance/crc2.sy                  |    0
 2026test/performance/crc3.in                  |    0
 2026test/performance/crc3.sy                  |    0
 2026test/performance/crypto-1.in              |    0
 2026test/performance/crypto-1.sy              |    0
 2026test/performance/crypto-2.in              |    0
 2026test/performance/crypto-2.sy              |    0
 2026test/performance/crypto-3.in              |    0
 2026test/performance/crypto-3.sy              |    0
 2026test/performance/fft0.in                  |    0
 2026test/performance/fft0.sy                  |    0
 2026test/performance/fft1.in                  |    0
 2026test/performance/fft1.sy                  |    0
 2026test/performance/fft2.in                  |    0
 2026test/performance/fft2.sy                  |    0
 2026test/performance/h-1-01.in                |    0
 2026test/performance/h-1-01.sy                |    0
 2026test/performance/h-1-02.in                |    0
 2026test/performance/h-1-02.sy                |    0
 2026test/performance/h-1-03.in                |    0
 2026test/performance/h-1-03.sy                |    0
 2026test/performance/h-10-01.in               |    0
 2026test/performance/h-10-01.sy               |    0
 2026test/performance/h-10-02.in               |    0
 2026test/performance/h-10-02.sy               |    0
 2026test/performance/h-10-03.in               |    0
 2026test/performance/h-10-03.sy               |    0
 2026test/performance/h-4-01.in                |    0
 2026test/performance/h-4-01.sy                |    0
 2026test/performance/h-4-02.in                |    0
 2026test/performance/h-4-02.sy                |    0
 2026test/performance/h-4-03.in                |    0
 2026test/performance/h-4-03.sy                |    0
 2026test/performance/h-5-01.in                |    0
 2026test/performance/h-5-01.sy                |    0
 2026test/performance/h-5-02.in                |    0
 2026test/performance/h-5-02.sy                |    0
 2026test/performance/h-5-03.in                |    0
 2026test/performance/h-5-03.sy                |    0
 2026test/performance/h-8-01.sy                |    0
 2026test/performance/h-8-02.sy                |    0
 2026test/performance/h-8-03.sy                |    0
 2026test/performance/h-9-01.in                |    0
 2026test/performance/h-9-01.sy                |    0
 2026test/performance/h-9-02.in                |    0
 2026test/performance/h-9-02.sy                |    0
 2026test/performance/h-9-03.in                |    0
 2026test/performance/h-9-03.sy                |    0
 2026test/performance/huffman-01.in            |    0
 2026test/performance/huffman-01.sy            |    0
 2026test/performance/huffman-02.in            |    0
 2026test/performance/huffman-02.sy            |    0
 2026test/performance/huffman-03.in            |    0
 2026test/performance/huffman-03.sy            |    0
 2026test/performance/knapsack_naive-1.in      |    0
 2026test/performance/knapsack_naive-1.sy      |    0
 2026test/performance/knapsack_naive-2.in      |    0
 2026test/performance/knapsack_naive-2.sy      |    0
 2026test/performance/knapsack_naive-3.in      |    0
 2026test/performance/knapsack_naive-3.sy      |    0
 2026test/performance/many_mat_cal-1.in        |    0
 2026test/performance/many_mat_cal-1.sy        |    0
 2026test/performance/many_mat_cal-2.in        |    0
 2026test/performance/many_mat_cal-2.sy        |    0
 2026test/performance/many_mat_cal-3.in        |    0
 2026test/performance/many_mat_cal-3.sy        |    0
 2026test/performance/matmul1.in               |    0
 2026test/performance/matmul1.sy               |    0
 2026test/performance/matmul2.in               |    0
 2026test/performance/matmul2.sy               |    0
 2026test/performance/matmul3.in               |    0
 2026test/performance/matmul3.sy               |    0
 .../performance/optimization_scheduling1.in   |    0
 .../performance/optimization_scheduling1.sy   |    0
 .../performance/optimization_scheduling2.in   |    0
 .../performance/optimization_scheduling2.sy   |    0
 .../performance/optimization_scheduling3.in   |    0
 .../performance/optimization_scheduling3.sy   |    0
 2026test/performance/shuffle0.in              |    0
 2026test/performance/shuffle0.sy              |    0
 2026test/performance/shuffle1.in              |    0
 2026test/performance/shuffle1.sy              |    0
 2026test/performance/shuffle2.in              |    0
 2026test/performance/shuffle2.sy              |    0
 2026test/performance/sl1.in                   |    0
 2026test/performance/sl1.sy                   |    0
 2026test/performance/sl2.in                   |    0
 2026test/performance/sl2.sy                   |    0
 2026test/performance/sl3.in                   |    0
 2026test/performance/sl3.sy                   |    0
 2026test/performance/transpose0.in            |    0
 2026test/performance/transpose0.sy            |    0
 2026test/performance/transpose1.in            |    0
 2026test/performance/transpose1.sy            |    0
 2026test/performance/transpose2.in            |    0
 2026test/performance/transpose2.sy            |    0
 CLAUDE.md                                     |   87 +
 copy_src.sh                                   |   15 +
 doc/LLVM-Loop-Block-分析报告.md           |  334 +++
 doc/LLVM-Loop-Fussion-分析报告.md         |  318 +++
 doc/LLVM-Loop-Interchange-分析报告.md     |  443 ++++
 doc/opt-cookbook-ai-loop-interchange.md       |  185 ++
 include/frontend/AntlrDriver.h                |   20 -
 include/frontend/SyntaxTreePrinter.h          |    9 -
 include/ir/IR.h                               |  545 -----
 include/irgen/IRGen.h                         |  122 --
 include/mir/MIR.h                             |  414 ----
 include/sem/Sema.h                            |   92 -
 include/sem/SymbolTable.h                     |   22 -
 include/utils/CLI.h                           |   15 -
 include/utils/Log.h                           |   20 -
 optimization-designs/.gitkeep                 |    1 +
 .../00-总览-优化全景.md                 |   82 +
 .../01-IR优化-Mem2Reg与SSA构造.md        |   48 +
 .../02-IR优化-循环优化.md               |   85 +
 .../03-IR优化-NEON自动向量化.md        |   77 +
 .../04-IR优化-标量优化Pass.md           |  101 +
 .../05-MIR优化-降级时优化.md           |   92 +
 .../06-MIR优化-寄存器分配前优化.md  |   72 +
 .../07-MIR优化-寄存器分配.md           |   74 +
 .../08-MIR优化-Peephole窥孔.md            |  105 +
 ...MIR优化-BlockLayout与PhysRegCopyProp.md |   75 +
 .../10-关键缺失与性能飞跃路径.md   |  189 ++
 .../live-range-splitting-splitkit.md          |   23 +
 .../regalloc-layer1-rewrite.md                |   30 +
 optimization-designs/优化记录.md          |  417 ++++
 src/include/ir/IR.h                           |   11 +-
 src/include/ir/analysis/AliasAnalysis.h       |   23 +
 src/include/ir/analysis/DominatorTree.h       |  127 ++
 src/include/ir/analysis/MemorySSA.h           |  165 ++
 src/include/ir/analysis/PostDominatorTree.h   |   65 +
 src/include/ir/analysis/ScalarEvolution.h     |  271 +++
 src/include/ir/passes/PassManager.h           |    4 +
 src/include/mir/GreedyAlloc.h                 |   12 +
 src/include/mir/LiveIntervals.h               |  177 ++
 src/include/mir/LiveRangeEdit.h               |  101 +
 src/include/mir/MachineRegisterInfo.h         |   66 +
 src/ir/Type.cpp                               |   19 +-
 src/ir/analysis/AliasAnalysis.cpp             |   89 +
 src/ir/analysis/MemorySSA.cpp                 |  541 +++++
 src/ir/analysis/PostDominatorTree.cpp         |  120 ++
 src/ir/analysis/ScalarEvolution.cpp           |  561 +++++
 src/ir/passes/CMakeLists.txt                  |    2 +
 src/ir/passes/DSE.cpp                         |  145 ++
 src/ir/passes/IRVerifier.cpp                  |  208 ++
 src/ir/passes/IfConversion.cpp                |  290 +++
 src/ir/passes/LoopInterchange.cpp             | 1128 ++++++++++
 src/ir/passes/LoopUnroll.cpp                  |  345 +++
 src/ir/passes/LoopVectorize.cpp               |  795 +++++++
 src/ir/passes/SCCP.cpp                        |  261 +++
 src/mir/GreedyAlloc.cpp                       | 1907 +++++++++++++++++
 src/mir/LiveIntervals.cpp                     |  719 +++++++
 src/mir/MIRVerifier.cpp                       |  337 +++
 src/mir/MachineRegisterInfo.cpp               |  270 +++
 src/mir/passes/CopyPropagation.cpp            |  295 +++
 src/mir/passes/FoldImm.cpp                    |  112 +
 src/mir/passes/LiveRangeSplit.cpp             |  192 ++
 src/mir/passes/MIRCleanup.cpp                 |  100 +
 src/mir/passes/PhiElimination.cpp             |  114 +
 src/mir/passes/PhysRegCopyProp.cpp            |  296 +++
 src/mir/passes/RegisterCoalescer.cpp          |  171 ++
 src/mir/passes/TailCallOpt.cpp                |   64 +
 src/mir/passes/TwoAddress.cpp                 |   84 +
 372 files changed, 13675 insertions(+), 1263 deletions(-)
 create mode 100755 2026test.sh
 mode change 100755 => 100644 2026test/functional/00_main.sy
 mode change 100755 => 100644 2026test/functional/01_var_defn2.sy
 mode change 100755 => 100644 2026test/functional/02_var_defn3.sy
 mode change 100755 => 100644 2026test/functional/03_arr_defn2.sy
 mode change 100755 => 100644 2026test/functional/04_arr_defn3.sy
 mode change 100755 => 100644 2026test/functional/05_arr_defn4.sy
 mode change 100755 => 100644 2026test/functional/06_const_var_defn2.sy
 mode change 100755 => 100644 2026test/functional/07_const_var_defn3.sy
 mode change 100755 => 100644 2026test/functional/08_const_array_defn.sy
 mode change 100755 => 100644 2026test/functional/09_func_defn.sy
 mode change 100755 => 100644 2026test/functional/10_var_defn_func.sy
 mode change 100755 => 100644 2026test/functional/11_add2.sy
 mode change 100755 => 100644 2026test/functional/12_addc.sy
 mode change 100755 => 100644 2026test/functional/13_sub2.sy
 mode change 100755 => 100644 2026test/functional/14_subc.sy
 mode change 100755 => 100644 2026test/functional/15_mul.sy
 mode change 100755 => 100644 2026test/functional/16_mulc.sy
 mode change 100755 => 100644 2026test/functional/17_div.sy
 mode change 100755 => 100644 2026test/functional/18_divc.sy
 mode change 100755 => 100644 2026test/functional/19_mod.sy
 mode change 100755 => 100644 2026test/functional/20_rem.sy
 mode change 100755 => 100644 2026test/functional/21_if_test2.sy
 mode change 100755 => 100644 2026test/functional/22_if_test3.sy
 mode change 100755 => 100644 2026test/functional/23_if_test4.sy
 mode change 100755 => 100644 2026test/functional/24_if_test5.sy
 mode change 100755 => 100644 2026test/functional/25_while_if.sy
 mode change 100755 => 100644 2026test/functional/26_while_test1.sy
 mode change 100755 => 100644 2026test/functional/27_while_test2.sy
 mode change 100755 => 100644 2026test/functional/28_while_test3.sy
 mode change 100755 => 100644 2026test/functional/29_break.sy
 mode change 100755 => 100644 2026test/functional/30_continue.sy
 mode change 100755 => 100644 2026test/functional/31_while_if_test1.sy
 mode change 100755 => 100644 2026test/functional/32_while_if_test2.sy
 mode change 100755 => 100644 2026test/functional/33_while_if_test3.sy
 mode change 100755 => 100644 2026test/functional/34_arr_expr_len.sy
 mode change 100755 => 100644 2026test/functional/35_op_priority1.sy
 mode change 100755 => 100644 2026test/functional/36_op_priority2.sy
 mode change 100755 => 100644 2026test/functional/37_op_priority3.sy
 mode change 100755 => 100644 2026test/functional/38_op_priority4.in
 mode change 100755 => 100644 2026test/functional/38_op_priority4.sy
 mode change 100755 => 100644 2026test/functional/39_op_priority5.sy
 mode change 100755 => 100644 2026test/functional/40_unary_op.sy
 mode change 100755 => 100644 2026test/functional/41_unary_op2.sy
 mode change 100755 => 100644 2026test/functional/42_empty_stmt.sy
 mode change 100755 => 100644 2026test/functional/43_logi_assign.in
 mode change 100755 => 100644 2026test/functional/43_logi_assign.sy
 mode change 100755 => 100644 2026test/functional/44_stmt_expr.sy
 mode change 100755 => 100644 2026test/functional/45_comment1.sy
 mode change 100755 => 100644 2026test/functional/46_hex_defn.sy
 mode change 100755 => 100644 2026test/functional/47_hex_oct_add.sy
 mode change 100755 => 100644 2026test/functional/48_assign_complex_expr.sy
 mode change 100755 => 100644 2026test/functional/49_if_complex_expr.sy
 mode change 100755 => 100644 2026test/functional/50_short_circuit.in
 mode change 100755 => 100644 2026test/functional/50_short_circuit.sy
 mode change 100755 => 100644 2026test/functional/51_short_circuit3.sy
 mode change 100755 => 100644 2026test/functional/52_scope.sy
 mode change 100755 => 100644 2026test/functional/53_scope2.sy
 mode change 100755 => 100644 2026test/functional/54_hidden_var.sy
 mode change 100755 => 100644 2026test/functional/55_sort_test1.sy
 mode change 100755 => 100644 2026test/functional/56_sort_test2.sy
 mode change 100755 => 100644 2026test/functional/57_sort_test3.sy
 mode change 100755 => 100644 2026test/functional/58_sort_test4.sy
 mode change 100755 => 100644 2026test/functional/59_sort_test5.sy
 mode change 100755 => 100644 2026test/functional/60_sort_test6.sy
 mode change 100755 => 100644 2026test/functional/61_sort_test7.in
 mode change 100755 => 100644 2026test/functional/61_sort_test7.sy
 mode change 100755 => 100644 2026test/functional/62_percolation.in
 mode change 100755 => 100644 2026test/functional/62_percolation.sy
 mode change 100755 => 100644 2026test/functional/63_big_int_mul.sy
 mode change 100755 => 100644 2026test/functional/64_calculator.in
 mode change 100755 => 100644 2026test/functional/64_calculator.sy
 mode change 100755 => 100644 2026test/functional/65_color.in
 mode change 100755 => 100644 2026test/functional/65_color.sy
 mode change 100755 => 100644 2026test/functional/66_exgcd.sy
 mode change 100755 => 100644 2026test/functional/67_reverse_output.in
 mode change 100755 => 100644 2026test/functional/67_reverse_output.sy
 mode change 100755 => 100644 2026test/functional/68_brainfk.in
 mode change 100755 => 100644 2026test/functional/68_brainfk.sy
 mode change 100755 => 100644 2026test/functional/69_expr_eval.in
 mode change 100755 => 100644 2026test/functional/69_expr_eval.sy
 mode change 100755 => 100644 2026test/functional/70_dijkstra.in
 mode change 100755 => 100644 2026test/functional/70_dijkstra.sy
 mode change 100755 => 100644 2026test/functional/71_full_conn.in
 mode change 100755 => 100644 2026test/functional/71_full_conn.sy
 mode change 100755 => 100644 2026test/functional/72_hanoi.in
 mode change 100755 => 100644 2026test/functional/72_hanoi.sy
 mode change 100755 => 100644 2026test/functional/73_int_io.in
 mode change 100755 => 100644 2026test/functional/73_int_io.sy
 mode change 100755 => 100644 2026test/functional/74_kmp.in
 mode change 100755 => 100644 2026test/functional/74_kmp.sy
 mode change 100755 => 100644 2026test/functional/75_max_flow.in
 mode change 100755 => 100644 2026test/functional/75_max_flow.sy
 mode change 100755 => 100644 2026test/functional/76_n_queens.in
 mode change 100755 => 100644 2026test/functional/76_n_queens.sy
 mode change 100755 => 100644 2026test/functional/77_substr.sy
 mode change 100755 => 100644 2026test/functional/78_side_effect.sy
 mode change 100755 => 100644 2026test/functional/79_var_name.sy
 mode change 100755 => 100644 2026test/functional/80_chaos_token.sy
 mode change 100755 => 100644 2026test/functional/81_skip_spaces.in
 mode change 100755 => 100644 2026test/functional/81_skip_spaces.sy
 mode change 100755 => 100644 2026test/functional/82_long_func.sy
 mode change 100755 => 100644 2026test/functional/83_long_array.sy
 mode change 100755 => 100644 2026test/functional/84_long_array2.sy
 mode change 100755 => 100644 2026test/functional/85_long_code.sy
 mode change 100755 => 100644 2026test/functional/86_long_code2.sy
 mode change 100755 => 100644 2026test/functional/87_many_params.in
 mode change 100755 => 100644 2026test/functional/87_many_params.sy
 mode change 100755 => 100644 2026test/functional/88_many_params2.sy
 mode change 100755 => 100644 2026test/functional/89_many_globals.sy
 mode change 100755 => 100644 2026test/functional/90_many_locals.sy
 mode change 100755 => 100644 2026test/functional/91_many_locals2.in
 mode change 100755 => 100644 2026test/functional/91_many_locals2.sy
 mode change 100755 => 100644 2026test/functional/92_register_alloc.in
 mode change 100755 => 100644 2026test/functional/92_register_alloc.sy
 mode change 100755 => 100644 2026test/functional/93_nested_calls.in
 mode change 100755 => 100644 2026test/functional/93_nested_calls.sy
 mode change 100755 => 100644 2026test/functional/94_nested_loops.in
 mode change 100755 => 100644 2026test/functional/94_nested_loops.sy
 mode change 100755 => 100644 2026test/functional/95_float.in
 mode change 100755 => 100644 2026test/functional/95_float.sy
 mode change 100755 => 100644 2026test/functional/96_matrix_add.sy
 mode change 100755 => 100644 2026test/functional/97_matrix_sub.sy
 mode change 100755 => 100644 2026test/functional/98_matrix_mul.sy
 mode change 100755 => 100644 2026test/functional/99_matrix_tran.sy
 mode change 100755 => 100644 2026test/h_functional/00_comment2.sy
 mode change 100755 => 100644 2026test/h_functional/01_multiple_returns.sy
 mode change 100755 => 100644 2026test/h_functional/02_ret_in_block.sy
 mode change 100755 => 100644 2026test/h_functional/03_branch.sy
 mode change 100755 => 100644 2026test/h_functional/04_break_continue.sy
 mode change 100755 => 100644 2026test/h_functional/05_param_name.sy
 mode change 100755 => 100644 2026test/h_functional/06_func_name.sy
 mode change 100755 => 100644 2026test/h_functional/07_arr_init_nd.sy
 mode change 100755 => 100644 2026test/h_functional/08_global_arr_init.sy
 mode change 100755 => 100644 2026test/h_functional/09_BFS.in
 mode change 100755 => 100644 2026test/h_functional/09_BFS.sy
 mode change 100755 => 100644 2026test/h_functional/10_DFS.in
 mode change 100755 => 100644 2026test/h_functional/10_DFS.sy
 mode change 100755 => 100644 2026test/h_functional/11_BST.in
 mode change 100755 => 100644 2026test/h_functional/11_BST.sy
 mode change 100755 => 100644 2026test/h_functional/12_DSU.in
 mode change 100755 => 100644 2026test/h_functional/12_DSU.sy
 mode change 100755 => 100644 2026test/h_functional/13_LCA.in
 mode change 100755 => 100644 2026test/h_functional/13_LCA.sy
 mode change 100755 => 100644 2026test/h_functional/14_dp.in
 mode change 100755 => 100644 2026test/h_functional/14_dp.sy
 mode change 100755 => 100644 2026test/h_functional/15_graph_coloring.sy
 mode change 100755 => 100644 2026test/h_functional/16_k_smallest.in
 mode change 100755 => 100644 2026test/h_functional/16_k_smallest.sy
 mode change 100755 => 100644 2026test/h_functional/17_maximal_clique.in
 mode change 100755 => 100644 2026test/h_functional/17_maximal_clique.sy
 mode change 100755 => 100644 2026test/h_functional/18_prim.in
 mode change 100755 => 100644 2026test/h_functional/18_prim.sy
 mode change 100755 => 100644 2026test/h_functional/19_search.in
 mode change 100755 => 100644 2026test/h_functional/19_search.sy
 mode change 100755 => 100644 2026test/h_functional/20_sort.in
 mode change 100755 => 100644 2026test/h_functional/20_sort.sy
 mode change 100755 => 100644 2026test/h_functional/21_union_find.in
 mode change 100755 => 100644 2026test/h_functional/21_union_find.sy
 mode change 100755 => 100644 2026test/h_functional/22_matrix_multiply.in
 mode change 100755 => 100644 2026test/h_functional/22_matrix_multiply.sy
 mode change 100755 => 100644 2026test/h_functional/23_json.in
 mode change 100755 => 100644 2026test/h_functional/23_json.sy
 mode change 100755 => 100644 2026test/h_functional/24_array_only.in
 mode change 100755 => 100644 2026test/h_functional/24_array_only.sy
 mode change 100755 => 100644 2026test/h_functional/25_scope3.sy
 mode change 100755 => 100644 2026test/h_functional/26_scope4.sy
 mode change 100755 => 100644 2026test/h_functional/27_scope5.sy
 mode change 100755 => 100644 2026test/h_functional/28_side_effect2.sy
 mode change 100755 => 100644 2026test/h_functional/29_long_line.sy
 mode change 100755 => 100644 2026test/h_functional/30_many_dimensions.sy
 mode change 100755 => 100644 2026test/h_functional/31_many_indirections.sy
 mode change 100755 => 100644 2026test/h_functional/32_many_params3.sy
 mode change 100755 => 100644 2026test/h_functional/33_multi_branch.in
 mode change 100755 => 100644 2026test/h_functional/33_multi_branch.sy
 mode change 100755 => 100644 2026test/h_functional/34_multi_loop.sy
 mode change 100755 => 100644 2026test/h_functional/35_math.in
 mode change 100755 => 100644 2026test/h_functional/35_math.sy
 mode change 100755 => 100644 2026test/h_functional/36_rotate.in
 mode change 100755 => 100644 2026test/h_functional/36_rotate.sy
 mode change 100755 => 100644 2026test/h_functional/37_dct.in
 mode change 100755 => 100644 2026test/h_functional/37_dct.sy
 mode change 100755 => 100644 2026test/h_functional/38_light2d.sy
 mode change 100755 => 100644 2026test/h_functional/39_fp_params.in
 mode change 100755 => 100644 2026test/h_functional/39_fp_params.sy
 mode change 100755 => 100644 2026test/performance/01_mm1.in
 mode change 100755 => 100644 2026test/performance/01_mm1.sy
 mode change 100755 => 100644 2026test/performance/01_mm2.in
 mode change 100755 => 100644 2026test/performance/01_mm2.sy
 mode change 100755 => 100644 2026test/performance/01_mm3.in
 mode change 100755 => 100644 2026test/performance/01_mm3.sy
 mode change 100755 => 100644 2026test/performance/03_sort1.in
 mode change 100755 => 100644 2026test/performance/03_sort1.sy
 mode change 100755 => 100644 2026test/performance/03_sort2.in
 mode change 100755 => 100644 2026test/performance/03_sort2.sy
 mode change 100755 => 100644 2026test/performance/03_sort3.in
 mode change 100755 => 100644 2026test/performance/03_sort3.sy
 mode change 100755 => 100644 2026test/performance/2025-LYY-59.in
 mode change 100755 => 100644 2026test/performance/2025-QMJ-23.in
 mode change 100755 => 100644 2026test/performance/2025-SPR-60.in
 mode change 100755 => 100644 2026test/performance/conv2d-1.in
 mode change 100755 => 100644 2026test/performance/conv2d-1.sy
 mode change 100755 => 100644 2026test/performance/conv2d-2.in
 mode change 100755 => 100644 2026test/performance/conv2d-2.sy
 mode change 100755 => 100644 2026test/performance/conv2d-3.in
 mode change 100755 => 100644 2026test/performance/conv2d-3.sy
 mode change 100755 => 100644 2026test/performance/crc1.in
 mode change 100755 => 100644 2026test/performance/crc1.sy
 mode change 100755 => 100644 2026test/performance/crc2.in
 mode change 100755 => 100644 2026test/performance/crc2.sy
 mode change 100755 => 100644 2026test/performance/crc3.in
 mode change 100755 => 100644 2026test/performance/crc3.sy
 mode change 100755 => 100644 2026test/performance/crypto-1.in
 mode change 100755 => 100644 2026test/performance/crypto-1.sy
 mode change 100755 => 100644 2026test/performance/crypto-2.in
 mode change 100755 => 100644 2026test/performance/crypto-2.sy
 mode change 100755 => 100644 2026test/performance/crypto-3.in
 mode change 100755 => 100644 2026test/performance/crypto-3.sy
 mode change 100755 => 100644 2026test/performance/fft0.in
 mode change 100755 => 100644 2026test/performance/fft0.sy
 mode change 100755 => 100644 2026test/performance/fft1.in
 mode change 100755 => 100644 2026test/performance/fft1.sy
 mode change 100755 => 100644 2026test/performance/fft2.in
 mode change 100755 => 100644 2026test/performance/fft2.sy
 mode change 100755 => 100644 2026test/performance/h-1-01.in
 mode change 100755 => 100644 2026test/performance/h-1-01.sy
 mode change 100755 => 100644 2026test/performance/h-1-02.in
 mode change 100755 => 100644 2026test/performance/h-1-02.sy
 mode change 100755 => 100644 2026test/performance/h-1-03.in
 mode change 100755 => 100644 2026test/performance/h-1-03.sy
 mode change 100755 => 100644 2026test/performance/h-10-01.in
 mode change 100755 => 100644 2026test/performance/h-10-01.sy
 mode change 100755 => 100644 2026test/performance/h-10-02.in
 mode change 100755 => 100644 2026test/performance/h-10-02.sy
 mode change 100755 => 100644 2026test/performance/h-10-03.in
 mode change 100755 => 100644 2026test/performance/h-10-03.sy
 mode change 100755 => 100644 2026test/performance/h-4-01.in
 mode change 100755 => 100644 2026test/performance/h-4-01.sy
 mode change 100755 => 100644 2026test/performance/h-4-02.in
 mode change 100755 => 100644 2026test/performance/h-4-02.sy
 mode change 100755 => 100644 2026test/performance/h-4-03.in
 mode change 100755 => 100644 2026test/performance/h-4-03.sy
 mode change 100755 => 100644 2026test/performance/h-5-01.in
 mode change 100755 => 100644 2026test/performance/h-5-01.sy
 mode change 100755 => 100644 2026test/performance/h-5-02.in
 mode change 100755 => 100644 2026test/performance/h-5-02.sy
 mode change 100755 => 100644 2026test/performance/h-5-03.in
 mode change 100755 => 100644 2026test/performance/h-5-03.sy
 mode change 100755 => 100644 2026test/performance/h-8-01.sy
 mode change 100755 => 100644 2026test/performance/h-8-02.sy
 mode change 100755 => 100644 2026test/performance/h-8-03.sy
 mode change 100755 => 100644 2026test/performance/h-9-01.in
 mode change 100755 => 100644 2026test/performance/h-9-01.sy
 mode change 100755 => 100644 2026test/performance/h-9-02.in
 mode change 100755 => 100644 2026test/performance/h-9-02.sy
 mode change 100755 => 100644 2026test/performance/h-9-03.in
 mode change 100755 => 100644 2026test/performance/h-9-03.sy
 mode change 100755 => 100644 2026test/performance/huffman-01.in
 mode change 100755 => 100644 2026test/performance/huffman-01.sy
 mode change 100755 => 100644 2026test/performance/huffman-02.in
 mode change 100755 => 100644 2026test/performance/huffman-02.sy
 mode change 100755 => 100644 2026test/performance/huffman-03.in
 mode change 100755 => 100644 2026test/performance/huffman-03.sy
 mode change 100755 => 100644 2026test/performance/knapsack_naive-1.in
 mode change 100755 => 100644 2026test/performance/knapsack_naive-1.sy
 mode change 100755 => 100644 2026test/performance/knapsack_naive-2.in
 mode change 100755 => 100644 2026test/performance/knapsack_naive-2.sy
 mode change 100755 => 100644 2026test/performance/knapsack_naive-3.in
 mode change 100755 => 100644 2026test/performance/knapsack_naive-3.sy
 mode change 100755 => 100644 2026test/performance/many_mat_cal-1.in
 mode change 100755 => 100644 2026test/performance/many_mat_cal-1.sy
 mode change 100755 => 100644 2026test/performance/many_mat_cal-2.in
 mode change 100755 => 100644 2026test/performance/many_mat_cal-2.sy
 mode change 100755 => 100644 2026test/performance/many_mat_cal-3.in
 mode change 100755 => 100644 2026test/performance/many_mat_cal-3.sy
 mode change 100755 => 100644 2026test/performance/matmul1.in
 mode change 100755 => 100644 2026test/performance/matmul1.sy
 mode change 100755 => 100644 2026test/performance/matmul2.in
 mode change 100755 => 100644 2026test/performance/matmul2.sy
 mode change 100755 => 100644 2026test/performance/matmul3.in
 mode change 100755 => 100644 2026test/performance/matmul3.sy
 mode change 100755 => 100644 2026test/performance/optimization_scheduling1.in
 mode change 100755 => 100644 2026test/performance/optimization_scheduling1.sy
 mode change 100755 => 100644 2026test/performance/optimization_scheduling2.in
 mode change 100755 => 100644 2026test/performance/optimization_scheduling2.sy
 mode change 100755 => 100644 2026test/performance/optimization_scheduling3.in
 mode change 100755 => 100644 2026test/performance/optimization_scheduling3.sy
 mode change 100755 => 100644 2026test/performance/shuffle0.in
 mode change 100755 => 100644 2026test/performance/shuffle0.sy
 mode change 100755 => 100644 2026test/performance/shuffle1.in
 mode change 100755 => 100644 2026test/performance/shuffle1.sy
 mode change 100755 => 100644 2026test/performance/shuffle2.in
 mode change 100755 => 100644 2026test/performance/shuffle2.sy
 mode change 100755 => 100644 2026test/performance/sl1.in
 mode change 100755 => 100644 2026test/performance/sl1.sy
 mode change 100755 => 100644 2026test/performance/sl2.in
 mode change 100755 => 100644 2026test/performance/sl2.sy
 mode change 100755 => 100644 2026test/performance/sl3.in
 mode change 100755 => 100644 2026test/performance/sl3.sy
 mode change 100755 => 100644 2026test/performance/transpose0.in
 mode change 100755 => 100644 2026test/performance/transpose0.sy
 mode change 100755 => 100644 2026test/performance/transpose1.in
 mode change 100755 => 100644 2026test/performance/transpose1.sy
 mode change 100755 => 100644 2026test/performance/transpose2.in
 mode change 100755 => 100644 2026test/performance/transpose2.sy
 create mode 100644 CLAUDE.md
 create mode 100755 copy_src.sh
 create mode 100644 doc/LLVM-Loop-Block-分析报告.md
 create mode 100644 doc/LLVM-Loop-Fussion-分析报告.md
 create mode 100644 doc/LLVM-Loop-Interchange-分析报告.md
 create mode 100644 doc/opt-cookbook-ai-loop-interchange.md
 delete mode 100644 include/frontend/AntlrDriver.h
 delete mode 100644 include/frontend/SyntaxTreePrinter.h
 delete mode 100644 include/ir/IR.h
 delete mode 100644 include/irgen/IRGen.h
 delete mode 100644 include/mir/MIR.h
 delete mode 100644 include/sem/Sema.h
 delete mode 100644 include/sem/SymbolTable.h
 delete mode 100644 include/utils/CLI.h
 delete mode 100644 include/utils/Log.h
 create mode 100644 optimization-designs/.gitkeep
 create mode 100644 optimization-designs/00-总览-优化全景.md
 create mode 100644 optimization-designs/01-IR优化-Mem2Reg与SSA构造.md
 create mode 100644 optimization-designs/02-IR优化-循环优化.md
 create mode 100644 optimization-designs/03-IR优化-NEON自动向量化.md
 create mode 100644 optimization-designs/04-IR优化-标量优化Pass.md
 create mode 100644 optimization-designs/05-MIR优化-降级时优化.md
 create mode 100644 optimization-designs/06-MIR优化-寄存器分配前优化.md
 create mode 100644 optimization-designs/07-MIR优化-寄存器分配.md
 create mode 100644 optimization-designs/08-MIR优化-Peephole窥孔.md
 create mode 100644 optimization-designs/09-MIR优化-BlockLayout与PhysRegCopyProp.md
 create mode 100644 optimization-designs/10-关键缺失与性能飞跃路径.md
 create mode 100644 optimization-designs/live-range-splitting-splitkit.md
 create mode 100644 optimization-designs/regalloc-layer1-rewrite.md
 create mode 100644 optimization-designs/优化记录.md
 create mode 100644 src/include/ir/analysis/AliasAnalysis.h
 create mode 100644 src/include/ir/analysis/DominatorTree.h
 create mode 100644 src/include/ir/analysis/MemorySSA.h
 create mode 100644 src/include/ir/analysis/PostDominatorTree.h
 create mode 100644 src/include/ir/analysis/ScalarEvolution.h
 create mode 100644 src/include/mir/GreedyAlloc.h
 create mode 100644 src/include/mir/LiveIntervals.h
 create mode 100644 src/include/mir/LiveRangeEdit.h
 create mode 100644 src/include/mir/MachineRegisterInfo.h
 create mode 100644 src/ir/analysis/AliasAnalysis.cpp
 create mode 100644 src/ir/analysis/MemorySSA.cpp
 create mode 100644 src/ir/analysis/PostDominatorTree.cpp
 create mode 100644 src/ir/analysis/ScalarEvolution.cpp
 create mode 100644 src/ir/passes/DSE.cpp
 create mode 100644 src/ir/passes/IRVerifier.cpp
 create mode 100644 src/ir/passes/IfConversion.cpp
 create mode 100644 src/ir/passes/LoopInterchange.cpp
 create mode 100644 src/ir/passes/LoopUnroll.cpp
 create mode 100644 src/ir/passes/LoopVectorize.cpp
 create mode 100644 src/ir/passes/SCCP.cpp
 create mode 100644 src/mir/GreedyAlloc.cpp
 create mode 100644 src/mir/LiveIntervals.cpp
 create mode 100644 src/mir/MIRVerifier.cpp
 create mode 100644 src/mir/MachineRegisterInfo.cpp
 create mode 100644 src/mir/passes/CopyPropagation.cpp
 create mode 100644 src/mir/passes/FoldImm.cpp
 create mode 100644 src/mir/passes/LiveRangeSplit.cpp
 create mode 100644 src/mir/passes/MIRCleanup.cpp
 create mode 100644 src/mir/passes/PhiElimination.cpp
 create mode 100644 src/mir/passes/PhysRegCopyProp.cpp
 create mode 100644 src/mir/passes/RegisterCoalescer.cpp
 create mode 100644 src/mir/passes/TailCallOpt.cpp
 create mode 100644 src/mir/passes/TwoAddress.cpp

diff --git a/2026test.sh b/2026test.sh
new file mode 100755
index 00000000..ed00b349
--- /dev/null
+++ b/2026test.sh
@@ -0,0 +1,639 @@
+#!/usr/bin/env bash
+set -u  # expanding an unset variable is an error
+set -o pipefail  # a pipeline fails when any of its commands fails
+
+RED='\033[0;31m'  # ANSI color escapes; printed through `echo -e`, which expands them
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+NC='\033[0m'  # resets the color
+
+TEST_ROOT="./2026test"  # directory searched for *.sy test cases
+OUTPUT_DIR="./2026test_results"  # default result directory (-o/--output-dir)
+COMPILER="./build/bin/compiler"  # compiler under test
+VERIFY_SCRIPT="./scripts/verify_asm.sh"  # not referenced anywhere else in this script
+BASELINE_FILE="./基线.txt"  # timing report written after the run
+
+MAX_CASES=0  # -n/--max; 0 means no limit
+STOP_ON_FIRST_FAILURE=false  # -x/--stop-on-fail
+START_FROM=1  # -s/--start-from, 1-based case number
+KEEP_OLD=false  # -k/--keep
+OPTIMIZE=true  # -O/--optimize and -O0/--no-optimize
+CATEGORY="all"  # -c/--category
+SKIP_LIST=""  # --skip, comma-separated case numbers
+VERBOSE=false  # -v/--verbose
+TIMEOUT_MS=300000  # -t/--timeout, in milliseconds
+
+total_time_sum=0  # sum of the run times (ms) of passing cases
+time_cases_count=0  # number of cases included in total_time_sum
+SUCCESS=0
+FAILED=0
+SKIPPED=0
+
+RUNTIME_OBJ="./build/test_runtime/sylib.o"  # compiled runtime library linked into every test
+
+show_help() {  # Print the usage text below to stdout; the quoted 'EOF' delimiter keeps the text literal (no expansion).
+  cat << 'EOF'
+用法: ./2026test.sh [选项]
+
+说明:
+  自动化执行 2026test 文件夹中的所有测试用例。
+  支持功能测试(functional)、隐含功能测试(h_functional)、性能测试(performance)。
+  使用编译器生成 AArch64 汇编，交叉编译链接后通过 qemu-aarch64 运行验证。
+  自动记录每个测试集的纯运行时间(qemu执行时间)并生成基线文件。
+  注意: 计时仅包含程序在qemu中的执行时间，不包含编译和汇编链接时间。
+
+选项:
+  -h, --help                显示此帮助信息
+  -n, --max N               最多运行 N 个测试用例 (0=不限制，默认: 0)
+  -s, --start-from N        从第 N 个测试用例开始 (默认: 1)
+  -x, --stop-on-fail        遇到第一个失败即停止
+  -k, --keep                保留旧输出目录，不删除
+  -O, --optimize            启用编译器优化 (默认启用)
+  -O0, --no-optimize        禁用编译器优化
+  -c, --category CAT        指定测试类别: functional|h_functional|performance|all (默认: all)
+  --skip N1,N2,...          跳过指定编号的测试用例 (逗号分隔)
+  -v, --verbose             显示详细输出
+  -o, --output-dir DIR      指定输出目录 (默认: ./2026test_results)
+  -t, --timeout MS          单个测试超时时间(毫秒) (默认: 300000)
+
+示例:
+  ./2026test.sh                                # 运行所有测试 (默认启用优化)
+  ./2026test.sh -c functional                  # 仅运行功能测试
+  ./2026test.sh -c performance                 # 仅运行性能测试
+  ./2026test.sh -n 10                          # 只运行前10个测试
+  ./2026test.sh -s 5                           # 从第5个测试开始
+  ./2026test.sh --skip 3,7,15                  # 跳过第3、7、15个测试
+  ./2026test.sh -c functional -n 5 -v          # 功能测试前5个，详细模式
+  ./2026test.sh -O0                            # 不启用优化
+  ./2026test.sh -x                             # 失败即停止
+  ./2026test.sh -c functional -s 10 -n 5       # 功能测试从第10个开始运行5个
+EOF
+}
+
+parse_args() {  # Parse the command line into the global settings; an option given without its value aborts with an explicit message.
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+      -h|--help)
+        show_help
+        exit 0
+        ;;
+      -n|--max)
+        MAX_CASES="${2?错误: 选项 $1 缺少参数}"
+        shift 2
+        ;;
+      -s|--start-from)
+        START_FROM="${2?错误: 选项 $1 缺少参数}"
+        shift 2
+        ;;
+      -x|--stop-on-fail)
+        STOP_ON_FIRST_FAILURE=true
+        shift
+        ;;
+      -k|--keep)
+        KEEP_OLD=true
+        shift
+        ;;
+      -O|--optimize)
+        OPTIMIZE=true
+        shift
+        ;;
+      -O0|--no-optimize)
+        OPTIMIZE=false
+        shift
+        ;;
+      -c|--category)
+        CATEGORY="${2?错误: 选项 $1 缺少参数}"
+        if [[ "$CATEGORY" != "functional" && "$CATEGORY" != "h_functional" && "$CATEGORY" != "performance" && "$CATEGORY" != "all" ]]; then
+          echo -e "${RED}错误: 类别必须是 functional|h_functional|performance|all${NC}"
+          exit 1
+        fi
+        shift 2
+        ;;
+      --skip)
+        SKIP_LIST="${2?错误: 选项 $1 缺少参数}"
+        shift 2
+        ;;
+      -v|--verbose)
+        VERBOSE=true
+        shift
+        ;;
+      -o|--output-dir)
+        OUTPUT_DIR="${2?错误: 选项 $1 缺少参数}"
+        shift 2
+        ;;
+      -t|--timeout)
+        TIMEOUT_MS="${2?错误: 选项 $1 缺少参数}"
+        shift 2
+        ;;
+      *)
+        echo -e "${RED}错误: 未知选项 $1${NC}"
+        show_help
+        exit 1
+        ;;
+    esac
+  done
+}
+
+parse_args "$@"
+
+check_prerequisites() {
+  # Check every external requirement up front. All problems are reported before
+  # the script exits with status 1, so a single run lists everything that is missing.
+  local missing=0 runtime_found=false found
+
+  if ! [[ -x "$COMPILER" ]]; then
+    echo -e "${RED}错误: 编译器不可执行: $COMPILER${NC}"
+    echo -e "${YELLOW}提示: 请先构建项目:${NC}"
+    echo -e "${YELLOW}  cmake -S . -B build -DCMAKE_BUILD_TYPE=Release${NC}"
+    echo -e "${YELLOW}  cmake --build build -j \"\$(nproc)\"${NC}"
+    missing=1
+  fi
+
+  if ! [[ -d "$TEST_ROOT" ]]; then
+    echo -e "${RED}错误: 测试目录不存在: $TEST_ROOT${NC}"
+    missing=1
+  fi
+
+  # Cross compiler used to assemble and link the generated assembly.
+  if ! command -v aarch64-linux-gnu-gcc &>/dev/null; then
+    echo -e "${RED}错误: 未找到 aarch64-linux-gnu-gcc，无法汇编/链接${NC}"
+    echo -e "${YELLOW}提示: sudo apt install gcc-aarch64-linux-gnu${NC}"
+    missing=1
+  fi
+
+  # Emulator used to run the linked executables.
+  if ! command -v qemu-aarch64 &>/dev/null; then
+    echo -e "${RED}错误: 未找到 qemu-aarch64，无法运行生成的可执行文件${NC}"
+    echo -e "${YELLOW}提示: sudo apt install qemu-user${NC}"
+    missing=1
+  fi
+
+  # The runtime source is accepted in three places: ./sylib/sylib.c, the file
+  # named by SYSY_RUNTIME, or any sylib.c in the tree outside ./build and
+  # ./.git (the find below is only run when the first two checks fail).
+  if [[ -f "./sylib/sylib.c" ]] || { [[ -n "${SYSY_RUNTIME:-}" ]] && [[ -f "$SYSY_RUNTIME" ]]; }; then
+    runtime_found=true
+  else
+    found=$(find . -path './build' -prune -o -path './.git' -prune -o -type f -name 'sylib.c' -print 2>/dev/null | head -n 1)
+    [[ -z "$found" ]] || runtime_found=true
+  fi
+
+  if [[ "$runtime_found" == "false" ]]; then
+    echo -e "${RED}错误: 未找到运行时库 sylib.c${NC}"
+    echo -e "${YELLOW}提示: 可通过环境变量 SYSY_RUNTIME 指定路径${NC}"
+    missing=1
+  fi
+
+  # Abort only after every check above has had the chance to print its message.
+  (( missing == 0 )) || exit 1
+}
+
+check_prerequisites
+
+if [[ ! "$START_FROM" =~ ^[0-9]+$ || "$START_FROM" -lt 1 ]]; then
+  echo -e "${RED}错误: --start-from 需要正整数${NC}"
+  exit 1
+fi
+
+if [[ ! "$MAX_CASES" =~ ^[0-9]+$ ]]; then
+  echo -e "${RED}错误: --max 需要非负整数${NC}"
+  exit 1
+fi
+
+# Set of case numbers to skip, built from the comma-separated --skip value.
+declare -A SKIP_SET
+if [[ "$SKIP_LIST" != "" ]]; then
+  IFS=',' read -ra SKIP_ITEMS <<< "$SKIP_LIST"
+  for item in "${SKIP_ITEMS[@]}"; do
+    # Piping through xargs trims surrounding whitespace; entries that are not plain integers are ignored.
+    item=$(echo "$item" | xargs)
+    [[ "$item" =~ ^[0-9]+$ ]] && SKIP_SET[$item]=1
+  done
+fi
+
+# Results of a previous run are wiped unless --keep was requested; the
+# directory itself is (re)created in either case.
+[[ "$KEEP_OLD" == "true" ]] || rm -rf "$OUTPUT_DIR"
+mkdir -p "$OUTPUT_DIR"
+
+LOG_FILE="$OUTPUT_DIR/2026test_batch.log"
+FAIL_FILE="$OUTPUT_DIR/failed_cases.txt"
+ERROR_LOG_FILE="$OUTPUT_DIR/error_log.txt"
+TIME_SUMMARY_FILE="$OUTPUT_DIR/time_summary.txt"
+DETAIL_TIME_FILE="$OUTPUT_DIR/detail_time.txt"
+
+if [[ "$KEEP_OLD" != "true" ]]; then
+  # Create/truncate every report file so this run starts from empty files;
+  # with --keep the existing files are left alone and only appended to.
+  for report_file in "$LOG_FILE" "$FAIL_FILE" "$ERROR_LOG_FILE" "$TIME_SUMMARY_FILE" "$DETAIL_TIME_FILE"; do
+    : > "$report_file"
+  done
+fi
+# Append a header describing this run's settings to the batch log.
+cat >> "$LOG_FILE" << EOF
+2026test 批量测试日志 - $(date '+%Y-%m-%d %H:%M:%S')
+TEST_ROOT=$TEST_ROOT
+OUTPUT_DIR=$OUTPUT_DIR
+CATEGORY=$CATEGORY
+OPTIMIZE=$OPTIMIZE
+MAX_CASES=$MAX_CASES
+START_FROM=$START_FROM
+SKIP_LIST=$SKIP_LIST
+================================================
+EOF
+
+collect_sy_files() {
+  # Emit the NUL-delimited paths of all *.sy files of the selected category.
+  # Each category directory is sorted on its own; missing directories are skipped.
+  local -a suites
+  case "$CATEGORY" in
+    all) suites=("functional" "h_functional" "performance") ;;
+    *)   suites=("$CATEGORY") ;;
+  esac
+  local suite suite_dir
+  for suite in "${suites[@]}"; do
+    suite_dir="$TEST_ROOT/$suite"
+    [[ -d "$suite_dir" ]] || continue
+    find "$suite_dir" -type f -name '*.sy' -print0 | sort -z
+  done
+}
+
+# Load the NUL-delimited case list into an array; nothing to do when it is empty.
+mapfile -d '' -t ALL_CASES < <(collect_sy_files)
+TOTAL_FOUND="${#ALL_CASES[@]}"
+if (( TOTAL_FOUND == 0 )); then
+  echo -e "${YELLOW}未找到任何 .sy 用例，请检查目录: $TEST_ROOT${NC}"
+  exit 0
+fi
+
+get_timestamp_ms() {  # Print epoch time in ms; if `date` fails or does not expand %3N to digits, fall back to whole seconds padded with 000.
+  local ts; ts=$(date +%s%3N 2>/dev/null) && [[ "$ts" =~ ^[0-9]+$ ]] && printf '%s\n' "$ts" || date +%s000
+}
+
+get_category_name() {
+  # Print the name of the directory that directly contains the given path,
+  # e.g. "functional/00_main.sy" -> "functional".
+  local parent_dir
+  parent_dir="$(dirname "$1")"
+  echo "$(basename "$parent_dir")"
+}
+
+find_runtime_src() {
+  # Print the path of the runtime source sylib.c and return 0; return 1 if none exists.
+  # Search order: $SYSY_RUNTIME (when set), a few well-known locations, then a
+  # search of the whole tree that skips ./build and ./.git.
+  local -a probes=()
+  [[ -n "${SYSY_RUNTIME:-}" ]] && probes+=("$SYSY_RUNTIME")
+  probes+=("./sylib/sylib.c" "./sylib.c" "./runtime/sylib.c" "./lib/sylib.c")
+
+  local path
+  for path in "${probes[@]}"; do
+    if [[ -f "$path" ]]; then
+      printf '%s\n' "$path"
+      return 0
+    fi
+  done
+
+  path=$(find . -path './build' -prune -o -path './.git' -prune -o -type f -name 'sylib.c' -print 2>/dev/null | head -n 1)
+  [[ -n "$path" ]] || return 1
+  printf '%s\n' "$path"
+}
+
+RUNTIME_SRC="$(find_runtime_src || true)"
+if [[ -z "$RUNTIME_SRC" ]]; then
+  echo -e "${RED}错误: 未找到运行时库源码 sylib.c${NC}"
+  exit 1
+fi
+# The runtime is compiled once and cached; it is rebuilt only when the source is newer than the object.
+runtime_cache_dir="./build/test_runtime"
+RUNTIME_OBJ="$runtime_cache_dir/sylib.o"
+mkdir -p "$runtime_cache_dir"
+# Stop here if the runtime does not compile: otherwise every test case would fail later at link time.
+if [[ ! -f "$RUNTIME_OBJ" ]] || [[ "$RUNTIME_SRC" -nt "$RUNTIME_OBJ" ]]; then
+  aarch64-linux-gnu-gcc -O2 -c "$RUNTIME_SRC" -o "$RUNTIME_OBJ" || { echo -e "${RED}错误: 运行时库编译失败: $RUNTIME_SRC${NC}"; exit 1; }
+fi
+
+# Banner: show the effective configuration before the first case runs.
+echo -e "${BLUE}========================================================${NC}"
+echo -e "${BLUE}       2026test 批量测试${NC}"
+echo -e "${BLUE}========================================================${NC}"
+echo -e "${BLUE}测试根目录:   $TEST_ROOT${NC}"
+echo -e "${BLUE}测试类别:     $CATEGORY${NC}"
+echo -e "${BLUE}找到用例数:   $TOTAL_FOUND${NC}"
+echo -e "${BLUE}输出目录:     $OUTPUT_DIR${NC}"
+echo -e "${BLUE}编译器优化:   $OPTIMIZE${NC}"
+echo -e "${BLUE}计时方式:     仅qemu运行时间(不含编译/汇编)${NC}"
+
+# Optional lines: printed only when the corresponding option is in effect.
+(( START_FROM > 1 )) && echo -e "${BLUE}起始用例:     $START_FROM${NC}"
+(( MAX_CASES > 0 )) && echo -e "${BLUE}最大用例数:   $MAX_CASES${NC}"
+[[ "${#SKIP_SET[@]}" -gt 0 ]] 2>/dev/null && echo -e "${BLUE}跳过编号:     ${!SKIP_SET[*]}${NC}"
+echo -e "${BLUE}基线文件:     $BASELINE_FILE${NC}"
+echo -e "${BLUE}========================================================${NC}"
+echo ""
+
+# Collects one "<category>/<case> <N>ms" entry per passing case.
+declare -a BASELINE_ENTRIES=()
+
+# TOTAL counts every case visited (it is the case number shown to the user);
+# EXECUTED counts only the cases that were actually run.
+TOTAL=0
+EXECUTED=0
+
+for file in "${ALL_CASES[@]}"; do  # One iteration per case: compile, link, run under qemu, compare the output, record the timing.
+  TOTAL=$((TOTAL + 1))
+  # TOTAL is the 1-based case number that --start-from and --skip refer to.
+  if [[ $TOTAL -lt $START_FROM ]]; then
+    continue
+  fi
+
+  if [[ $MAX_CASES -gt 0 && $EXECUTED -ge $MAX_CASES ]]; then
+    break
+  fi
+
+  if [[ ${SKIP_SET[$TOTAL]+_} ]]; then
+    SKIPPED=$((SKIPPED + 1))
+    rel_path="${file#$TEST_ROOT/}"
+    echo -e "${CYAN}[$TOTAL] $(basename "$file") ... 跳过${NC}"
+    echo "[SKIPPED] $file (user skip)" >> "$LOG_FILE"
+    continue
+  fi
+
+  EXECUTED=$((EXECUTED + 1))
+
+  rel_path="${file#$TEST_ROOT/}"
+  filename="$(basename "$file")"
+  base_name="${filename%.sy}"
+  rel_dir="$(dirname "$rel_path")"
+  input_dir="$TEST_ROOT/$rel_dir"
+  category_name=$(get_category_name "$rel_path")
+  case_out_dir="$OUTPUT_DIR/$rel_dir"
+
+  mkdir -p "$case_out_dir"
+  # Remove artifacts of an earlier run so stale files cannot be mistaken for fresh results.
+  rm -f "$case_out_dir/$base_name.s"
+  rm -f "$case_out_dir/$base_name.o"
+  rm -f "$case_out_dir/$base_name"
+  rm -f "$case_out_dir/$base_name.stdout"
+  rm -f "$case_out_dir/$base_name.actual.out"
+
+  if [[ "$VERBOSE" == "true" ]]; then
+    echo -e "${YELLOW}[$TOTAL] $category_name/$filename ... ${NC}"
+  else
+    echo -ne "${YELLOW}[$TOTAL] $category_name/$filename ... ${NC}"
+  fi
+
+  asm_file="$case_out_dir/$base_name.s"
+  exe="$case_out_dir/$base_name"
+  stdin_file="$input_dir/$base_name.in"
+  expected_file="$input_dir/$base_name.out"
+  stdout_file="$case_out_dir/$base_name.stdout"
+  actual_file="$case_out_dir/$base_name.actual.out"
+  timeout_s=$(awk -v ms="$TIMEOUT_MS" 'BEGIN { printf "%.3f", ms / 1000 }')  # --timeout is in ms, timeout(1) takes seconds; 0 or a non-number disables the limit
+  compile_ok=true
+
+  # errexit is never enabled by this script, so exit codes are captured with
+  # "|| var=$?" rather than by toggling "set +e"/"set -e" around each command.
+  compile_code=0
+  if [[ "$OPTIMIZE" == "true" ]]; then
+    "$COMPILER" -O --emit-asm "$file" > "$asm_file" 2>/dev/null || compile_code=$?
+  else
+    "$COMPILER" --emit-asm "$file" > "$asm_file" 2>/dev/null || compile_code=$?
+  fi
+
+  if [[ $compile_code -ne 0 ]]; then
+    compile_ok=false
+  fi
+
+  if $compile_ok; then
+    # Assemble and link the generated assembly against the cached runtime object.
+    link_code=0
+    aarch64-linux-gnu-gcc "$asm_file" "$RUNTIME_OBJ" -o "$exe" 2>/dev/null || link_code=$?
+
+    if [[ $link_code -ne 0 ]]; then
+      compile_ok=false
+    fi
+  fi
+
+  if ! $compile_ok; then
+    FAILED=$((FAILED + 1))
+    echo "$file" >> "$FAIL_FILE"
+    echo -e "${RED}编译/链接失败${NC}"
+    echo "[FAILED] $file (compile/link error)" >> "$LOG_FILE"
+    {
+      echo "========================================"
+      echo "测试失败: $file"
+      echo "原因: 编译或链接失败"
+      echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
+      echo "========================================"
+    } >> "$ERROR_LOG_FILE"
+
+    if [[ "$STOP_ON_FIRST_FAILURE" == "true" ]]; then
+      echo -e "${RED}========================================================${NC}"
+      echo -e "${RED}在第一个失败处停止测试${NC}"
+      echo -e "${RED}失败文件: $file${NC}"
+      echo -e "${RED}日志: $LOG_FILE${NC}"
+      echo -e "${RED}错误日志: $ERROR_LOG_FILE${NC}"
+      echo -e "${RED}========================================================${NC}"
+      break
+    fi
+    continue
+  fi
+
+  exec_start_ms=$(get_timestamp_ms)
+  exec_start_human=$(date '+%Y-%m-%d %H:%M:%S.%3N')
+
+  # Run under qemu with the --timeout limit; on expiry "timeout" kills the run and
+  # reports status 124. --foreground keeps Ctrl-C reaching the emulated program.
+  exit_status=0
+  if [[ -f "$stdin_file" ]]; then
+    timeout --foreground "$timeout_s" qemu-aarch64 -L /usr/aarch64-linux-gnu -s 104857600 "$exe" < "$stdin_file" > "$stdout_file" 2>/dev/null || exit_status=$?
+  else
+    timeout --foreground "$timeout_s" qemu-aarch64 -L /usr/aarch64-linux-gnu -s 104857600 "$exe" < /dev/null > "$stdout_file" 2>/dev/null || exit_status=$?
+  fi
+
+  exec_end_ms=$(get_timestamp_ms)
+  exec_end_human=$(date '+%Y-%m-%d %H:%M:%S.%3N')
+  exec_elapsed_ms=$((exec_end_ms - exec_start_ms))
+  # Comparison target: stdout (newline-terminated when non-empty) followed by the exit status on its own line.
+  {
+    cat "$stdout_file"
+    if [[ -s "$stdout_file" ]] && (( $(tail -c 1 "$stdout_file" | wc -l) == 0 )); then
+      printf '\n'
+    fi
+    printf '%s\n' "$exit_status"
+  } > "$actual_file"
+
+  output_ok=true
+  if [[ -f "$expected_file" ]]; then
+    if command -v python3 >/dev/null 2>&1; then
+      if ! python3 - "$expected_file" "$actual_file" <<'PY' >/dev/null 2>&1
+import sys
+from pathlib import Path
+
+def canon(path: str) -> bytes:
+    data = Path(path).read_bytes()
+    data = data.replace(b'\r\n', b'\n')
+    while data.endswith(b'\n'):
+        data = data[:-1]
+    lines = data.split(b'\n')
+    lines = [line.rstrip() for line in lines]
+    return b'\n'.join(lines)
+
+sys.exit(0 if canon(sys.argv[1]) == canon(sys.argv[2]) else 1)
+PY
+      then
+        output_ok=false
+      fi
+    else
+      local_expected="/tmp/_test_expected_$$"
+      local_actual="/tmp/_test_actual_$$"
+      tr -d '\r' < "$expected_file" > "$local_expected"
+      tr -d '\r' < "$actual_file" > "$local_actual"
+      if ! diff -u "$local_expected" "$local_actual" > /dev/null 2>&1; then
+        output_ok=false
+      fi
+      rm -f "$local_expected" "$local_actual"
+    fi
+  fi
+
+  baseline_entry="${category_name}/${base_name}"
+
+  if $output_ok; then
+    SUCCESS=$((SUCCESS + 1))
+
+    if [[ "$exec_elapsed_ms" =~ ^[0-9]+$ ]]; then
+      total_time_sum=$((total_time_sum + exec_elapsed_ms))
+      time_cases_count=$((time_cases_count + 1))
+    fi
+
+    if [[ "$VERBOSE" == "true" ]]; then
+      echo -e "  ${GREEN}成功${NC} | 开始: $exec_start_human | 结束: $exec_end_human | 运行: ${exec_elapsed_ms}ms"
+    else
+      echo -e "${GREEN}成功${NC} (${exec_elapsed_ms}ms)"
+    fi
+
+    echo "[SUCCESS] $file | start=$exec_start_human | end=$exec_end_human | exec=${exec_elapsed_ms}ms" >> "$LOG_FILE"
+    echo "$rel_path: ${exec_elapsed_ms}ms" >> "$TIME_SUMMARY_FILE"
+    echo "$baseline_entry | $exec_start_human | $exec_end_human | ${exec_elapsed_ms}ms" >> "$DETAIL_TIME_FILE"
+
+    BASELINE_ENTRIES+=("$baseline_entry ${exec_elapsed_ms}ms")
+
+  else
+    FAILED=$((FAILED + 1))
+    echo "$file" >> "$FAIL_FILE"
+
+    if [[ "$VERBOSE" == "true" ]]; then
+      echo -e "  ${RED}失败${NC} | 开始: $exec_start_human | 结束: $exec_end_human | 运行: ${exec_elapsed_ms}ms | 输出不匹配"
+    else
+      echo -e "${RED}失败${NC} (运行${exec_elapsed_ms}ms, 输出不匹配)"
+    fi
+
+    echo "[FAILED] $file (output mismatch) | start=$exec_start_human | end=$exec_end_human | exec=${exec_elapsed_ms}ms" >> "$LOG_FILE"
+
+    {
+      echo "========================================"
+      echo "测试失败: $file"
+      echo "原因: 输出不匹配"
+      echo "运行时间: ${exec_elapsed_ms}ms"
+      echo "开始时间: $exec_start_human"
+      echo "结束时间: $exec_end_human"
+      echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
+      echo "========================================"
+    } >> "$ERROR_LOG_FILE"
+
+    if [[ "$STOP_ON_FIRST_FAILURE" == "true" ]]; then
+      echo -e "${RED}========================================================${NC}"
+      echo -e "${RED}在第一个失败处停止测试${NC}"
+      echo -e "${RED}失败文件: $file${NC}"
+      echo -e "${RED}日志: $LOG_FILE${NC}"
+      echo -e "${RED}错误日志: $ERROR_LOG_FILE${NC}"
+      echo -e "${RED}========================================================${NC}"
+      break
+    fi
+  fi
+done
+
+{  # Everything printed inside these braces is written to the baseline report file, replacing its previous content.
+  echo ""
+  echo "========================================================"
+  echo "2026test 批量测试报告"
+  echo "========================================================"
+  echo "生成时间: $(date '+%Y-%m-%d %H:%M:%S')"
+  echo "测试类别: $CATEGORY"
+  echo "编译器优化: $OPTIMIZE"
+  echo "计时说明: 仅qemu运行时间(不含编译/汇编链接)"
+  echo "========================================================"
+  echo ""
+  echo "统计信息:"
+  echo "  总用例数: $TOTAL_FOUND"
+  echo "  执行用例: $EXECUTED"
+  echo "  成功: $SUCCESS"
+  echo "  失败: $FAILED"
+  echo "  跳过: $SKIPPED"
+  if [[ $EXECUTED -gt 0 ]]; then
+    local_rate=$(awk -v s="$SUCCESS" -v t="$EXECUTED" 'BEGIN { printf "%.2f", (s*100.0)/t }')
+    echo "  成功率: ${local_rate}%"
+  fi
+  echo ""
+  echo "--------------------------------------------------------"
+  printf "%-50s %15s\n" "测试集标识" "运行时长(ms)"
+  echo "--------------------------------------------------------"
+  for entry in "${BASELINE_ENTRIES[@]}"; do
+    local_name=${entry% *}  # entries look like "<category>/<case> <N>ms": keep the text before the last space
+    local_time=${entry##* }; local_time=${local_time%ms}  # text after the last space, without the "ms" unit
+    printf "%-50s %15s\n" "$local_name" "$local_time"
+  done
+  echo "--------------------------------------------------------"
+  if [[ $time_cases_count -gt 0 ]]; then
+    avg_time=$((total_time_sum / time_cases_count))
+    echo ""
+    echo "平均运行时间: ${avg_time}ms (基于 ${time_cases_count} 个成功用例)"
+    echo "总运行时间: ${total_time_sum}ms"
+  fi
+  echo ""
+  echo "========================================================"
+} > "$BASELINE_FILE"
+
+# Success rate over the executed cases; stays "0.00" when nothing was executed.
+RATE="0.00"
+(( EXECUTED > 0 )) && RATE=$(awk -v s="$SUCCESS" -v t="$EXECUTED" 'BEGIN { printf "%.2f", (s*100.0)/t }')
+
+echo ""
+echo -e "${BLUE}========================================================${NC}"
+echo -e "${BLUE}       2026test 批量测试完成${NC}"
+echo -e "${BLUE}========================================================${NC}"
+echo -e "${BLUE}总用例数:   $TOTAL_FOUND${NC}"
+echo -e "${BLUE}执行用例:   $EXECUTED${NC}"
+echo -e "${GREEN}成功:       $SUCCESS${NC}"
+echo -e "${RED}失败:       $FAILED${NC}"
+echo -e "${CYAN}跳过:       $SKIPPED${NC}"
+echo -e "${BLUE}成功率:     ${RATE}%${NC}"
+
+if (( time_cases_count > 0 )); then
+  avg_time=$((total_time_sum / time_cases_count))
+  echo -e "${BLUE}平均运行时间: ${avg_time}ms (基于 ${time_cases_count} 个成功用例)${NC}"
+  echo -e "${BLUE}总运行时间: ${total_time_sum}ms${NC}"
+fi
+
+echo ""
+echo -e "${BLUE}基线文件:   $BASELINE_FILE${NC}"
+echo -e "${BLUE}日志文件:   $LOG_FILE${NC}"
+echo -e "${BLUE}时间汇总:   $TIME_SUMMARY_FILE${NC}"
+echo -e "${BLUE}详细时间:   $DETAIL_TIME_FILE${NC}"
+
+# The failure list and the error log are only mentioned when something failed.
+if (( FAILED > 0 )); then
+  echo -e "${RED}失败清单:   $FAIL_FILE${NC}"
+  echo -e "${RED}错误日志:   $ERROR_LOG_FILE${NC}"
+fi
+
+echo -e "${BLUE}========================================================${NC}"
+
+# Exit status: 1 when at least one case failed, 0 otherwise.
+if (( FAILED > 0 )); then
+  exit 1
+fi
+exit 0
diff --git a/2026test/functional/00_main.sy b/2026test/functional/00_main.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/01_var_defn2.sy b/2026test/functional/01_var_defn2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/02_var_defn3.sy b/2026test/functional/02_var_defn3.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/03_arr_defn2.sy b/2026test/functional/03_arr_defn2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/04_arr_defn3.sy b/2026test/functional/04_arr_defn3.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/05_arr_defn4.sy b/2026test/functional/05_arr_defn4.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/06_const_var_defn2.sy b/2026test/functional/06_const_var_defn2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/07_const_var_defn3.sy b/2026test/functional/07_const_var_defn3.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/08_const_array_defn.sy b/2026test/functional/08_const_array_defn.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/09_func_defn.sy b/2026test/functional/09_func_defn.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/10_var_defn_func.sy b/2026test/functional/10_var_defn_func.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/11_add2.sy b/2026test/functional/11_add2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/12_addc.sy b/2026test/functional/12_addc.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/13_sub2.sy b/2026test/functional/13_sub2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/14_subc.sy b/2026test/functional/14_subc.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/15_mul.sy b/2026test/functional/15_mul.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/16_mulc.sy b/2026test/functional/16_mulc.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/17_div.sy b/2026test/functional/17_div.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/18_divc.sy b/2026test/functional/18_divc.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/19_mod.sy b/2026test/functional/19_mod.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/20_rem.sy b/2026test/functional/20_rem.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/21_if_test2.sy b/2026test/functional/21_if_test2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/22_if_test3.sy b/2026test/functional/22_if_test3.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/23_if_test4.sy b/2026test/functional/23_if_test4.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/24_if_test5.sy b/2026test/functional/24_if_test5.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/25_while_if.sy b/2026test/functional/25_while_if.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/26_while_test1.sy b/2026test/functional/26_while_test1.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/27_while_test2.sy b/2026test/functional/27_while_test2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/28_while_test3.sy b/2026test/functional/28_while_test3.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/29_break.sy b/2026test/functional/29_break.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/30_continue.sy b/2026test/functional/30_continue.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/31_while_if_test1.sy b/2026test/functional/31_while_if_test1.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/32_while_if_test2.sy b/2026test/functional/32_while_if_test2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/33_while_if_test3.sy b/2026test/functional/33_while_if_test3.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/34_arr_expr_len.sy b/2026test/functional/34_arr_expr_len.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/35_op_priority1.sy b/2026test/functional/35_op_priority1.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/36_op_priority2.sy b/2026test/functional/36_op_priority2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/37_op_priority3.sy b/2026test/functional/37_op_priority3.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/38_op_priority4.in b/2026test/functional/38_op_priority4.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/38_op_priority4.sy b/2026test/functional/38_op_priority4.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/39_op_priority5.sy b/2026test/functional/39_op_priority5.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/40_unary_op.sy b/2026test/functional/40_unary_op.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/41_unary_op2.sy b/2026test/functional/41_unary_op2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/42_empty_stmt.sy b/2026test/functional/42_empty_stmt.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/43_logi_assign.in b/2026test/functional/43_logi_assign.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/43_logi_assign.sy b/2026test/functional/43_logi_assign.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/44_stmt_expr.sy b/2026test/functional/44_stmt_expr.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/45_comment1.sy b/2026test/functional/45_comment1.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/46_hex_defn.sy b/2026test/functional/46_hex_defn.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/47_hex_oct_add.sy b/2026test/functional/47_hex_oct_add.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/48_assign_complex_expr.sy b/2026test/functional/48_assign_complex_expr.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/49_if_complex_expr.sy b/2026test/functional/49_if_complex_expr.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/50_short_circuit.in b/2026test/functional/50_short_circuit.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/50_short_circuit.sy b/2026test/functional/50_short_circuit.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/51_short_circuit3.sy b/2026test/functional/51_short_circuit3.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/52_scope.sy b/2026test/functional/52_scope.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/53_scope2.sy b/2026test/functional/53_scope2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/54_hidden_var.sy b/2026test/functional/54_hidden_var.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/55_sort_test1.sy b/2026test/functional/55_sort_test1.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/56_sort_test2.sy b/2026test/functional/56_sort_test2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/57_sort_test3.sy b/2026test/functional/57_sort_test3.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/58_sort_test4.sy b/2026test/functional/58_sort_test4.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/59_sort_test5.sy b/2026test/functional/59_sort_test5.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/60_sort_test6.sy b/2026test/functional/60_sort_test6.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/61_sort_test7.in b/2026test/functional/61_sort_test7.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/61_sort_test7.sy b/2026test/functional/61_sort_test7.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/62_percolation.in b/2026test/functional/62_percolation.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/62_percolation.sy b/2026test/functional/62_percolation.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/63_big_int_mul.sy b/2026test/functional/63_big_int_mul.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/64_calculator.in b/2026test/functional/64_calculator.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/64_calculator.sy b/2026test/functional/64_calculator.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/65_color.in b/2026test/functional/65_color.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/65_color.sy b/2026test/functional/65_color.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/66_exgcd.sy b/2026test/functional/66_exgcd.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/67_reverse_output.in b/2026test/functional/67_reverse_output.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/67_reverse_output.sy b/2026test/functional/67_reverse_output.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/68_brainfk.in b/2026test/functional/68_brainfk.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/68_brainfk.sy b/2026test/functional/68_brainfk.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/69_expr_eval.in b/2026test/functional/69_expr_eval.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/69_expr_eval.sy b/2026test/functional/69_expr_eval.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/70_dijkstra.in b/2026test/functional/70_dijkstra.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/70_dijkstra.sy b/2026test/functional/70_dijkstra.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/71_full_conn.in b/2026test/functional/71_full_conn.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/71_full_conn.sy b/2026test/functional/71_full_conn.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/72_hanoi.in b/2026test/functional/72_hanoi.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/72_hanoi.sy b/2026test/functional/72_hanoi.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/73_int_io.in b/2026test/functional/73_int_io.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/73_int_io.sy b/2026test/functional/73_int_io.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/74_kmp.in b/2026test/functional/74_kmp.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/74_kmp.sy b/2026test/functional/74_kmp.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/75_max_flow.in b/2026test/functional/75_max_flow.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/75_max_flow.sy b/2026test/functional/75_max_flow.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/76_n_queens.in b/2026test/functional/76_n_queens.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/76_n_queens.sy b/2026test/functional/76_n_queens.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/77_substr.sy b/2026test/functional/77_substr.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/78_side_effect.sy b/2026test/functional/78_side_effect.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/79_var_name.sy b/2026test/functional/79_var_name.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/80_chaos_token.sy b/2026test/functional/80_chaos_token.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/81_skip_spaces.in b/2026test/functional/81_skip_spaces.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/81_skip_spaces.sy b/2026test/functional/81_skip_spaces.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/82_long_func.sy b/2026test/functional/82_long_func.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/83_long_array.sy b/2026test/functional/83_long_array.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/84_long_array2.sy b/2026test/functional/84_long_array2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/85_long_code.sy b/2026test/functional/85_long_code.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/86_long_code2.sy b/2026test/functional/86_long_code2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/87_many_params.in b/2026test/functional/87_many_params.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/87_many_params.sy b/2026test/functional/87_many_params.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/88_many_params2.sy b/2026test/functional/88_many_params2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/89_many_globals.sy b/2026test/functional/89_many_globals.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/90_many_locals.sy b/2026test/functional/90_many_locals.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/91_many_locals2.in b/2026test/functional/91_many_locals2.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/91_many_locals2.sy b/2026test/functional/91_many_locals2.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/92_register_alloc.in b/2026test/functional/92_register_alloc.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/92_register_alloc.sy b/2026test/functional/92_register_alloc.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/93_nested_calls.in b/2026test/functional/93_nested_calls.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/93_nested_calls.sy b/2026test/functional/93_nested_calls.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/94_nested_loops.in b/2026test/functional/94_nested_loops.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/94_nested_loops.sy b/2026test/functional/94_nested_loops.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/95_float.in b/2026test/functional/95_float.in
old mode 100755
new mode 100644
diff --git a/2026test/functional/95_float.sy b/2026test/functional/95_float.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/96_matrix_add.sy b/2026test/functional/96_matrix_add.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/97_matrix_sub.sy b/2026test/functional/97_matrix_sub.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/98_matrix_mul.sy b/2026test/functional/98_matrix_mul.sy
old mode 100755
new mode 100644
diff --git a/2026test/functional/99_matrix_tran.sy b/2026test/functional/99_matrix_tran.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/00_comment2.sy b/2026test/h_functional/00_comment2.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/01_multiple_returns.sy b/2026test/h_functional/01_multiple_returns.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/02_ret_in_block.sy b/2026test/h_functional/02_ret_in_block.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/03_branch.sy b/2026test/h_functional/03_branch.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/04_break_continue.sy b/2026test/h_functional/04_break_continue.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/05_param_name.sy b/2026test/h_functional/05_param_name.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/06_func_name.sy b/2026test/h_functional/06_func_name.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/07_arr_init_nd.sy b/2026test/h_functional/07_arr_init_nd.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/08_global_arr_init.sy b/2026test/h_functional/08_global_arr_init.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/09_BFS.in b/2026test/h_functional/09_BFS.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/09_BFS.sy b/2026test/h_functional/09_BFS.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/10_DFS.in b/2026test/h_functional/10_DFS.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/10_DFS.sy b/2026test/h_functional/10_DFS.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/11_BST.in b/2026test/h_functional/11_BST.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/11_BST.sy b/2026test/h_functional/11_BST.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/12_DSU.in b/2026test/h_functional/12_DSU.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/12_DSU.sy b/2026test/h_functional/12_DSU.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/13_LCA.in b/2026test/h_functional/13_LCA.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/13_LCA.sy b/2026test/h_functional/13_LCA.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/14_dp.in b/2026test/h_functional/14_dp.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/14_dp.sy b/2026test/h_functional/14_dp.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/15_graph_coloring.sy b/2026test/h_functional/15_graph_coloring.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/16_k_smallest.in b/2026test/h_functional/16_k_smallest.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/16_k_smallest.sy b/2026test/h_functional/16_k_smallest.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/17_maximal_clique.in b/2026test/h_functional/17_maximal_clique.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/17_maximal_clique.sy b/2026test/h_functional/17_maximal_clique.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/18_prim.in b/2026test/h_functional/18_prim.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/18_prim.sy b/2026test/h_functional/18_prim.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/19_search.in b/2026test/h_functional/19_search.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/19_search.sy b/2026test/h_functional/19_search.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/20_sort.in b/2026test/h_functional/20_sort.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/20_sort.sy b/2026test/h_functional/20_sort.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/21_union_find.in b/2026test/h_functional/21_union_find.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/21_union_find.sy b/2026test/h_functional/21_union_find.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/22_matrix_multiply.in b/2026test/h_functional/22_matrix_multiply.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/22_matrix_multiply.sy b/2026test/h_functional/22_matrix_multiply.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/23_json.in b/2026test/h_functional/23_json.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/23_json.sy b/2026test/h_functional/23_json.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/24_array_only.in b/2026test/h_functional/24_array_only.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/24_array_only.sy b/2026test/h_functional/24_array_only.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/25_scope3.sy b/2026test/h_functional/25_scope3.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/26_scope4.sy b/2026test/h_functional/26_scope4.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/27_scope5.sy b/2026test/h_functional/27_scope5.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/28_side_effect2.sy b/2026test/h_functional/28_side_effect2.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/29_long_line.sy b/2026test/h_functional/29_long_line.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/30_many_dimensions.sy b/2026test/h_functional/30_many_dimensions.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/31_many_indirections.sy b/2026test/h_functional/31_many_indirections.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/32_many_params3.sy b/2026test/h_functional/32_many_params3.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/33_multi_branch.in b/2026test/h_functional/33_multi_branch.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/33_multi_branch.sy b/2026test/h_functional/33_multi_branch.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/34_multi_loop.sy b/2026test/h_functional/34_multi_loop.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/35_math.in b/2026test/h_functional/35_math.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/35_math.sy b/2026test/h_functional/35_math.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/36_rotate.in b/2026test/h_functional/36_rotate.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/36_rotate.sy b/2026test/h_functional/36_rotate.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/37_dct.in b/2026test/h_functional/37_dct.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/37_dct.sy b/2026test/h_functional/37_dct.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/38_light2d.sy b/2026test/h_functional/38_light2d.sy
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/39_fp_params.in b/2026test/h_functional/39_fp_params.in
old mode 100755
new mode 100644
diff --git a/2026test/h_functional/39_fp_params.sy b/2026test/h_functional/39_fp_params.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/01_mm1.in b/2026test/performance/01_mm1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/01_mm1.sy b/2026test/performance/01_mm1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/01_mm2.in b/2026test/performance/01_mm2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/01_mm2.sy b/2026test/performance/01_mm2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/01_mm3.in b/2026test/performance/01_mm3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/01_mm3.sy b/2026test/performance/01_mm3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/03_sort1.in b/2026test/performance/03_sort1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/03_sort1.sy b/2026test/performance/03_sort1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/03_sort2.in b/2026test/performance/03_sort2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/03_sort2.sy b/2026test/performance/03_sort2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/03_sort3.in b/2026test/performance/03_sort3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/03_sort3.sy b/2026test/performance/03_sort3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/2025-LYY-59.in b/2026test/performance/2025-LYY-59.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/2025-QMJ-23.in b/2026test/performance/2025-QMJ-23.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/2025-SPR-60.in b/2026test/performance/2025-SPR-60.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/conv2d-1.in b/2026test/performance/conv2d-1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/conv2d-1.sy b/2026test/performance/conv2d-1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/conv2d-2.in b/2026test/performance/conv2d-2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/conv2d-2.sy b/2026test/performance/conv2d-2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/conv2d-3.in b/2026test/performance/conv2d-3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/conv2d-3.sy b/2026test/performance/conv2d-3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/crc1.in b/2026test/performance/crc1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/crc1.sy b/2026test/performance/crc1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/crc2.in b/2026test/performance/crc2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/crc2.sy b/2026test/performance/crc2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/crc3.in b/2026test/performance/crc3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/crc3.sy b/2026test/performance/crc3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/crypto-1.in b/2026test/performance/crypto-1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/crypto-1.sy b/2026test/performance/crypto-1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/crypto-2.in b/2026test/performance/crypto-2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/crypto-2.sy b/2026test/performance/crypto-2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/crypto-3.in b/2026test/performance/crypto-3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/crypto-3.sy b/2026test/performance/crypto-3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/fft0.in b/2026test/performance/fft0.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/fft0.sy b/2026test/performance/fft0.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/fft1.in b/2026test/performance/fft1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/fft1.sy b/2026test/performance/fft1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/fft2.in b/2026test/performance/fft2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/fft2.sy b/2026test/performance/fft2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-1-01.in b/2026test/performance/h-1-01.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-1-01.sy b/2026test/performance/h-1-01.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-1-02.in b/2026test/performance/h-1-02.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-1-02.sy b/2026test/performance/h-1-02.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-1-03.in b/2026test/performance/h-1-03.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-1-03.sy b/2026test/performance/h-1-03.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-10-01.in b/2026test/performance/h-10-01.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-10-01.sy b/2026test/performance/h-10-01.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-10-02.in b/2026test/performance/h-10-02.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-10-02.sy b/2026test/performance/h-10-02.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-10-03.in b/2026test/performance/h-10-03.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-10-03.sy b/2026test/performance/h-10-03.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-4-01.in b/2026test/performance/h-4-01.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-4-01.sy b/2026test/performance/h-4-01.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-4-02.in b/2026test/performance/h-4-02.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-4-02.sy b/2026test/performance/h-4-02.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-4-03.in b/2026test/performance/h-4-03.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-4-03.sy b/2026test/performance/h-4-03.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-5-01.in b/2026test/performance/h-5-01.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-5-01.sy b/2026test/performance/h-5-01.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-5-02.in b/2026test/performance/h-5-02.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-5-02.sy b/2026test/performance/h-5-02.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-5-03.in b/2026test/performance/h-5-03.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-5-03.sy b/2026test/performance/h-5-03.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-8-01.sy b/2026test/performance/h-8-01.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-8-02.sy b/2026test/performance/h-8-02.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-8-03.sy b/2026test/performance/h-8-03.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-9-01.in b/2026test/performance/h-9-01.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-9-01.sy b/2026test/performance/h-9-01.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-9-02.in b/2026test/performance/h-9-02.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-9-02.sy b/2026test/performance/h-9-02.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-9-03.in b/2026test/performance/h-9-03.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/h-9-03.sy b/2026test/performance/h-9-03.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/huffman-01.in b/2026test/performance/huffman-01.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/huffman-01.sy b/2026test/performance/huffman-01.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/huffman-02.in b/2026test/performance/huffman-02.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/huffman-02.sy b/2026test/performance/huffman-02.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/huffman-03.in b/2026test/performance/huffman-03.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/huffman-03.sy b/2026test/performance/huffman-03.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/knapsack_naive-1.in b/2026test/performance/knapsack_naive-1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/knapsack_naive-1.sy b/2026test/performance/knapsack_naive-1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/knapsack_naive-2.in b/2026test/performance/knapsack_naive-2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/knapsack_naive-2.sy b/2026test/performance/knapsack_naive-2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/knapsack_naive-3.in b/2026test/performance/knapsack_naive-3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/knapsack_naive-3.sy b/2026test/performance/knapsack_naive-3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/many_mat_cal-1.in b/2026test/performance/many_mat_cal-1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/many_mat_cal-1.sy b/2026test/performance/many_mat_cal-1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/many_mat_cal-2.in b/2026test/performance/many_mat_cal-2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/many_mat_cal-2.sy b/2026test/performance/many_mat_cal-2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/many_mat_cal-3.in b/2026test/performance/many_mat_cal-3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/many_mat_cal-3.sy b/2026test/performance/many_mat_cal-3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/matmul1.in b/2026test/performance/matmul1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/matmul1.sy b/2026test/performance/matmul1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/matmul2.in b/2026test/performance/matmul2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/matmul2.sy b/2026test/performance/matmul2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/matmul3.in b/2026test/performance/matmul3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/matmul3.sy b/2026test/performance/matmul3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/optimization_scheduling1.in b/2026test/performance/optimization_scheduling1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/optimization_scheduling1.sy b/2026test/performance/optimization_scheduling1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/optimization_scheduling2.in b/2026test/performance/optimization_scheduling2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/optimization_scheduling2.sy b/2026test/performance/optimization_scheduling2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/optimization_scheduling3.in b/2026test/performance/optimization_scheduling3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/optimization_scheduling3.sy b/2026test/performance/optimization_scheduling3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/shuffle0.in b/2026test/performance/shuffle0.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/shuffle0.sy b/2026test/performance/shuffle0.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/shuffle1.in b/2026test/performance/shuffle1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/shuffle1.sy b/2026test/performance/shuffle1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/shuffle2.in b/2026test/performance/shuffle2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/shuffle2.sy b/2026test/performance/shuffle2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/sl1.in b/2026test/performance/sl1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/sl1.sy b/2026test/performance/sl1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/sl2.in b/2026test/performance/sl2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/sl2.sy b/2026test/performance/sl2.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/sl3.in b/2026test/performance/sl3.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/sl3.sy b/2026test/performance/sl3.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/transpose0.in b/2026test/performance/transpose0.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/transpose0.sy b/2026test/performance/transpose0.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/transpose1.in b/2026test/performance/transpose1.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/transpose1.sy b/2026test/performance/transpose1.sy
old mode 100755
new mode 100644
diff --git a/2026test/performance/transpose2.in b/2026test/performance/transpose2.in
old mode 100755
new mode 100644
diff --git a/2026test/performance/transpose2.sy b/2026test/performance/transpose2.sy
old mode 100755
new mode 100644
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 00000000..f3231dda
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,87 @@
+# CLAUDE.md
+
+SysY → ARM64/AArch64 编译器，CMake + C++17 + ANTLR 4.13.2。2026 编译系统设计赛（华为毕昇杯）ARM 赛道。
+
+## 构建
+
+```bash
+# 首次：生成 ANTLR Lexer/Parser
+mkdir -p build/generated/antlr4
+java -jar third_party/antlr-4.13.2-complete.jar -Dlanguage=Cpp -visitor -no-listener \
+  -Xexact-output-dir -o build/generated/antlr4 src/antlr4/SysY.g4
+
+# 全量构建
+cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCOMPILER_PARSE_ONLY=OFF
+cmake --build build -j "$(nproc)"
+```
+
+可执行文件：`./build/bin/compiler`
+
+## 编译器 CLI
+
+```bash
+compiler -S -o output.s input.sy       # 汇编输出（比赛标准）
+compiler -S -o output.s input.sy -O    # 带优化
+compiler --emit-ir input.sy            # 打印 IR
+compiler --emit-parse-tree input.sy    # 打印语法树
+```
+
+## 架构
+
+编译管线：`SysY → ANTLR 语法树 → 语义分析 → IR 生成 → IR 优化 → MIR 降级 → 寄存器分配 → 栈帧 → 窥孔 → AArch64 汇编`
+
+源码目录：`src/frontend/`（ANTLR 驱动）、`src/sem/`（Sema/SymbolTable）、`src/irgen/`（语法树→IR）、`src/ir/`（Module→Function→BasicBlock→Instruction，passes/ 含 Mem2Reg/CFGSimplify/ConstFold/ConstProp/DCE/CSE/LICM）、`src/mir/`（MachineModule→MachineFunction→MachineBasicBlock→MachineInstr，Lowering/RegAlloc/FrameLowering/AsmPrinter/Peephole）
+
+关键设计：IR 类型 void/i1/i32/float/i32*/float*；MIR 操作数 PhysReg/VReg/Imm/FrameIndex/Label/Symbol；`-O` 触发所有 IR pass；GP 可分配集含 x16/x17；xzr/wzr 为零寄存器，sp 为栈指针。
+
+## 竞赛红线（零容忍）
+
+1. 禁止投机优化（不得识别特定函数名/输入特征）
+2. 禁止硬编码计算结果
+3. 禁止依赖 UB（数组越界、除法溢出等假设）
+4. 优化必须对所有合法 SysY2026 程序语义保持
+
+## 历史故障模式——修改以下区域时必须遵守的预防规则
+
+| 区域 | 预防规则 |
+|------|----------|
+| 寄存器分配 | 合并后重算 degree；不修改遍历中的容器；Briggs 保守测试 |
+| 栈帧 | 大偏移量（>12KB）必须用 movz/movk 合成偏移 |
+| 活变量分析 | shift 链等密集 def-use 需保守干涉边（block defs>20 时全干涉） |
+| spill | 大函数（>120 vregs）限制 spill 轮次 ≤5 |
+| 活跃合并 | 合并前检查 u != v；move_adj 自环导致迭代器失效 |
+| IR 优化 | Load/Store/Call 不能重排跨越彼此；浮点不能随意重关联 |
+
+## 门禁
+
+```bash
+# 快门禁（每次 commit 前，~2分钟）
+./2026test.sh -c functional -x && ./2026test.sh -c h_functional -x
+
+# 中门禁（merge 前，~10分钟）
+./2026test.sh -c functional -x && ./2026test.sh -c h_functional -x && ./2026test.sh -c performance -x
+
+# 全门禁（关键节点，~30分钟）
+./2026test.sh  # 全量
+```
+
+- 绝不跳过门禁。功能测试失败不进入性能测量。门禁失败修复后重跑，不允许"先合并后修复"。
+- 指令数基线：`./count_asm.sh`；`指令数基线.md` 记录历史最低值。性能退化 >5 用例阻止合并，2-5 用例标记关注。
+
+## 代码规范
+
+- 交流、注释、commit message、文档一律使用中文；标识符和文件名使用英文。
+- 变量 `snake_case`，函数/类 `PascalCase`，成员变量 `snake_case_`
+- Git: `<type>(<scope>): <中文简述>`。一 commit 一逻辑变更。不提交编译或测试失败的代码。功能分支开发，master 保护。
+
+## MCP 使用
+
+| 场景 | 工具 | 不要 |
+|------|------|------|
+| 查找符号 | `codegraph_search` | 不要 grep |
+| 调用关系 | `codegraph_callers/callees` | 不要手动 Read 追踪 |
+| 改动影响 | `codegraph_impact` | 不要猜测 |
+| 代码区探索 | `codegraph_explore`（一次） | 不要逐个 codegraph_node |
+| 字面量 | `grep` | 不要用 codegraph |
+
+WebFetch 不可用（DeepSeek 后端域名校验失败），用 `bash scripts/fetch.sh <url>` 替代。
diff --git a/copy_src.sh b/copy_src.sh
new file mode 100755
index 00000000..8a5b9777
--- /dev/null
+++ b/copy_src.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+SRC_DIR=~/update_compiler/src
+DST_DIR=~/warning/src
+
+mkdir -p "$DST_DIR"
+
+find "$SRC_DIR" \( -name '*.cpp' -o -name '*.h' \) | while read -r file; do
+    rel_path="${file#$SRC_DIR/}"
+    dir=$(dirname "$rel_path")
+    mkdir -p "$DST_DIR/$dir"
+    cp "$file" "$DST_DIR/$rel_path"
+done
+
+echo "Done. All .cpp and .h files copied to $DST_DIR"
diff --git a/doc/LLVM-Loop-Block-分析报告.md b/doc/LLVM-Loop-Block-分析报告.md
new file mode 100644
index 00000000..0d551375
--- /dev/null
+++ b/doc/LLVM-Loop-Block-分析报告.md
@@ -0,0 +1,334 @@
+# 给 Claude 的 LLVM Loop Blocking 执行说明
+## 1. 目标
+你要根据这份文档，在当前项目中逐步实现循环分块优化。
+这不是背景介绍，而是执行说明。若你想扩展更多能力，但与本文的阶段约束冲突，必须优先服从本文。
+
+这份文档只保留三类内容：
+- 实现思路
+- 停点校验建议
+- 特别需要注意的事项
+
+目标不是一次性写完一个完整 Polly 风格多面体优化器，而是先做一个可验证、可调试、能稳定拒绝复杂情况的基础版本。
+
+### 最小成功标准
+基础版本至少要满足：
+1. 能识别一类简单可分块 loop，并稳定输出 `blockable/profitable` 结果。
+2. 能在至少一个简单样例上完成 strip-mining 或 blocking 的 IR 改写。
+3. 改写后的 IR 能通过后续编译并保持语义正确。
+4. 对明显不适合 blocking 的 loop，能给出稳定拒绝原因。
+5. 对 6 个复杂样例，至少能做到分类准确。
+
+### 任务边界
+你当前要实现的是 `plain loop blocking / strip-mining` 的基础版本，而不是完整工业级版本。
+
+在没有额外说明时，任务边界限制为：
+- counted loop
+- 结构规整的二维或三维嵌套循环
+- 边界和数组下标可静态分析
+- 访问模式接近仿射
+- 无复杂控制流
+- 无需要运行时依赖检查的复杂别名情况
+
+如果某个 loop 明显更适合：
+- loop interchange
+- autovectorization
+- reduction-aware blocking
+- matmul-specific blocking
+
+那么应优先拒绝，或者明确标注“不是当前阶段目标”。
+
+### 必须遵守的执行方式
+你必须遵守：
+1. 一次只实现一个大步骤，不允许把候选分析、tile 参数决策、CFG/SSA 重写一次性全写完。
+2. 每完成一个步骤，必须停下来检查，并明确说明这一阶段做了什么、IR 是否变化、变化是否符合预期。
+3. 如果当前阶段预期 IR 不应该变化，必须显式验证 IR 确实不变。
+4. 每个阶段至少准备一个“应该命中”的样例和一个“应该拒绝”的样例。
+5. 如果当前阶段检查没有通过，必须先修当前阶段，不能跳到下一阶段。
+6. 应优先追求“正确性 + 可调试性”，而不是过早追求复杂覆盖率。
+
+### 当前不应做的事情
+除非当前阶段明确要求，否则不应：
+- 一次性实现完整 polyhedral/SCoP 框架
+- 一次性支持复杂非仿射边界
+- 一次性支持复杂 alias 运行时检查
+- 一次性支持带数据相关分支的 block 重写
+- 一次性把 blocking、interchange、autovectorization、matmul-specialization 混成一个 pass
+
+### 默认保守策略
+如果不能稳定回答下面任一问题，就应默认拒绝该 loop：
+- loop 是否结构规整
+- 访问是否足够接近仿射
+- 依赖在分块后是否仍然安全
+- loop 是否真的值得 blocking
+- IR 重写是否能完整更新 `phi`、边界和 `GEP`
+
+换句话说：
+- 不确定时，先拒绝
+- 不能证明收益时，先不做
+- 不能证明重写完整时，先不改 IR
+
+### 每个停点结束时应汇报什么
+每完成一个停点，都应汇报：
+- 本阶段改了哪些文件
+- 本阶段新增了什么能力
+- 本阶段明确没有做什么
+- 使用了哪些测试样例
+- 优化前后 IR 是否变化
+- 程序语义是否保持一致
+- 是否进入下一阶段；如果不进入，当前阻塞点是什么
+
+建议尽量按下面结构汇报：
+
+```text
+[阶段名称]
+1. 本阶段目标
+2. 本阶段实际修改
+3. 本阶段明确未做内容
+4. 测试样例
+5. IR 对比结果
+6. 语义检查结果
+7. 是否进入下一阶段
+8. 若不进入，阻塞原因
+```
+
+### 当前停点失败时如何处理
+如果当前停点检查没有通过，应当：
+1. 明确失败属于哪类：`IR 不该变但变了`、`IR 应该变但没变`、`IR 变了但位置不对`、`语义不一致`、`命中了错误类型的 loop`。
+2. 优先修当前阶段，不进入下一阶段。
+3. 不要在修 bug 的同时顺手扩新能力。
+4. 如果失败根因超出任务边界，应回退到“保守拒绝”。
+***
+## 2. 实现思路
+你应把循环分块理解成：把一个大迭代空间切成多个小块，让数据在块内被重复利用，从而改善缓存局部性，并为后续向量化或重排创造条件。
+
+落到 IR 上，先只考虑最基础的 strip-mining / blocking。核心流程只有三步。
+
+### （1）先做候选检测与合法性判定
+这一阶段不改 IR，只判断哪些 loop 可以安全分块。
+
+最小可行版本只接受：
+- 单层 counted loop 的 strip-mining
+- 或二维/三维结构规整的 perfect / near-perfect nest
+- 归纳变量和边界容易识别
+- 数组访问可表达为简单仿射形式
+- 不存在明显会被 blocking 破坏的依赖
+
+如果要处理多维数组访问，最基础要求是能稳定识别：
+- 哪些维度是 loop index
+- 哪些维度决定 stride
+- 哪些访问在块内可复用
+
+### （2）再做 tile 参数与收益分析
+通过合法性检查后，再判断“值不值得 block”以及“按哪个维度 block”。
+
+最小可行版本只关注三个问题：
+- 哪个 loop 维度最值得被 strip-mine
+- tile size 取多少
+- 分块后块内数据重用是否明显增加
+
+基础启发式可以先很保守：
+- 候选 tile size 固定为 `8 / 16 / 32`
+- 再根据访问模式与维度个数做简单选择
+
+简单原则：
+- loop 只是逐元素扫描、当前内层已经 unit-stride，blocking 往往收益有限
+- loop 是矩阵归约型热点，blocking 往往更值得优先考虑
+
+### （3）最后做 IR 变换
+只有在“合法 + 有收益”都成立时，才真正改 IR。
+
+关键动作包括：
+- 把原循环拆成外层 tile loop 和内层块内 loop
+- 更新归纳变量 `phi`
+- 更新边界比较与递增指令
+- 更新 `getelementptr` 中使用的索引表达式
+- 正确处理完整 tile 与边界 tile
+
+### 当前最适合先支持的循环
+建议先只支持两类：
+
+1. 用来证明 strip-mining 机制正确的简单 loop
+
+2. 真正有 blocking 价值的矩阵归约型 loop
+
+第一类用来把变换框架做对，第二类才是性能价值更高的目标。
+
+### 扩展顺序建议
+比较稳的扩展顺序：
+1. 单层 strip-mining 和二维/三维规整循环的基础 blocking 变换。
+2. 完整 tile / partial tile 处理。
+3. 更稳健的 affine access 分析。
+4. 与 interchange、autovectorization、matmul-specialization 协同。
+***
+## 3. 停点校验建议
+这是最重要的部分。你必须强制分阶段，不允许一步写完。
+
+### 总规则
+必须遵守：
+1. 一次只实现一个大步骤。
+2. 每做完一步，先校验，再继续。
+3. 当前阶段若预期 IR 不变，必须验证 IR 真的不变。
+4. 每阶段至少准备一个“应该优化”和一个“不应该优化”的样例。
+5. 当前阶段不过，就不要进入下一阶段。
+
+每个阶段都固定做两类检查：
+- 结构检查：IR 是否按预期变化
+- 语义检查：程序输出是否保持不变
+
+### 停点一：只做候选检测与合法性判定
+这一步只做分析，不做 IR 改写。
+
+应完成：
+- 识别 counted loop 或 loop nest
+- 识别归纳变量与边界
+- 识别基本访问模式
+- 输出“可分块 / 不可分块”
+- 输出清晰拒绝原因
+
+不应该做：
+- 不应改写 CFG
+- 不应改写 `phi`
+- 不应改写 `GEP`
+- 不应修改循环边界和步长
+
+必须检查：
+- 可分块样例是否被识别为可分块
+- 不可分块样例是否被稳定拒绝
+- 拒绝原因是否准确
+- IR 是否完全不变
+
+推荐检查：
+- 打印 `blockable: yes/no`
+- 打印 `reason: non-affine-bound`
+- 打印 `reason: complex-cfg`
+- 打印 `reason: unsupported-access`
+- 对比优化前后 IR，确认完全一致
+
+### 停点二：只做 tile 计划与收益分析
+这一阶段仍然不做 IR 变换，只决定“该不该 block”以及“准备怎么 block”。
+
+应完成：
+- 在合法候选中筛选真正值得分块的 loop
+- 决定按哪个维度 block
+- 决定保守的 tile size
+- 给出收益理由
+
+必须检查：
+- 简单逐元素 unit-stride loop 是否被拒绝或降权
+- 矩阵归约型热点是否被保留为候选
+- 这一阶段 IR 是否仍然不变
+
+推荐检查：
+- 打印 `profitable: yes/no`
+- 打印 `reason: data-reuse`
+- 打印 `reason: already-streaming`
+- 打印 `tile-size: ...`
+
+### 停点三：最后做 blocking 变换
+这是最后一步，才真正改写 IR。
+
+应完成：
+- 生成外层 tile loop
+- 生成块内 loop
+- 更新归纳变量 `phi`
+- 更新边界比较与递增指令
+- 更新 `GEP` 和相关使用点
+- 正确处理边界 tile
+
+必须检查：
+- IR 中是否出现新的 tile loop 和块内 loop
+- 原边界是否被改成 tile 级边界
+- `GEP` 是否引用了新的 tile 基址和块内偏移
+- 变换后的 IR 是否仍能通过后续编译与运行
+
+推荐检查：
+- diff IR，确认新增 tile loop
+- 比较变换前后关键 `phi` / `icmp` / `add` / `GEP`
+- 运行优化前后程序并比较输出
+
+### 对六个测试样例的要求
+你应把这 6 个样例作为“复杂回归样例”使用：
+- `many_mat_cal-1.sy`
+- `many_mat_cal-2.sy`
+- `many_mat_cal-3.sy`
+- `matmul1.sy`
+- `matmul2.sy`
+- `matmul3.sy`
+
+对这 6 个样例，应遵守下面的判断：
+
+1. 三层矩阵归约热点
+- `many_mat_cal-*` 中 `sum = sum + C[i][k] * A[k][j]`
+- `matmul1/2/3` 中带条件的三层矩阵归约
+- 这是最值得优先尝试命中的 blocking 目标
+
+2. `many_mat_cal-*` 中的逐元素点运算 loop
+- `C[i][j] = A[i][j] * 2 + B[i][j] * 3`
+- `val = val * val + 7; val = val / 3`
+- 这类 loop 通常不应作为 plain blocking 的优先目标
+- 原因是内层 `j` 已经是 streaming / unit-stride，更适合自动向量化
+
+3. `matmul1/2/3` 中的转置型二维 loop
+- `b[i][j] = a[j][i]`
+- 这类 loop 不是 plain blocking 的第一优先目标
+- 更适合 loop interchange
+
+### 面向 Claude 的样例使用规则
+不要把目标写成“全部样例都要出现 blocking IR 变化”。
+正确目标是：
+1. 先让样例分类与拒绝理由稳定。
+2. 再让真正属于 blocking 目标的 loop 发生变化。
+3. 对其他样例，即使最终 IR 不变，只要拒绝理由准确，也算当前阶段通过。
+***
+## 4. 特别需要注意的事项
+### 4.1 合法可分块不等于值得分块
+很多 loop 在结构上可以 strip-mine，但 blocking 不一定带来收益。
+如果当前 loop 只是简单 streaming 访问，内层已经是 unit-stride，那么 blocking 可能只会增加额外 loop 开销。
+
+### 4.2 优先关注数据复用
+blocking 真正的目标不是制造更多 loop，而是让：
+- 块内数据被重复使用
+- 工作集更容易留在 cache 中
+
+如果块内没有明显复用，通常收益有限。
+
+### 4.3 reduction 热点、点运算和转置 loop 要区分对待
+矩阵乘法类三层归约 loop 更接近 blocking 的主目标。
+简单逐元素 loop 更接近向量化主目标。
+转置型二维 loop 更接近 interchange 主目标。
+不要把这三类 loop 混为一谈。
+
+### 4.4 partial tile 处理最容易出错
+如果 `N` 不能整除 `TileSize`，必须正确处理边界 tile。
+需要特别警惕：
+- 越界访问
+- 漏掉尾部元素
+- 外层 tile 步长正确但内层边界错误
+
+### 4.5 `GEP` 重写必须和新 induction 体系一致
+做完 blocking 后，访问地址通常不再直接是原始归纳变量，而是：
+- tile base
+- block-local offset
+
+如果 `GEP` 仍然引用旧 induction variable，通常说明重写不完整。
+
+### 4.6 调试时优先相信 IR 差异
+循环分块最有效的调试方式不是先看性能，而是先看 IR。
+每个阶段优先回答：
+- 有没有变化
+- 变化是不是预期那一种
+- 变化是否出现在正确的 loop 上
+- 输出是否仍然正确
+***
+## 5. 最终交付标准
+当你认为自己已经完成一个可提交版本时，至少应能给出：
+- 哪些文件被修改
+- 当前 blocking pass 支持哪些 loop 形态
+- 当前明确不支持哪些 loop 形态
+- 三个停点分别如何验证通过
+- 这 6 个样例分别被归入哪一类
+- 至少一个真实发生 IR 变化且语义正确的 blocking 样例
+- 至少一个保持拒绝且理由正确的对照样例
+
+如果你只能记住一句话，就记住这一句：
+先把三层矩阵归约热点做成真正可验证的 blocking 候选，再去考虑更复杂的协同优化；不要为了命中全部性能样例而过早把 blocking、interchange、vectorization 混在一起做。
diff --git a/doc/LLVM-Loop-Fussion-分析报告.md b/doc/LLVM-Loop-Fussion-分析报告.md
new file mode 100644
index 00000000..660a0af6
--- /dev/null
+++ b/doc/LLVM-Loop-Fussion-分析报告.md
@@ -0,0 +1,318 @@
+# 给 Claude 的 LLVM Loop Fusion 执行说明
+## 1. 目标
+你要根据这份文档，在当前项目中逐步实现循环融合优化。
+这不是背景介绍，而是执行说明。若你想额外扩展能力，但与本文的阶段约束冲突，必须优先服从本文。
+
+这份文档只保留四类内容：
+- 任务边界
+- 实现思路
+- 停点校验建议
+- 特别需要注意的事项
+
+目标不是一次性写完一个完整工业级 loop fusion pass，而是先做一个可验证、可调试、能稳定拒绝复杂情况的基础版本。
+
+### 最小成功标准
+基础版本至少要满足：
+1. 能识别一类简单可融合 loop pair，并稳定输出 `fusible/profitable` 结果。
+2. 能在至少一个简单样例上完成真正的 IR 融合改写。
+3. 改写后的 IR 能通过后续编译并保持语义正确。
+4. 对明显不适合融合的 loop pair，能给出稳定拒绝原因。
+5. 对 6 个复杂样例，至少能做到分类准确。
+
+### 任务边界
+你当前要实现的是 `plain loop fusion` 的基础版本，而不是完整工业级版本。
+
+在没有额外说明时，任务边界限制为：
+- 两个相邻的 counted loop
+- 两个 loop 的结构接近一致
+- 两个 loop 的 trip count 相同，或当前阶段只接受完全相同
+- 两个 loop 的 preheader / header / latch / exit 易于识别
+- 无复杂控制流
+- 无需要运行时依赖检查的复杂别名情况
+- 两个 loop 之间的中间代码为空，或当前阶段完全不支持 MIC（中间代码移动，见 4.5）
+
+如果某个场景明显更适合：
+- loop interchange
+- loop blocking
+- autovectorization
+- reduction-aware fusion
+- 软件流水或更复杂的跨循环调度
+
+那么应优先拒绝，或者明确标注“不是当前阶段目标”。
+
+### 你必须遵守的执行方式
+你必须遵守：
+1. 一次只实现一个大步骤，不允许把候选分析、收益分析、CFG/SSA 重写一次性全写完。
+2. 每完成一个步骤，必须停下来检查，并明确说明这一阶段做了什么、IR 是否变化、变化是否符合预期。
+3. 如果当前阶段预期 IR 不应该变化，必须显式验证 IR 确实不变。
+4. 每个阶段至少准备一个“应该命中”的样例和一个“应该拒绝”的样例。
+5. 如果当前阶段检查没有通过，必须先修当前阶段，不能跳到下一阶段。
+6. 应优先追求“正确性 + 可调试性”，而不是过早追求复杂覆盖率。
+
+### 当前不应做的事情
+除非当前阶段明确要求，否则不应：
+- 一次性支持 MIC
+- 一次性支持 trip count 对齐所需的 peeling
+- 一次性支持复杂条件控制流下的融合
+- 一次性支持跨多个 loop 的连锁融合
+- 一次性把 fusion、interchange、blocking、vectorization 混成一个 pass
+
+### 默认保守策略
+如果不能稳定回答下面任一问题，就应默认拒绝该 loop pair：
+- 两个 loop 是否一定共同执行
+- 两个 loop 的 trip count 是否一致
+- 两个 loop 之间是否存在禁止融合的依赖
+- 融合后是否真的值得
+- CFG / `phi` / 分支目标是否能被完整重写
+
+### 每个停点结束时应汇报什么
+每完成一个停点，都应汇报：
+- 本阶段改了哪些文件
+- 本阶段新增了什么能力
+- 本阶段明确没有做什么
+- 使用了哪些测试样例
+- 优化前后 IR 是否变化
+- 程序语义是否保持一致
+- 是否进入下一阶段；如果不进入，当前阻塞点是什么
+
+### 当前停点失败时如何处理
+如果当前停点检查没有通过，应当：
+1. 明确失败属于哪类：`IR 不该变但变了`、`IR 应该变但没变`、`IR 变了但位置不对`、`语义不一致`、`命中了错误类型的 loop pair`。
+2. 优先修当前阶段，不进入下一阶段。
+3. 不要在修 bug 的同时顺手扩新能力。
+4. 如果失败根因超出任务边界，应回退到“保守拒绝”。
+
+***
+## 2. 实现思路
+你应把循环融合理解成：把两个本来顺序执行的循环，合并成一个统一的循环控制结构，让同一迭代下原本分散的工作更靠近，从而减少循环控制开销、缩短数据重用距离，并为后续优化创造条件。
+
+落到 IR 上，先只考虑最基础的 “两个相邻 counted loop 融合”。核心流程只有三步。
+
+### （1）先做候选检测与合法性判定
+这一阶段不改 IR，只判断哪些 loop pair 可以安全融合。
+
+最小可行版本只接受：
+- 两个 loop 相邻
+- 两个 loop 控制流等价，或者至少在当前 CFG 上总是一起执行
+- 两个 loop 的 trip count 相同
+- 两个 loop 的 induction 形态简单
+- 两个 loop 都是单出口
+- 两个 loop 之间没有禁止融合的负距离依赖
+
+最基础要能稳定识别：
+- 第一个 loop 的 exit 是否直接连接第二个 loop 的 preheader
+- 两个 loop 的归纳变量和边界比较
+- 两个 loop 的读写集合
+- 是否存在跨 loop 的依赖方向问题
+
+### （2）再做收益分析
+通过合法性检查后，再判断“值不值得 fuse”。
+
+最小可行版本只关注三个问题：
+- 两个 loop 是否有明显的数据重用关系
+- 融合后是否减少循环控制开销
+- 融合后循环体是否不会膨胀到明显不利
+
+基础启发式可以先很保守：
+- 若第二个 loop 直接使用第一个 loop 刚写出的数组元素，优先考虑融合
+- 若两个 loop 只是无关地顺序扫描不同数组，收益通常有限
+- 若融合会显著增加 live range 或明显放大循环体，先拒绝
+
+### （3）最后做 IR 变换
+只有在“合法 + 有收益”都成立时，才真正改 IR。
+
+关键动作包括：
+- 合并两个 loop 的循环控制
+- 更新 `phi`
+- 更新 `icmp` / `br`
+- 让原来第二个 loop 的循环体插入到第一个 loop 的迭代体中
+- 删除失效的 preheader / exit / 空基本块
+- 保持 SSA 与 CFG 一致
+最终效果应接近一个统一的循环控制结构：
+`for (i = 0; i < N; ++i) { body0(i); body1(i); }`
+
+### 当前最适合先支持的循环
+建议先只支持两类：
+
+1. 用来证明 fusion 机制正确的简单生产者-消费者 loop pair
+- 例如先写 `A[i]`，下一段立刻读 `A[i]`
+
+2. 控制结构简单、trip count 一致、相邻且无中间代码的 loop pair
+
+第一类用来把融合框架做对，第二类才适合逐步扩覆盖。
+
+### 扩展顺序建议
+比较稳的扩展顺序：
+1. 两个完全相邻 counted loop 的基础融合。
+2. 更稳健的依赖分析。
+3. trip count 对齐与有限 peeling。
+4. MIC。
+5. 与 interchange、blocking、vectorization 协同。
+
+***
+## 3. 停点校验建议
+这是最重要的部分。你必须强制分阶段，不允许一步写完。
+
+### 总规则
+必须遵守：
+1. 一次只实现一个大步骤。
+2. 每做完一步，先校验，再继续。
+3. 当前阶段若预期 IR 不变，必须验证 IR 真的不变。
+4. 每阶段至少准备一个“应该优化”和一个“不应该优化”的样例。
+5. 当前阶段不过，就不要进入下一阶段。
+
+每个阶段都固定做两类检查：
+- 结构检查：IR 是否按预期变化
+- 语义检查：程序输出是否保持不变
+
+### 停点一：只做候选检测与合法性判定
+这一步只做分析，不做 IR 改写。
+
+应完成：
+- 识别相邻 loop pair
+- 识别归纳变量与边界
+- 检查 trip count 是否一致
+- 检查是否存在明显禁止融合的依赖
+- 输出“可融合 / 不可融合”
+- 输出清晰拒绝原因
+
+必须检查：
+- 可融合样例是否被识别为可融合
+- 不可融合样例是否被稳定拒绝
+- 拒绝原因是否准确
+- IR 是否完全不变
+
+推荐检查：
+- 打印 `fusible: yes/no`
+- 打印 `reason: trip-count-mismatch`
+- 打印 `reason: non-adjacent-loops`
+- 打印 `reason: negative-dependence`
+- 对比优化前后 IR，确认完全一致
+
+### 停点二：只做收益分析
+这一阶段仍然不做 IR 变换，只决定“该不该 fuse”。
+
+应完成：
+- 在合法候选中筛选真正值得融合的 loop pair
+- 给出收益理由
+- 区分“合法但收益不足”和“确实值得融合”
+
+必须检查：
+- 生产者-消费者型 loop pair 是否被保留为候选
+- 两个无关扫描 loop 是否被拒绝或降权
+- 这一阶段 IR 是否仍然不变
+
+推荐检查：
+- 打印 `profitable: yes/no`
+- 打印 `reason: producer-consumer-reuse`
+- 打印 `reason: only-branch-saving`
+- 打印 `reason: register-pressure-risk`
+
+### 停点三：最后做 fusion 变换
+这是最后一步，才真正改写 IR。
+
+应完成：
+- 合并两个 loop 的控制流
+- 更新 `phi`
+- 更新边界比较与分支
+- 把第二个 loop 的 body 接到第一个 loop 的迭代体中
+- 删除失效基本块
+
+必须检查：
+- IR 中是否只剩一个融合后的循环控制结构
+- 第二个 loop 的 body 是否进入融合后的循环体
+- 原先两个 `icmp + br` 是否被合并
+- 变换后的 IR 是否仍能通过后续编译与运行
+
+推荐检查：
+- diff IR，确认 loop 控制结构被合并
+- 比较变换前后关键 `phi` / `icmp` / `br`
+- 运行优化前后程序并比较输出
+
+### 对六个测试样例的要求
+你应把这 6 个样例作为“复杂回归样例”使用：
+- `many_mat_cal-1.sy`
+- `many_mat_cal-2.sy`
+- `many_mat_cal-3.sy`
+- `matmul1.sy`
+- `matmul2.sy`
+- `matmul3.sy`
+
+对这 6 个样例，应遵守下面的判断：
+
+1. `many_mat_cal-*` 中连续的逐元素阶段
+- 例如先写 `C[i][j]`，再对 `C[i][j]` 做逐元素变换
+- 这类相邻阶段是最接近 fusion 候选的部分
+- 如果结构上被 lowered 成相邻、trip count 一致的 loop pair，应优先分析是否可融合
+
+2. `many_mat_cal-*` 与 `matmul*` 中的三层矩阵归约热点
+- 例如 `sum = sum + C[i][k] * A[k][j]`
+- 这类 loop 不是 plain loop fusion 的第一优先目标
+- 更常见的主优化方向是 blocking、interchange、reduction-aware 变换
+
+3. `matmul1/2/3` 中的转置型二维 loop
+- 例如 `b[i][j] = a[j][i]`
+- 这类 loop 更适合 interchange，而不是 fusion
+
+4. 带条件的复杂阶段
+- 例如 `if (...) temp = temp + ...`
+- 当前基础 fusion 阶段通常应保守拒绝
+
+### 面向 Claude 的样例使用规则
+不要把目标写成“这 6 个样例都要发生 fusion IR 变化”。
+正确目标是：
+1. 先让样例分类与拒绝理由稳定。
+2. 再让真正属于 fusion 候选的 loop pair 发生变化。
+3. 对其他样例，即使最终 IR 不变，只要拒绝理由准确，也算当前阶段通过。
+
+你尤其不应把下面这些现象误判成失败：
+- `many_mat_cal-*` 的三层矩阵归约 loop 没有发生 fusion
+- `matmul1/2/3` 的转置 loop 被判断为更适合 interchange
+- 带条件的复杂阶段继续被拒绝
+
+这些在基础 fusion 阶段通常都是合理结果。
+
+***
+## 4. 特别需要注意的事项
+### 4.1 可融合不等于值得融合
+很多 loop pair 在结构上可以融合，但不一定带来收益。
+如果两个 loop 之间没有明显数据重用，只是单纯省掉一层循环控制，收益可能有限。
+
+### 4.2 trip count 一致是最核心的前提之一
+基础版本里，最好只接受 trip count 明确一致的 loop pair。
+如果一开始就支持不一致 trip count 的对齐与 peeling，调试成本会显著上升。
+
+### 4.3 依赖方向必须保守处理
+真正危险的不是“没有融合成功”，而是“错误地融合了本不该融合的 loop”。
+尤其要警惕跨 loop 的 RAW / WAR / WAW 依赖。
+
+### 4.4 `phi` 与 CFG 重写最容易出错
+fusion 的本质不是把两段代码简单拼起来，而是把两个循环控制结构合并成一个。
+如果 `phi` incoming block、latch 跳转或 exit 连接错了，IR 很容易失效。
+
+### 4.5 不要过早支持 MIC
+中间代码移动很容易把问题从“loop fusion”变成“通用 CFG 重排”。
+基础版本建议先只做完全相邻 loop。
+
+### 4.6 不要把 fusion 和别的优化混为一谈
+对这 6 个样例，更现实的顺序是：
+1. 先在小样例上把 loop pair 融合做对。
+2. 再在可能的逐元素阶段上尝试命中简单 fusion。
+3. 明确拒绝那些更适合 interchange、blocking 或 vectorization 的 loop。
+
+### 4.7 调试时优先相信 IR 差异
+循环融合最有效的调试方式不是先看性能，而是先看 IR：有没有变化、变化是否符合预期、是否出现在正确的 loop pair 上、输出是否仍然正确。
+
+***
+## 5. 最终交付标准
+当你认为自己已经完成一个可提交版本时，至少应能给出：
+- 哪些文件被修改
+- 当前 fusion pass 支持哪些 loop pair 形态
+- 当前明确不支持哪些 loop pair 形态
+- 三个停点分别如何验证通过
+- 这 6 个样例分别被归入哪一类
+- 至少一个真实发生 IR 变化且语义正确的 fusion 样例
+- 至少一个保持拒绝且理由正确的对照样例
+
+如果你只能记住一句话，就记住这一句：
+先把两个完全相邻、trip count 一致、无复杂依赖的 loop pair 融合做对，再去考虑 peeling、MIC 和与其他循环优化的协同；不要为了命中全部性能样例而过早把 fusion、interchange、blocking、vectorization 混在一起做。
diff --git a/doc/LLVM-Loop-Interchange-分析报告.md b/doc/LLVM-Loop-Interchange-分析报告.md
new file mode 100644
index 00000000..9a465291
--- /dev/null
+++ b/doc/LLVM-Loop-Interchange-分析报告.md
@@ -0,0 +1,443 @@
+# 给 Claude 的 LLVM Loop Interchange 执行说明
+## 1. 目标
+你要根据这份文档，在当前项目中逐步实现循环交换优化。
+这份文档只保留三类内容：
+- 实现思路
+- 停点校验建议
+- 特别需要注意的事项
+目标不是一次性写完一个覆盖所有情况的 Loop Interchange，而是让你严格按阶段推进，在每个停点完成后先验证，再继续。
+
+### 任务边界
+你当前要实现的是 `plain loop interchange` 的基础版本，而不是完整工业级版本。
+在没有额外说明时，你应当把任务边界限制在：
+- 二维循环
+- 结构接近 perfect nest
+- counted loop
+- 无 reduction
+- 无复杂控制流
+- 无需要运行时检查的复杂别名情况
+
+如果某个 loop 明显更适合：
+- reduction-aware interchange
+- loop blocking
+- autovectorization
+- matmul-specific 优化
+
+那么你应优先拒绝，或者明确标注“不是当前阶段目标”，而不是硬做。
+
+### 你必须遵守的执行方式
+你必须遵守下面这些规则：
+1. 一次只实现一个大步骤，不允许把合法性分析、收益分析、CFG/SSA 重写一次性全写完。
+2. 每完成一个步骤，必须停下来检查，并明确说明“这一阶段做了什么、IR 是否变化、变化是否符合预期”。
+3. 如果当前阶段预期 IR 不应该变化，你必须显式验证 IR 确实不变。
+4. 每个阶段至少准备一个“应该命中”的样例和一个“应该拒绝”的样例。
+5. 如果当前阶段检查没有通过，你必须先修当前阶段，不能跳到下一阶段。
+6. 你应该优先追求“正确性 + 可调试性”，而不是过早追求复杂覆盖率。
+
+### 你当前不应做的事情
+除非当前阶段已经明确要求，否则你不应：
+- 一次性支持三层以上循环交换
+- 一次性支持 reduction-aware interchange
+- 一次性支持带数据相关分支的交换
+- 一次性支持复杂 delinearization
+- 一次性支持运行时依赖检查
+- 为了命中 `many_mat_cal-*` / `matmul*` 而把 pass 扩展成多个优化的混合实现
+
+如果你发现某个样例更适合别的 pass，你应当明确写出原因，而不是把 loop interchange 做成“大杂烩优化”。
+
+### 每个停点结束时你应当汇报什么
+每完成一个停点，你都应当汇报：
+- 本阶段改了哪些文件
+- 本阶段新增了什么能力
+- 本阶段明确没有做什么
+- 使用了哪些测试样例
+- 优化前后 IR 是否变化
+- 程序语义是否保持一致
+- 是否进入下一阶段；如果不进入，当前阻塞点是什么
+
+### 每个停点结束时的建议输出格式
+你应尽量按下面的结构汇报：
+
+```text
+[阶段名称]
+1. 本阶段目标
+2. 本阶段实际修改
+3. 本阶段明确未做内容
+4. 测试样例
+5. IR 对比结果
+6. 语义检查结果
+7. 是否进入下一阶段
+8. 若不进入，阻塞原因
+```
+***
+## 2. 实现思路
+你应当把循环交换理解成：在不改变程序语义的前提下，交换两层嵌套循环的先后顺序，让更适合顺序访问或更适合后续向量化的维度成为内层循环。
+
+落到 IR 上，先只考虑最基础的二维 perfect nest。核心流程只有三步。
+
+### （1）先做合法性判定
+这一阶段不改 IR，只判断交换是否安全。
+最小可行版本你应当只接受：
+- 两层嵌套 counted loop
+- 结构接近 perfect nest
+- 外层和内层都有明确归纳变量 `phi`
+- 边界和步长易于识别
+- 没有多出口复杂 CFG
+- 依赖分析可以明确给出“可交换”结论
+
+你应当使用依赖方向向量来判断是否合法。可简化理解为：
+- `<`：依赖方向与索引递增方向一致
+- `=`：同一迭代或无差异
+- `>`：依赖方向反向，通常非法
+- `*`：不确定，当前版本应保守拒绝
+
+最基础规则：
+- 如果依赖矩阵中交换后会让第一个非 `=` 方向变成 `>` 或 `*`，你就应当拒绝
+- 如果依赖关系保持合法字典序，才允许进入下一阶段
+
+### （2）再做收益分析
+通过合法性检查后，你再判断“值不值得换”。
+
+最小可行版本建议只关注三个收益来源：
+- 交换后内层访存是否更接近 unit-stride
+- 交换后是否更利于缓存局部性
+- 交换后是否更利于后续自动向量化
+
+这一步的重点不是复杂成本模型，而是先把明显值得换和明显不值得换区分开。
+
+一个简单原则是：
+- 如果交换后内层循环明显从 stride 访问变成连续访问，可以优先考虑交换
+- 如果当前内层已经是 unit-stride，通常不应仅为了“看起来更对称”而交换
+
+### （3）最后做 IR 变换
+只有在“合法 + 有收益”都成立时，你再真正改 IR。
+
+关键动作包括：
+- 交换两层 loop 的 header / latch / exit 关系
+- 更新两层归纳变量对应的 `phi`
+- 更新边界比较和递增指令
+- 更新循环体里所有使用归纳变量的指令
+- 特别关注 `getelementptr`、`load/store`、`icmp/add`
+
+对二维数组访问，可把目标理解为：让“原本挂在外层的索引”进入内层位置，或者反过来，具体取决于哪种顺序更有利。
+
+### 当前最适合先支持的循环
+建议你先只支持最简单的二维 perfect nest，例如：
+
+```c
+for (int i = 0; i < N; i++) {
+  for (int j = 0; j < M; j++) {
+    B[i][j] = A[j][i];
+  }
+}
+```
+
+这类循环的价值是：
+- 结构规整
+- 只有两层
+- 没有 reduction
+- 收益分析比较直观
+
+### 扩展顺序建议
+比较稳的扩展顺序是：
+1. 二维 perfect nest、无 reduction、无复杂分支。
+2. 动态边界但结构规整的二维循环。
+3. 更稳健的依赖分析与 delinearization。
+4. 与 reduction-aware interchange、blocking、autovectorization 协同。
+***
+## 3. 停点校验建议
+这是最重要的部分。你必须强制分阶段，不允许一步写完。
+
+### 总规则
+你必须遵守：
+1. 一次只实现一个大步骤。
+2. 每做完一步，先校验，再继续。
+3. 当前阶段若预期 IR 不变，必须验证 IR 真的不变。
+4. 每阶段至少准备一个“应该优化”和一个“不应该优化”的样例。
+5. 当前阶段不过，就不要进入下一阶段。
+
+每个阶段都固定做两类检查：
+- 结构检查：IR 是否按预期变化
+- 语义检查：程序输出是否保持不变
+
+### 停点一：只做合法性判定
+这一步只做分析，不做 IR 改写。
+
+你应完成：
+- 识别两层嵌套 loop
+- 识别两层归纳变量
+- 识别访问模式和基本依赖
+- 输出“可交换 / 不可交换”
+- 输出清晰拒绝原因
+
+你不应该做：
+- 不应改写 CFG
+- 不应改写 `phi`
+- 不应改写 `GEP`
+- 不应修改任何循环顺序
+
+你必须检查：
+- 可交换样例是否被识别为可交换
+- 不可交换样例是否被稳定拒绝
+- 拒绝原因是否准确
+- IR 是否完全不变
+
+推荐检查方法：
+- 打印 `interchangeable: yes/no`
+- 打印 `reason: dependence-illegal`
+- 打印 `reason: not-perfect-nest`
+- 打印 `reason: complex-cfg`
+- 对比优化前后 IR，确认完全一致
+
+推荐样例：
+
+```c
+for (int i = 0; i < N; i++) {
+  for (int j = 0; j < M; j++) {
+    B[i][j] = A[j][i];
+  }
+}
+```
+
+```c
+for (int i = 0; i < N; i++) {
+  for (int j = 1; j < M; j++) {
+    A[i][j] = A[i][j] + A[i][j - 1];
+  }
+}
+```
+
+如果失败，优先排查：
+- loop 遍历是否正确
+- perfect nest 识别是否过严或过松
+- 方向向量构造是否错误
+- 不确定依赖是否应保守拒绝
+
+### 停点二：只做收益分析与交换计划
+这一阶段仍然不做 IR 变换，只决定“该不该换”以及“准备怎么换”。
+
+你应完成：
+- 在合法候选中筛选真正值得交换的 loop
+- 标注哪一层会变成新的内层
+- 给出收益理由
+
+你不应该做：
+- 不应改写 CFG
+- 不应改写 `phi`
+- 不应改写 `GEP`
+
+你必须检查：
+- 当前内层已经 unit-stride 的 loop 是否被拒绝
+- 明显的转置型 loop 是否被保留为候选
+- 这一阶段 IR 是否仍然不变
+
+推荐检查方法：
+- 打印 `profitable: yes/no`
+- 打印 `reason: better-unit-stride`
+- 打印 `reason: already-unit-stride`
+- 打印 `reason: not-worth-it`
+
+推荐样例：
+- 应命中：`B[i][j] = A[j][i]`
+- 应拒绝：`C[i][j] = A[i][j] * 2 + B[i][j] * 3`
+
+如果失败，优先排查：
+- unit-stride 判断是否颠倒
+- 是否把“合法但无收益”的 loop 也错误保留
+- 收益分析是否只看一种访存，忽略了整体模式
+
+### 停点三：最后做循环交换变换
+这是最后一步，才真正改写 IR。
+
+你应完成：
+- 交换两层 loop 的结构
+- 更新两层 induction `phi`
+- 更新比较与递增指令
+- 更新循环体中依赖 loop index 的 `GEP` 和相关使用点
+
+你必须检查：
+- loop 顺序是否真的被交换
+- 关键 `phi` / `icmp` / `add` 是否同步更新
+- `GEP` 使用的索引是否与新 loop 顺序一致
+- 变换后的 IR 是否仍能通过后续编译和运行
+
+推荐检查方法：
+- diff IR，确认 loop header / latch / `phi` / `GEP` 发生预期变化
+- 运行优化前后程序并比较输出
+
+如果失败，优先排查：
+- loop header / latch 接线是否错
+- `phi` 的 incoming block 是否错配
+- `GEP` 仍引用旧 induction variable
+- 外层和内层 exit condition 没有同步更新
+
+### 对六个测试样例的要求
+你应当把这 6 个样例作为“复杂回归样例”使用：
+- `many_mat_cal-1.sy`
+- `many_mat_cal-2.sy`
+- `many_mat_cal-3.sy`
+- `matmul1.sy`
+- `matmul2.sy`
+- `matmul3.sy`
+
+对这 6 个样例，你应当遵守下面的判断：
+
+1. `matmul1/2/3` 中的转置型二维循环
+
+```c
+while(i<1000){
+  j = 0;
+  while(j<1000){
+    b[i][j] = a[j][i];
+    j = j+1;
+  }
+  i = i+1;
+}
+```
+
+这是最值得优先尝试命中的 loop。
+原因：
+- 两层规整 counted loop
+- 没有 reduction
+- 是典型的转置型访问
+- 交换后通常更利于其中一侧形成连续内层访问
+
+2. `many_mat_cal-*` 中的逐元素点运算 loop
+
+```c
+while (j < T) {
+  C[i][j] = A[i][j] * 2 + B[i][j] * 3;
+  j = j + 1;
+}
+```
+
+这类 loop 通常不应作为 plain loop interchange 的优先目标。
+原因：
+- 当前内层 `j` 已经是 unit-stride
+- plain interchange 往往不会带来收益
+- 它们更适合自动向量化，而不是单独做循环交换
+
+3. `many_mat_cal-*` 和 `matmul*` 中的三层矩阵归约 loop
+
+```c
+while (k < T) {
+  sum = sum + C[i][k] * A[k][j];
+  k = k + 1;
+}
+```
+
+以及：
+
+```c
+while(k<1000){
+  if(a[i][k]*b[k][j] % 2 == 0)
+    temp = temp + b[i][k]*a[k][j];
+  k = k+1;
+}
+```
+
+这类 loop 在当前阶段通常不应作为 plain loop interchange 的主目标。
+原因：
+- 包含 reduction 或条件 reduction
+- 更适合 reduction-aware interchange、blocking 或 matmul-specific 优化
+
+### 六个样例对应的期望行为
+为了让你的行为更稳定，你应当把这 6 个样例分成三类对待：
+
+1. 应优先尝试命中的样例类型
+- `matmul1/2/3` 中的转置型二维 loop
+- 目标：在合法且有收益时，优先考虑交换
+
+2. 应优先判为“收益不足”的样例类型
+- `many_mat_cal-*` 中已经是按行连续访问的逐元素 loop
+- 目标：即使合法，也通常不交换
+
+3. 应优先判为“不是当前阶段目标”的样例类型
+- `many_mat_cal-*` / `matmul*` 中的 reduction、条件 reduction、三层矩阵归约热点
+- 目标：继续拒绝，并明确说明更适合别的优化
+- 如果现在强行支持，容易把问题做得过早复杂化
+
+### 用这 6 个样例回归时的额外规则
+如果开始用这 6 个样例做回归，你还应当加三条规则：
+1. 先要求“拒绝原因更准确”，再要求“真的命中”。
+2. 每次只开放一类新能力，不要同时扩交换合法性、复杂收益模型、reduction 支持。
+3. 不仅看 IR 是否变化，还要看是不是正确类型的 loop 发生了变化。
+
+例如：
+- 在基础版本里，`matmul1/2/3` 的转置型二维 loop 应优先出现变化
+- `many_mat_cal-*` 的逐元素 loop 更应被判为“收益不足，不交换”
+- 三层 reduction loop 在没有额外能力前应继续不变
+***
+## 4. 特别需要注意的事项
+### 4.1 依赖分析宁可保守，不要误交换
+这是最重要的一条。
+
+```c
+for (int i = 0; i < N; i++) {
+  for (int j = 1; j < M; j++) {
+    A[i][j] = A[i][j] + A[i][j - 1];
+  }
+}
+```
+
+这类 loop 的内层存在跨迭代依赖（`A[i][j]` 依赖 `A[i][j - 1]`，方向向量为 `(=, <)`）。严格按方向向量规则，交换后为 `(<, =)`，仍然合法；但只要依赖分析无法稳定给出这一结论，就应保守拒绝，而不是直接交换。
+如果方向向量出现 `>` 或 `*`，当前版本应优先拒绝。
+
+### 4.2 合法不等于值得换
+很多 loop 结构上可以交换，但交换后并没有更好的局部性。
+例如 `many_mat_cal-*` 里很多逐元素按行访问 loop，当前内层已经是 unit-stride。
+这类 loop 即使合法，也应因收益不足而拒绝。
+
+### 4.3 reduction 不是当前基础版本的主目标
+如果内层主要是：
+- `sum += ...`
+- `temp = temp + ...`
+- 带条件的累加
+
+那么它更像 reduction 或 masked reduction 问题。
+在 reduction-aware interchange 没实现前，你应优先拒绝，而不是硬做。
+
+### 4.4 交换时不要只改 GEP，不改 loop 结构
+循环交换不是简单交换两个索引名字。
+你必须同步更新：
+- loop header / latch
+- 两层 induction `phi`
+- 边界比较
+- 递增指令
+- 依赖这些 induction variable 的所有使用点
+
+如果只改 `GEP` 而不改 loop 结构，通常会直接出错。
+
+### 4.5 多维数组信息可能在 IR 中已经变弱
+LLVM IR 里的地址表达式可能已经被线性化。
+因此你可能需要：
+- 从 `getelementptr` 或地址表达式中恢复维度关系
+- 保守判断哪些访问确实对应二维数组的两个维度
+
+在这一步不稳时，宁可先限制只支持最规整的 `GEP(base, i, j)` 形态。
+
+### 4.6 不要过早把复杂样例全当成 interchange 目标
+对这 6 个样例，更现实的顺序是：
+1. 先命中 `matmul1/2/3` 里的转置型二维 loop。
+2. 再稳定拒绝 `many_mat_cal-*` 中已经 unit-stride 的逐元素 loop。
+3. 最后再考虑是否与 reduction-aware interchange、blocking 协同处理矩阵归约热点。
+
+### 4.7 调试时优先相信 IR 差异
+循环交换最有效的调试方式不是先看性能，而是先看 IR。
+每个阶段你都优先回答：
+- 有没有变化？
+- 变化是不是预期那一种？
+- 变化是否出现在正确的 loop 上？
+- 输出是否仍然正确？
+***
+## 5. 最终执行建议
+如果你把这份文档直接当成执行规则，可以简化成六句：
+1. 先分析合法性，保证 IR 不变。
+2. 再分析收益，保证 IR 仍不变。
+3. 最后做 loop 结构交换与 `phi/GEP` 同步更新。
+4. 每一步都做 IR diff 和语义检查。
+5. 每一步都准备“应命中”和“应拒绝”两类样例。
+6. 没通过当前停点，就不要进入下一步。
+
+如果你只能记住一句话，就记住这一句：
+先把 `matmul1/2/3` 里的转置型二维 loop 做对，再去考虑更复杂的场景；不要为了命中全部性能样例而过早扩展到 reduction、blocking 或其他优化。
diff --git a/doc/opt-cookbook-ai-loop-interchange.md b/doc/opt-cookbook-ai-loop-interchange.md
new file mode 100644
index 00000000..dd793f1a
--- /dev/null
+++ b/doc/opt-cookbook-ai-loop-interchange.md
@@ -0,0 +1,185 @@
+# Loop Interchange（循环交换）
+
+## 前置依赖
+- 前置基础-IndVar分析（识别循环中 phi-based induction variable 的 step/base）
+
+## 目标
+交换嵌套循环的内外层顺序，使内层循环沿数组连续维（行主序的最后一维）迭代，提升 cache 局部性。核心难点不是交换本身（swap 两个循环头），而是**收益判断函数——什么时候交换有益？**
+
+## 算法原理
+
+两种实现思路：
+
+| 维度 | Gnalc（结构化IR+仿射分析） | 复旦大学（四元式+SCEV） |
+|------|------------------------|------------------------|
+| 合法性 | Omega Test 精确依赖 | SCEV + loopInvariant |
+| 收益 | GEP 维度位置（inner_idx vs outer_idx） | SCEV 步长系数（abs(faStep) vs abs(sonStep)） |
+| 精度 | 高（复杂仿射） | 中（依赖 SCEV） |
+| 框架适配 | 需仿射分析→不可行 | 可简化实现 |
+
+核心逻辑：**内层 IV 出现在非连续维 → 交换使其变外层 → 新内层 IV 在连续维 → cache 友好。**
+
+## 触发模式
+
+两层完美嵌套循环 + 内层 init/step/bound 不依赖外层 IV：
+
+```
+outer.header:
+  %i = phi [I0, outer.ph], [%i.next, outer.latch]
+  %cmp.i = icmp lt %i, NI
+  condbr %cmp.i, inner.ph, outer.exit
+
+inner.ph:
+  br inner.header
+inner.header:
+  %j = phi [J0, inner.ph], [%j.next, inner.latch]
+  %cmp.j = icmp lt %j, NJ
+  condbr %cmp.j, body, inner.exit
+
+body:
+  %addr = gep @A, %j, %i       ;; %j在维度0, %i在维度1 ← inner IV(%j)不在连续维(最后一维)！
+  %v = load %addr
+  ...
+```
+
+**在行主序下，`A[i][j]` 的连续维是最后一维（dim=1，由 j 索引）。若内层遍历 j → 连续访问 ✓；若内层遍历 i → stride=N 跳跃 ✗ → 交换有利。**
+
+## 变换规则
+
+```
+;; before: i 外层, j 内层, A[i][j] — j 在连续维 ✓ 已是最优, 无需交换
+;; before: i 外层, j 内层, A[j][i] — i 在连续维, 但 i 是外层 ✗ → 需要交换
+
+;; 交换后：
+outer'.header:    ;; 原 inner.header
+  %j_out = phi [J0, outer'.ph], [%j_out.next, outer'.latch]
+  %cmp.j_out = icmp lt %j_out, NJ
+  condbr %cmp.j_out, inner'.ph, outer'.exit
+
+inner'.header:    ;; 原 outer.header
+  %i_in = phi [I0, inner'.ph], [%i_in.next, inner'.latch]
+  %cmp.i_in = icmp lt %i_in, NI
+  condbr %cmp.i_in, body, inner'.exit
+
+body:     ;; 不变: A[j][i] — 现在 j 外层, i 内层, i在连续维 ✓
+  %addr = gep @A, %j_out, %i_in
+  %v = load %addr
+```
+
+## 收益函数：两种方案对比 + 推荐实现
+
+### 方案 A：Gnalc 的 GEP 维度位置法（推荐，无需 SCEV）
+
+```cpp
+// 单次数组访问的交换代价
+// 返回负值 = 交换有益, 正值 = 交换有害, 0 = 无关
+int GetInterchangeCost(GEP* gep, Value* outer_iv, Value* inner_iv) {
+  // gep 的索引序列: op0=base_ptr, op1=dim0, op2=dim1, ..., opN=dim(N-1), op(N+1)=element_offset
+  // 对于 A[dim0][dim1]: gep @A, dim0, dim1
+  // 行主序: 最后一维(dim1) 连续
+
+  int outer_dim = -1, inner_dim = -1;
+  int num_indices = gep->GetNumOperands() - 2;  // 去除 base_ptr 和 element_offset
+
+  for (int d = 0; d < num_indices; d++) {
+    auto* idx = gep->GetOperand(d + 1);
+    if (DependsOn(idx, outer_iv)) outer_dim = d;
+    if (DependsOn(idx, inner_iv)) inner_dim = d;
+  }
+
+  // 两个 IV 都没出现在这条 GEP 中
+  if (outer_dim == -1 || inner_dim == -1) return 0;
+
+  // 核心规则: 内层 IV 在更靠后的维度(index更大) → 已经连续 → 交换有害
+  //          内层 IV 在更靠前的维度(index更小) → 不连续 → 交换有益
+  // 例如 A[j][i]: inner=i 在 dim=1(连续) ✓, outer=j 在 dim=0 → inner_dim=1 > outer_dim=0 → cost=+1
+  // 例如 A[i][j]: inner=j 在 dim=1(连续) ✓, outer=i 在 dim=0 → inner_dim=1 > outer_dim=0 → cost=+1
+  // 例如 A[j][i] 但 i 是 outer: outer=i 在 dim=1, inner=j 在 dim=0 → inner_dim=0 < outer_dim=1 → cost=-1 交换有益！
+  return (inner_dim < outer_dim) ? -1 : 1;
+}
+```
+
+### 方案 B：复旦 SCEV 步长法（需 SCEV，精度更高）
+
+比较每个 IV 在地址表达式中的步长系数绝对值。abs(inner_step) < abs(outer_step) → 内层步长小、连续访问、不需换。
+
+### 推荐：方案 A（无需 SCEV）
+
+GEP 索引结构天然暴露维度位置。竞赛用例中数组访问几乎都是 `A[dim0][dim1]` 直接对应 GEP 操作数，方案 A 足够。
+
+## 实现骨架（合法性 + 收益）
+
+```cpp
+bool TryInterchange(Loop* outer, Loop* inner) {
+  auto* outer_iv = outer->GetIV();
+  auto* inner_iv = inner->GetIV();
+
+  // === 合法性检查 ===
+  // 1. 内层循环的 init/step/bound 不依赖外层 IV
+  if (DependsOn(inner->GetInit(), outer_iv)) return false;
+  if (DependsOn(inner->GetBound(), outer_iv)) return false;
+  // 2. 外层 body 中除内层循环外无其他副作用指令
+  for (auto* inst : outer->GetBody()) {
+    if (inst->GetParent() == inner->GetHeader()) continue;  // 跳过内层循环本身
+    if (isa<StoreInst>(inst) || isa<CallInst>(inst)) return false;
+  }
+  // 3. 内层循环无中间 exit（单 latch + 单 exiting）
+  if (inner->GetExitingBlocks().size() != 1) return false;
+
+  // === 收益判断 ===
+  int cost = 0;
+  for (auto* bb : inner->GetBlocks()) {
+    for (auto& inst : bb->GetInstructions()) {
+      auto* gep = dyn_cast<GEP>(inst.get());
+      if (!gep) continue;
+      cost += GetInterchangeCost(gep, outer_iv, inner_iv);
+    }
+  }
+  // 收益阈值: cost < 0 表示受益的 GEP 多于受损的 GEP（净收益为正）
+  if (cost >= 0) return false;
+
+  // === 执行交换 ===
+  // 交换两个循环的 header/latch/exit 结构
+  // 关键: 交换后需要修正 phi 的 incoming block 和 CFG 边
+  std::swap(outer->header, inner->header);  // 简化示意
+  std::swap(outer->latch, inner->latch);
+  // 详见 Part 2: CFG 重连
+  return true;
+}
+```
+
+## 正确性不变量
+- [ ] 交换后内层循环的 init/step/bound 仍然不依赖外层 IV（交换前检查了，交换后对称成立）
+- [ ] 循环嵌套深度不变（只是交换了 header 和 latch，循环树结构不变）
+- [ ] body 中的指令零修改（只改变两个循环 IV 对应 phi 的"内/外"角色）
+
+## 禁止事项
+- 绝对不在内层 init/step/bound 依赖外层 IV 时交换（语义错误）
+- 绝对不交换非完美嵌套的循环（外层 body 有其他副作用指令）
+- 绝对不交换有中间 exit 的内层循环
+- 绝对不在 cost ≥ 0 时强制交换（可能退化性能）
+- 绝对不做 Omega Test 级别的精确依赖分析（竞赛场景不需要，GEP 位置法已足够）
+
+## 最小验证
+```bash
+# 测试 transpose0 用例: 原始有大量 stride 访问, 交换后应改善
+./build/bin/compiler --emit-ir test/test_case/performance/transpose0.sy | grep "interchanged"
+./2026test.sh -c performance -n 5
+```
+
+## 收益函数核心公式（摘要）
+
+```
+对于 A[dim0][dim1]...[dimN-1] (行主序, 最后一维连续):
+
+inner_dim = inner IV 所在维度位置
+outer_dim = outer IV 所在维度位置
+
+cost_per_access = (inner_dim < outer_dim) ? -1 : 1    （任一 IV 未出现在该访问中时记 0）
+
+总 cost = sum(内层循环中所有 GEP 访问的 cost_per_access)
+
+if cost < 0: 交换有益
+```
+
+**直觉**：`inner_dim < outer_dim` 意味着内层 IV 出现在更"非连续"的维度 → 交换后使其变成外层 IV → 新的内层 IV 出现在更"连续"的维度 → cache hit rate ↑。
diff --git a/include/frontend/AntlrDriver.h b/include/frontend/AntlrDriver.h
deleted file mode 100644
index ee22da95..00000000
--- a/include/frontend/AntlrDriver.h
+++ /dev/null
@@ -1,20 +0,0 @@
-// 包装 ANTLR4，提供简易的解析入口。
-#pragma once
-
-#include <memory>
-#include <string>
-
-#include "SysYLexer.h"
-#include "SysYParser.h"
-#include "antlr4-runtime.h"
-
-struct AntlrResult {
-  std::unique_ptr<antlr4::ANTLRInputStream> input;
-  std::unique_ptr<SysYLexer> lexer;
-  std::unique_ptr<antlr4::CommonTokenStream> tokens;
-  std::unique_ptr<SysYParser> parser;
-  antlr4::tree::ParseTree* tree = nullptr;  // owned by parser
-};
-
-// 解析指定文件，发生错误时抛出 std::runtime_error。
-AntlrResult ParseFileWithAntlr(const std::string& path);
diff --git a/include/frontend/SyntaxTreePrinter.h b/include/frontend/SyntaxTreePrinter.h
deleted file mode 100644
index 4633b5ec..00000000
--- a/include/frontend/SyntaxTreePrinter.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#pragma once
-
-#include <iosfwd>
-
-#include "antlr4-runtime.h"
-
-// 以树状缩进形式直接打印 ANTLR parse tree。
-void PrintSyntaxTree(antlr4::tree::ParseTree* tree, antlr4::Parser* parser,
-                     std::ostream& os);
diff --git a/include/ir/IR.h b/include/ir/IR.h
deleted file mode 100644
index 87a35e0e..00000000
--- a/include/ir/IR.h
+++ /dev/null
@@ -1,545 +0,0 @@
-// 当前只支撑 i32、i32*、void 以及最小的内存/算术指令，演示用。
-//
-// 当前已经实现：
-// 1. 基础类型系统：void / i32 / i32*
-// 2. Value 体系：Value / ConstantValue / ConstantInt / Function / BasicBlock / User / GlobalValue / Instruction
-// 3. 最小指令集：Add / Alloca / Load / Store / Ret
-// 4. BasicBlock / Function / Module 三层组织结构
-// 5. IRBuilder：便捷创建常量和最小指令
-// 6. def-use 关系的轻量实现：
-//    - Instruction 保存 operand 列表
-//    - Value 保存 uses
-//    - 支持 ReplaceAllUsesWith 的简化实现
-//
-// 当前尚未实现或只做了最小占位：
-// 1. 完整类型系统：数组、函数类型、label 类型等
-// 2. 更完整的指令系统：br / condbr / call / phi / gep 等
-// 3. 更成熟的 Use 管理（例如 LLVM 风格的双向链式结构）
-// 4. 更完整的 IR verifier 和优化基础设施
-//
-// 当前需要特别说明的两个简化点：
-// 1. BasicBlock 虽然已经纳入 Value 体系，但其类型目前仍用 void 作为占位，
-//    后续如果补 label type，可以再改成更合理的块标签类型。
-// 2. ConstantValue 体系目前只实现了 ConstantInt，后续可以继续补 ConstantFloat、
-//    ConstantArray等更完整的常量种类。
-//
-// 建议的扩展顺序：
-// 1. 先补更多指令和类型
-// 2. 再补控制流相关 IR
-// 3. 最后再考虑把 Value/User/Use 进一步抽象成更完整的框架
-
-#pragma once
-
-#include <iosfwd>
-#include <memory>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-namespace ir {
-
-class Type;
-class Value;
-class User;
-class ConstantValue;
-class ConstantInt;
-class ConstantFloat;
-class GlobalValue;
-class Instruction;
-class BasicBlock;
-class Function;
-class Argument;
-class GlobalVariable;
-
-
-
-// Use 表示一个 Value 的一次使用记录。
-// 当前实现设计：
-// - value：被使用的值
-// - user：使用该值的 User
-// - operand_index：该值在 user 操作数列表中的位置
-
-class Use {
- public:
-  Use() = default;
-  Use(Value* value, User* user, size_t operand_index)
-      : value_(value), user_(user), operand_index_(operand_index) {}
-
-  Value* GetValue() const { return value_; }
-  User* GetUser() const { return user_; }
-  size_t GetOperandIndex() const { return operand_index_; }
-
-  void SetValue(Value* value) { value_ = value; }
-  void SetUser(User* user) { user_ = user; }
-  void SetOperandIndex(size_t operand_index) { operand_index_ = operand_index; }
-
- private:
-  Value* value_ = nullptr;
-  User* user_ = nullptr;
-  size_t operand_index_ = 0;
-};
-
-// IR 上下文：集中管理类型、常量等共享资源，便于复用与扩展。
-class Context {
- public:
-  Context() = default;
-  ~Context();
-  // 去重创建 i32 常量。
-  ConstantInt* GetConstInt(int v);
-  ConstantFloat* GetConstFloat(double v);
-  // 去重创建 i1 常量（0 或 1）。
-  ConstantInt* GetConstBool(int v);
-
-  std::string NextTemp();
-
- private:
-  std::unordered_map<int, std::unique_ptr<ConstantInt>> const_ints_;
-  std::unordered_map<std::string, std::unique_ptr<ConstantFloat>> const_floats_;
-  std::unordered_map<int, std::unique_ptr<ConstantInt>> const_bools_;
-  int temp_index_ = -1;
-};
-
-class Type {
- public:
-  enum class Kind { Void, Int1, Int32, Float32, PtrInt32, PtrFloat32 };
-  explicit Type(Kind k);
-  // 使用静态共享对象获取类型。
-  // 同一类型可直接比较返回值是否相等，例如：
-  // Type::GetInt32Type() == Type::GetInt32Type()
-  static const std::shared_ptr<Type>& GetVoidType();
-  static const std::shared_ptr<Type>& GetInt1Type();
-  static const std::shared_ptr<Type>& GetInt32Type();
-  static const std::shared_ptr<Type>& GetFloat32Type();
-  static const std::shared_ptr<Type>& GetPtrInt32Type();
-  static const std::shared_ptr<Type>& GetPtrFloat32Type();
-  Kind GetKind() const;
-  bool IsVoid() const;
-  bool IsInt1() const;
-  bool IsInt32() const;
-  bool IsFloat32() const;
-  bool IsPtrInt32() const;
-  bool IsPtrFloat32() const;
-
- private:
-  Kind kind_;
-};
-
-class Value {
- public:
-  Value(std::shared_ptr<Type> ty, std::string name);
-  virtual ~Value() = default;
-  const std::shared_ptr<Type>& GetType() const;
-  const std::string& GetName() const;
-  void SetName(std::string n);
-  bool IsVoid() const;
-  bool IsInt32() const;
-  bool IsFloat32() const;
-  bool IsPtrInt32() const;
-  bool IsPtrFloat32() const;
-  bool IsConstant() const;
-  bool IsInstruction() const;
-  bool IsUser() const;
-  bool IsFunction() const;
-  void AddUse(User* user, size_t operand_index);
-  void RemoveUse(User* user, size_t operand_index);
-  const std::vector<Use>& GetUses() const;
-  void ReplaceAllUsesWith(Value* new_value);
-
- protected:
-  std::shared_ptr<Type> type_;
-  std::string name_;
-  std::vector<Use> uses_;
-};
-
-// ConstantValue 是常量体系的基类。
-// 当前只实现了 ConstantInt，后续可继续扩展更多常量种类。
-class ConstantValue : public Value {
- public:
-  ConstantValue(std::shared_ptr<Type> ty, std::string name = "");
-};
-
-class ConstantInt : public ConstantValue {
- public:
-  ConstantInt(std::shared_ptr<Type> ty, int v);
-  int GetValue() const { return value_; }
-
- private:
-  int value_{};
-};
-
-class ConstantFloat : public ConstantValue {
- public:
-  ConstantFloat(std::shared_ptr<Type> ty, double v);
-  double GetValue() const { return value_; }
-
- private:
-  double value_{};
-};
-
-// 后续还需要扩展更多指令类型。
-enum class Opcode {
-  Add,
-  Sub,
-  Mul,
-  Div,
-  Mod,
-  SIToFP,
-  FPToSI,
-  ZExt,
-  Eq,
-  Ne,
-  Lt,
-  Le,
-  Gt,
-  Ge,
-  Alloca,
-  Load,
-  Store,
-  GEP,
-  Call,
-  Br,
-  CondBr,
-  Ret,
-  Phi
-};
-
-// User 是所有“会使用其他 Value 作为输入”的 IR 对象的抽象基类。
-// 当前实现中只有 Instruction 继承自 User。
-class User : public Value {
- public:
-  User(std::shared_ptr<Type> ty, std::string name);
-  size_t GetNumOperands() const;
-  Value* GetOperand(size_t index) const;
-  void SetOperand(size_t index, Value* value);
-  void AddOperand(Value* value);
-
- private:
-  std::vector<Value*> operands_;
-};
-
-// GlobalValue 是全局值/全局变量体系的空壳占位类。
-// 当前只补齐类层次，具体初始化器、打印和链接语义后续再补。
-class GlobalValue : public User {
- public:
-  GlobalValue(std::shared_ptr<Type> ty, std::string name);
-};
-
-class GlobalVariable : public GlobalValue {
- public:
-  enum class StorageKind {
-    Scalar,
-    Array,
-  };
-
-  enum class ElemKind {
-    Int32,
-    Float32,
-  };
-
-  GlobalVariable(std::string name, int init_value);
-  GlobalVariable(std::string name, double init_value);
-  GlobalVariable(std::string name, size_t array_size);
-  GlobalVariable(std::string name, size_t array_size, ElemKind elem_kind);
-  GlobalVariable(std::string name, size_t array_size, const std::vector<int>& init_values);
-  GlobalVariable(std::string name, size_t array_size, const std::vector<double>& init_values);
-  StorageKind GetStorageKind() const;
-  bool IsArray() const;
-  ElemKind GetElemKind() const;
-  bool IsFloatElem() const;
-  int GetInitValue() const;
-  double GetInitFloatValue() const;
-  size_t GetArraySize() const;
-  const std::vector<int>& GetInitValues() const;
-  const std::vector<double>& GetInitFloatValues() const;
-  bool HasInitValues() const;
-
- private:
-  StorageKind storage_kind_ = StorageKind::Scalar;
-  ElemKind elem_kind_ = ElemKind::Int32;
-  int init_value_ = 0;
-  double init_float_value_ = 0.0;
-  size_t array_size_ = 0;
-  std::vector<int> init_values_;
-  std::vector<double> init_float_values_;
-};
-
-class Instruction : public User {
- public:
-  Instruction(Opcode op, std::shared_ptr<Type> ty, std::string name = "");
-  Opcode GetOpcode() const;
-  bool IsTerminator() const;
-  BasicBlock* GetParent() const;
-  void SetParent(BasicBlock* parent);
-
- private:
-  Opcode opcode_;
-  BasicBlock* parent_ = nullptr;
-};
-
-class BinaryInst : public Instruction {
- public:
-  BinaryInst(Opcode op, std::shared_ptr<Type> ty, Value* lhs, Value* rhs,
-             std::string name);
-  Value* GetLhs() const;
-  Value* GetRhs() const;
-};
-
-class CastInst : public Instruction {
- public:
-  CastInst(Opcode op, std::shared_ptr<Type> ty, Value* operand,
-           std::string name);
-  Value* GetOperandValue() const;
-};
-
-class BranchInst : public Instruction {
- public:
-  BranchInst(std::shared_ptr<Type> void_ty, BasicBlock* target);
-  BasicBlock* GetTarget() const;
-};
-
-class CondBranchInst : public Instruction {
- public:
-  CondBranchInst(std::shared_ptr<Type> void_ty, Value* cond, BasicBlock* true_bb,
-                 BasicBlock* false_bb);
-  Value* GetCond() const;
-  BasicBlock* GetTrueTarget() const;
-  BasicBlock* GetFalseTarget() const;
-};
-
-class CallInst : public Instruction {
- public:
-  CallInst(std::shared_ptr<Type> ret_ty, Function* callee,
-           const std::vector<Value*>& args, std::string name);
-  Function* GetCallee() const;
-  size_t GetNumArgs() const;
-  Value* GetArg(size_t index) const;
-};
-
-class ReturnInst : public Instruction {
- public:
-  ReturnInst(std::shared_ptr<Type> void_ty, Value* val = nullptr);
-  Value* GetValue() const;
-  bool HasValue() const;
-};
-
-class AllocaInst : public Instruction {
- public:
-  AllocaInst(std::shared_ptr<Type> elem_ty, std::string name,
-             Value* count = nullptr);
-  bool IsArrayAlloca() const;
-  Value* GetCount() const;
-  std::shared_ptr<Type> GetElementType() const;
-};
-
-class GetElementPtrInst : public Instruction {
- public:
-  GetElementPtrInst(std::shared_ptr<Type> ptr_ty, Value* base_ptr,
-                    Value* index, std::string name);
-  Value* GetBasePtr() const;
-  Value* GetIndex() const;
-};
-
-class LoadInst : public Instruction {
- public:
-  LoadInst(std::shared_ptr<Type> val_ty, Value* ptr, std::string name);
-  Value* GetPtr() const;
-};
-
-class StoreInst : public Instruction {
-public:
-  StoreInst(std::shared_ptr<Type> void_ty, Value* val, Value* ptr);
-  Value* GetValue() const;
-  Value* GetPtr() const;
-};
-
-class PhiInst : public Instruction {
-public:
-  PhiInst(std::shared_ptr<Type> ty, std::string name);
-  AllocaInst* GetAlloca() const { return alloca_; }
-  void SetAlloca(AllocaInst* alloca) { alloca_ = alloca; }
-
-private:
-  AllocaInst* alloca_;
-};
-
-class Argument : public Value {
- public:
-  Argument(std::shared_ptr<Type> ty, std::string name, size_t index);
-  size_t GetIndex() const;
-
- private:
-  size_t index_ = 0;
-};
-
-// BasicBlock 已纳入 Value 体系，便于后续向更完整 IR 类图靠拢。
-// 当前其类型仍使用 void 作为占位，后续可替换为专门的 label type。
-class BasicBlock : public Value {
- public:
-  explicit BasicBlock(std::string name);
-  Function* GetParent() const;
-  void SetParent(Function* parent);
-  bool HasTerminator() const;
-  const std::vector<std::unique_ptr<Instruction>>& GetInstructions() const;
-  const std::vector<BasicBlock*>& GetPredecessors() const;
-  const std::vector<BasicBlock*>& GetSuccessors() const;
-  std::vector<BasicBlock*>& GetMutablePredecessors() {
-    return predecessors_;
-  }
-  std::vector<BasicBlock*>& GetMutableSuccessors() {
-    return successors_;
-  }
-  template <typename T, typename... Args>
-  T* Append(Args&&... args) {
-    if (HasTerminator()) {
-      throw std::runtime_error("BasicBlock 已有 terminator，不能继续追加指令: " +
-                               name_);
-    }
-    auto inst = std::make_unique<T>(std::forward<Args>(args)...);
-    auto* ptr = inst.get();
-    ptr->SetParent(this);
-    instructions_.push_back(std::move(inst));
-    return ptr;
-  }
-  template <typename T, typename... Args>
-  T* Prepend(Args&&... args) {
-    auto inst = std::make_unique<T>(std::forward<Args>(args)...);
-    auto* ptr = inst.get();
-    ptr->SetParent(this);
-    instructions_.insert(instructions_.begin(), std::move(inst));
-    return ptr;
-  }
-  template <typename T, typename... Args>
-  T* InsertAlloca(Args&&... args) {
-    auto inst = std::make_unique<T>(std::forward<Args>(args)...);
-    auto* ptr = inst.get();
-    ptr->SetParent(this);
-    instructions_.insert(instructions_.begin() + alloca_insert_index_, std::move(inst));
-    ++alloca_insert_index_;
-    return ptr;
-  }
-  void RemoveInstruction(Instruction* inst) {
-    for (auto it = instructions_.begin(); it != instructions_.end(); ++it) {
-      if (it->get() == inst) {
-        instructions_.erase(it);
-        break;
-      }
-    }
-  }
-  std::unique_ptr<Instruction> TakeInstruction(Instruction* inst);
-  void InsertInstructionBeforeTerminator(std::unique_ptr<Instruction> inst);
-
- private:
-  Function* parent_ = nullptr;
-  std::vector<std::unique_ptr<Instruction>> instructions_;
-  std::vector<BasicBlock*> predecessors_;
-  std::vector<BasicBlock*> successors_;
-  size_t alloca_insert_index_ = 0;
-};
-
-// Function 当前也采用了最小实现。
-// 需要特别注意：由于项目里还没有单独的 FunctionType，
-// Function 继承自 Value 后，其 type_ 目前只保存“返回类型”，
-// 并不能完整表达“返回类型 + 形参列表”这一整套函数签名。
-// 这对当前只支持 int main() 的最小 IR 足够，但后续若补普通函数、
-// 形参和调用，通常需要引入专门的函数类型表示。
-class Function : public Value {
- public:
-  // 当前构造函数接收的也是返回类型，而不是完整函数类型。
-  Function(std::string name, std::shared_ptr<Type> ret_type,
-           bool is_external = false);
-  Argument* AddParam(const std::string& name, std::shared_ptr<Type> type);
-  const std::vector<std::unique_ptr<Argument>>& GetParams() const;
-  bool IsExternal() const;
-  BasicBlock* CreateBlock(const std::string& name);
-  BasicBlock* GetEntry();
-  const BasicBlock* GetEntry() const;
-  const std::vector<std::unique_ptr<BasicBlock>>& GetBlocks() const;
-
- private:
-  bool is_external_ = false;
-  BasicBlock* entry_ = nullptr;
-  std::vector<std::unique_ptr<Argument>> params_;
-  std::vector<std::unique_ptr<BasicBlock>> blocks_;
-};
-
-class Module {
- public:
-  Module() = default;
-  Context& GetContext();
-  const Context& GetContext() const;
-  // 创建函数时当前只显式传入返回类型，尚未接入完整的 FunctionType。
-  Function* CreateFunction(const std::string& name,
-                           std::shared_ptr<Type> ret_type,
-                           bool is_external = false);
-  Function* GetFunction(const std::string& name) const;
-  GlobalVariable* CreateGlobalI32(const std::string& name, int init_value);
-  GlobalVariable* CreateGlobalF32(const std::string& name, double init_value);
-  GlobalVariable* CreateGlobalArrayI32(const std::string& name,
-                                       size_t array_size);
-  GlobalVariable* CreateGlobalArrayF32(const std::string& name,
-                                       size_t array_size);
-  GlobalVariable* CreateGlobalArrayI32(const std::string& name,
-                                       size_t array_size,
-                                       const std::vector<int>& init_values);
-  GlobalVariable* CreateGlobalArrayF32(const std::string& name,
-                                       size_t array_size,
-                                       const std::vector<double>& init_values);
-  GlobalVariable* GetGlobal(const std::string& name) const;
-  const std::vector<std::unique_ptr<GlobalVariable>>& GetGlobals() const;
-  const std::vector<std::unique_ptr<Function>>& GetFunctions() const;
-
- private:
-  Context context_;
-  std::vector<std::unique_ptr<GlobalVariable>> globals_;
-  std::vector<std::unique_ptr<Function>> functions_;
-};
-
-class IRBuilder {
- public:
-  IRBuilder(Context& ctx, BasicBlock* bb);
-  void SetInsertPoint(BasicBlock* bb);
-  BasicBlock* GetInsertBlock() const;
-
-  // 构造常量、二元运算、返回指令的最小集合。
-  ConstantInt* CreateConstInt(int v);
-  ConstantFloat* CreateConstFloat(double v);
-  BinaryInst* CreateBinary(Opcode op, Value* lhs, Value* rhs,
-                           const std::string& name);
-  BinaryInst* CreateAdd(Value* lhs, Value* rhs, const std::string& name);
-  BinaryInst* CreateICmp(Opcode op, Value* lhs, Value* rhs,
-                         const std::string& name);
-  CastInst* CreateSIToFP(Value* operand, const std::string& name);
-  CastInst* CreateFPToSI(Value* operand, const std::string& name);
-  CastInst* CreateZExt(Value* operand, std::shared_ptr<Type> target_ty, const std::string& name);
-  AllocaInst* CreateAlloca(std::shared_ptr<Type> elem_ty, const std::string& name,
-                           Value* count = nullptr);
-  AllocaInst* CreateAllocaI32(const std::string& name,
-                              Value* count = nullptr);
-  AllocaInst* CreateAllocaF32(const std::string& name,
-                              Value* count = nullptr);
-  LoadInst* CreateLoad(Value* ptr, const std::string& name);
-  StoreInst* CreateStore(Value* val, Value* ptr);
-  GetElementPtrInst* CreateGEP(Value* base_ptr, Value* index,
-                               const std::string& name);
-  CallInst* CreateCall(Function* callee, const std::vector<Value*>& args,
-                       const std::string& name);
-  BranchInst* CreateBr(BasicBlock* target);
-  CondBranchInst* CreateCondBr(Value* cond, BasicBlock* true_bb,
-                               BasicBlock* false_bb);
-  ReturnInst* CreateRet(Value* v);
-  ReturnInst* CreateRetVoid();
-  PhiInst* CreatePhi(std::shared_ptr<Type> ty, const std::string& name);
-
- private:
-  Context& ctx_;
-  BasicBlock* insert_block_;
-};
-
-class IRPrinter {
- public:
-  void Print(const Module& module, std::ostream& os);
-};
-
-}  // namespace ir
diff --git a/include/irgen/IRGen.h b/include/irgen/IRGen.h
deleted file mode 100644
index 861f6fcb..00000000
--- a/include/irgen/IRGen.h
+++ /dev/null
@@ -1,122 +0,0 @@
-// 将语法树翻译为 IR。
-// 实现拆分在 IRGenFunc/IRGenStmt/IRGenExp/IRGenDecl。
-
-#pragma once
-
-#include <any>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "SysYBaseVisitor.h"
-#include "SysYParser.h"
-#include "ir/IR.h"
-#include "sem/Sema.h"
-
-namespace ir {
-class Module;
-class Function;
-class IRBuilder;
-class Value;
-}
-
-class IRGenImpl final : public SysYBaseVisitor {
- public:
-  IRGenImpl(ir::Module& module, const SemanticContext& sema);
-
-  std::any visitCompUnit(SysYParser::CompUnitContext* ctx) override;
-  std::any visitFuncDef(SysYParser::FuncDefContext* ctx) override;
-  std::any visitBlock(SysYParser::BlockContext* ctx) override;
-  std::any visitBlockItem(SysYParser::BlockItemContext* ctx) override;
-  std::any visitDecl(SysYParser::DeclContext* ctx) override;
-  std::any visitVarDecl(SysYParser::VarDeclContext* ctx) override;
-  std::any visitStmt(SysYParser::StmtContext* ctx) override;
-  std::any visitVarDef(SysYParser::VarDefContext* ctx) override;
-  std::any visitExp(SysYParser::ExpContext* ctx) override;
-  std::any visitAddExp(SysYParser::AddExpContext* ctx) override;
-  std::any visitMulExp(SysYParser::MulExpContext* ctx) override;
-  std::any visitUnaryExp(SysYParser::UnaryExpContext* ctx) override;
-  std::any visitPrimaryExp(SysYParser::PrimaryExpContext* ctx) override;
-  std::any visitLVal(SysYParser::LValContext* ctx) override;
-  std::any visitNumber(SysYParser::NumberContext* ctx) override;
-
- private:
-  enum class BlockFlow {
-    Continue,
-    Terminated,
-  };
-
-  BlockFlow VisitBlockItemResult(SysYParser::BlockItemContext& item);
-  ir::Value* EvalExpr(SysYParser::ExpContext& expr);
-  ir::Value* EvalBinaryOrFold(ir::Opcode op, ir::Value* lhs, ir::Value* rhs);
-  std::shared_ptr<ir::Type> ResolveBType(SysYParser::BTypeContext* btype) const;
-    int EvalConstIntExpr(SysYParser::ExpContext& expr);
-    int EvalConstIntExpr(SysYParser::ConstExpContext& expr);
-    int EvalConstIntAddExp(SysYParser::AddExpContext& expr);
-    int EvalConstIntMulExp(SysYParser::MulExpContext& expr);
-    int EvalConstIntUnaryExp(SysYParser::UnaryExpContext& expr);
-    int EvalConstIntPrimaryExp(SysYParser::PrimaryExpContext& expr);
-    double EvalConstFloatExpr(SysYParser::ConstExpContext& expr);
-    double EvalConstFloatAddExp(SysYParser::AddExpContext& expr);
-    double EvalConstFloatMulExp(SysYParser::MulExpContext& expr);
-    double EvalConstFloatUnaryExp(SysYParser::UnaryExpContext& expr);
-    double EvalConstFloatPrimaryExp(SysYParser::PrimaryExpContext& expr);
-    std::vector<int> EvalArrayExtents(
-      const std::vector<SysYParser::ConstExpContext*>& dims);
-    std::vector<int> GetArrayExtentsForDecl(SysYParser::VarDefContext* decl);
-    std::vector<int> GetArrayExtentsForConstDecl(
-      SysYParser::ConstDefContext* decl);
-    std::vector<int> GetArrayExtentsForLVal(SysYParser::LValContext& lval,
-                        bool& is_array);
-    ir::Value* BuildLinearizedIndex(
-      const std::vector<ir::Value*>& indices,
-      const std::vector<int>& extents_with_first_dim) ;
-  ir::Value* CastValueTo(ir::Value* value,
-                         const std::shared_ptr<ir::Type>& target_type);
-  ir::Value* GetLValAddress(SysYParser::LValContext& lval);
-  ir::AllocaInst* CreateEntryBlockAlloca(std::shared_ptr<ir::Type> elem_ty,
-                                         const std::string& name,
-                                         ir::Value* count = nullptr);
-  std::string NextBlockName(const std::string& prefix);
-  void EmitCondBranch(SysYParser::CondContext& cond, ir::BasicBlock* true_bb,
-                      ir::BasicBlock* false_bb);
-  void EmitLOrBranch(SysYParser::LOrExpContext& expr, ir::BasicBlock* true_bb,
-                     ir::BasicBlock* false_bb);
-  void EmitLAndBranch(SysYParser::LAndExpContext& expr, ir::BasicBlock* true_bb,
-                      ir::BasicBlock* false_bb);
-  void EmitEqBranch(SysYParser::EqExpContext& expr, ir::BasicBlock* true_bb,
-                    ir::BasicBlock* false_bb);
-  void EmitRelBranch(SysYParser::RelExpContext& expr, ir::BasicBlock* true_bb,
-                     ir::BasicBlock* false_bb);
-  ir::Value* EvalEqValue(SysYParser::EqExpContext& expr);
-  ir::Value* EvalRelValue(SysYParser::RelExpContext& expr);
-
-  ir::Module& module_;
-  const SemanticContext& sema_;
-  ir::Function* func_;
-  ir::IRBuilder builder_;
-  std::unordered_map<std::string, ir::Function*> function_map_;
-  std::unordered_map<std::string, int> const_value_map_;
-  std::vector<std::unordered_map<std::string, int>> local_const_stack_;
-  std::vector<std::unordered_map<std::string, int>> const_value_history_;
-  std::unordered_map<SysYParser::VarDefContext*, std::vector<int>>
-      array_extents_map_;
-    std::unordered_map<SysYParser::ConstDefContext*, std::vector<int>>
-      const_array_extents_map_;
-  std::unordered_map<std::string, std::vector<int>> param_array_extents_map_;
-  std::unordered_map<std::string, ir::Value*> param_storage_map_;
-  std::unordered_map<std::string, ir::Value*> param_pointer_map_;
-  std::unordered_map<SysYParser::VarDefContext*, ir::Value*> global_storage_map_;
-    std::unordered_map<SysYParser::ConstDefContext*, ir::Value*>
-      const_global_storage_map_;
-  // 名称绑定由 Sema 负责；IRGen 只维护“声明 -> 存储槽位”的代码生成状态。
-  std::unordered_map<SysYParser::VarDefContext*, ir::Value*> storage_map_;
-    std::unordered_map<SysYParser::ConstDefContext*, ir::Value*>
-      const_storage_map_;
-  std::vector<std::pair<ir::BasicBlock*, ir::BasicBlock*>> loop_stack_;
-  int block_index_ = 0;
-};
-
-std::unique_ptr<ir::Module> GenerateIR(SysYParser::CompUnitContext& tree,
-                                       const SemanticContext& sema);
diff --git a/include/mir/MIR.h b/include/mir/MIR.h
deleted file mode 100644
index dabbd02c..00000000
--- a/include/mir/MIR.h
+++ /dev/null
@@ -1,414 +0,0 @@
-#pragma once
-
-#include <initializer_list>
-#include <iosfwd>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace ir
-{
-  class Module;
-}
-
-namespace mir
-{
-
-  class MIRContext
-  {
-  public:
-    MIRContext() = default;
-  };
-
-  MIRContext &DefaultContext();
-
-  enum class PhysReg
-  {
-    W0,
-    W1,
-    W2,
-    W3,
-    W4,
-    W5,
-    W6,
-    W7,
-    W8,
-    W9,
-    W10,
-    W11,
-    W12,
-    W13,
-    W14,
-    W15,
-    W16,
-    W17,
-    W18,
-    W19,
-    W20,
-    W21,
-    W22,
-    W23,
-    W24,
-    W25,
-    W26,
-    W27,
-    W28,
-    W29,
-    W30,
-    X0,
-    X1,
-    X2,
-    X3,
-    X4,
-    X5,
-    X6,
-    X7,
-    X8,
-    X9,
-    X10,
-    X11,
-    X12,
-    X13,
-    X14,
-    X15,
-    X16,
-    X17,
-    X18,
-    X19,
-    X20,
-    X21,
-    X22,
-    X23,
-    X24,
-    X25,
-    X26,
-    X27,
-    X28,
-    X29,
-    X30,
-    S0,
-    S1,
-    S2,
-    S3,
-    S4,
-    S5,
-    S6,
-    S7,
-    S8,
-    S9,
-    S10,
-    S11,
-    S12,
-    S13,
-    S14,
-    S15,
-    S16,
-    S17,
-    S18,
-    S19,
-    S20,
-    S21,
-    S22,
-    S23,
-    S24,
-    S25,
-    S26,
-    S27,
-    S28,
-    S29,
-    S30,
-    S31,
-    XZR,
-    SP,
-    WZR
-  };
-
-  const char *PhysRegName(PhysReg reg);
-
-  enum class VRegClass
-  {
-    Int,
-    Float,
-    Ptr
-  };
-
-  enum class Opcode
-  {
-    Prologue,
-    Epilogue,
-    MovImm,
-    LoadStack,
-    StoreStack,
-    LoadStackAddr,
-    LoadGlobal,
-    StoreGlobal,
-    LoadGlobalAddr,
-    LoadMem,
-    StoreMem,
-    AddRR,
-    SubRR,
-    MulRR,
-    DivRR,
-    ModRR,
-    AndRR,
-    OrRR,
-    XorRR,
-    ShlRR,
-    ShrRR,
-    AsrRR,
-    Asr64RR,
-    Uxtw,
-    Sxtw,
-    CmpRR,
-    CmpImm,
-    FCmpRR,
-    CSet,
-    Csel,
-    Smull,
-    Msub,
-    NegRR,
-    FAddRR,
-    FSubRR,
-    FMulRR,
-    FDivRR,
-    Scvtf,
-    FCvtzs,
-    FMovWS,
-    Br,
-    CondBr,
-    Call,
-    Ret,
-    LoadAddr,
-    MovReg,
-  };
-
-  enum class CondCode
-  {
-    EQ,
-    NE,
-    LT,
-    LE,
-    GT,
-    GE
-  };
-
-  class Operand
-  {
-  public:
-    enum class Kind
-    {
-      Reg,
-      VReg,
-      Imm,
-      FrameIndex,
-      Label,
-      Symbol
-    };
-
-    static Operand Reg(PhysReg reg);
-    static Operand VReg(int id, VRegClass vreg_class);
-    static Operand Imm(int value);
-    static Operand FrameIndex(int index);
-    static Operand Label(int label_id);
-    static Operand Symbol(std::string symbol);
-
-    Kind GetKind() const { return kind_; }
-    PhysReg GetReg() const { return reg_; }
-    int GetImm() const { return imm_; }
-    int GetFrameIndex() const { return imm_; }
-    int GetLabel() const { return imm_; }
-    const std::string &GetSymbol() const { return symbol_; }
-    int GetVRegId() const { return imm_; }
-    VRegClass GetVRegClass() const { return vreg_class_; }
-
-  private:
-    Operand(Kind kind, PhysReg reg, int imm,
-            VRegClass vreg_class = VRegClass::Int, std::string symbol = "");
-
-    Kind kind_;
-    PhysReg reg_;
-    int imm_;
-    std::string symbol_;
-    VRegClass vreg_class_;
-  };
-
-  class MachineInstr
-  {
-  public:
-    MachineInstr(Opcode opcode, std::vector<Operand> operands = {});
-
-    Opcode GetOpcode() const { return opcode_; }
-    const std::vector<Operand> &GetOperands() const { return operands_; }
-    std::vector<Operand> &GetOperands() { return operands_; }
-
-  private:
-    Opcode opcode_;
-    std::vector<Operand> operands_;
-  };
-
-  struct FrameSlot
-  {
-    int index = 0;
-    int size = 4;
-    int offset = 0;
-    bool is_stack_arg = false;
-    bool is_callee_stack_arg = false;
-  };
-
-  class MachineBasicBlock
-  {
-  public:
-    explicit MachineBasicBlock(std::string name, int label_id = -1);
-
-    const std::string &GetName() const { return name_; }
-    int GetLabelId() const { return label_id_; }
-    void SetLabelId(int label_id) { label_id_ = label_id; }
-
-    std::vector<MachineInstr> &GetInstructions() { return instructions_; }
-    const std::vector<MachineInstr> &GetInstructions() const { return instructions_; }
-
-    MachineInstr &Append(Opcode opcode,
-                         std::initializer_list<Operand> operands = {});
-
-  private:
-    std::string name_;
-    int label_id_ = -1;
-    std::vector<MachineInstr> instructions_;
-  };
-
-  class MachineFunction
-  {
-  public:
-    explicit MachineFunction(std::string name);
-
-    const std::string &GetName() const { return name_; }
-
-    MachineBasicBlock &GetEntry() { return *entry_; }
-    const MachineBasicBlock &GetEntry() const { return *entry_; }
-
-    MachineBasicBlock *GetEntryPtr() { return entry_; }
-    const MachineBasicBlock *GetEntryPtr() const { return entry_; }
-
-    MachineBasicBlock &CreateBlock(std::string name);
-    MachineBasicBlock *FindBlock(const std::string &name);
-    const MachineBasicBlock *FindBlock(const std::string &name) const;
-
-    std::vector<std::unique_ptr<MachineBasicBlock>> &GetBlocks()
-    {
-      return blocks_;
-    }
-    const std::vector<std::unique_ptr<MachineBasicBlock>> &GetBlocks() const
-    {
-      return blocks_;
-    }
-
-    int CreateLabel();
-
-    int CreateFrameIndex(int size = 4);
-    int CreateStackArgFrameIndex(int size = 4);
-    int CreateCalleeStackArgFrameIndex(int size = 4);
-    FrameSlot &GetFrameSlot(int index);
-    const FrameSlot &GetFrameSlot(int index) const;
-    const std::vector<FrameSlot> &GetFrameSlots() const { return frame_slots_; }
-    std::vector<FrameSlot> &GetFrameSlots() { return frame_slots_; }
-
-    int GetFrameSize() const { return frame_size_; }
-    void SetFrameSize(int size) { frame_size_ = size; }
-
-    int CreateVReg(VRegClass vreg_class);
-    VRegClass GetVRegClass(int vreg_id) const;
-    int GetNumVRegs() const { return static_cast<int>(vreg_classes_.size()); }
-
-    void AddCalleeSavedReg(PhysReg reg);
-    const std::vector<PhysReg> &GetCalleeSavedRegs() const { return callee_saved_regs_; }
-
-  private:
-    std::string name_;
-    std::vector<std::unique_ptr<MachineBasicBlock>> blocks_;
-    MachineBasicBlock *entry_ = nullptr;
-
-    std::vector<FrameSlot> frame_slots_;
-    int frame_size_ = 0;
-    int next_label_id_ = 0;
-
-    std::vector<VRegClass> vreg_classes_;
-    std::vector<PhysReg> callee_saved_regs_;
-  };
-
-  struct MachineGlobal
-  {
-    enum class Kind
-    {
-      I32Scalar,
-      I32Array
-    };
-
-    std::string name;
-    Kind kind = Kind::I32Scalar;
-    int init_value = 0;
-    size_t array_size = 0;
-    std::vector<int> init_values;
-  };
-
-  class MachineModule
-  {
-  public:
-    MachineModule() = default;
-
-    MachineFunction &CreateFunction(std::string name);
-    MachineFunction *GetFunction(const std::string &name);
-    const MachineFunction *GetFunction(const std::string &name) const;
-
-    std::vector<std::unique_ptr<MachineFunction>> &GetFunctions()
-    {
-      return functions_;
-    }
-    const std::vector<std::unique_ptr<MachineFunction>> &GetFunctions() const
-    {
-      return functions_;
-    }
-
-    void AddGlobalI32(std::string name, int init_value)
-    {
-      MachineGlobal g;
-      g.name = std::move(name);
-      g.kind = MachineGlobal::Kind::I32Scalar;
-      g.init_value = init_value;
-      globals_.push_back(std::move(g));
-    }
-
-    void AddGlobalArrayI32(std::string name, size_t array_size,
-                           std::vector<int> init_values = {})
-    {
-      MachineGlobal g;
-      g.name = std::move(name);
-      g.kind = MachineGlobal::Kind::I32Array;
-      g.array_size = array_size;
-      g.init_values = std::move(init_values);
-      globals_.push_back(std::move(g));
-    }
-
-    std::vector<MachineGlobal> &GetGlobals() { return globals_; }
-    const std::vector<MachineGlobal> &GetGlobals() const { return globals_; }
-
-  private:
-    std::vector<std::unique_ptr<MachineFunction>> functions_;
-    std::vector<MachineGlobal> globals_;
-  };
-
-  std::unique_ptr<MachineModule> LowerModuleToMIR(const ir::Module &module);
-  std::unique_ptr<MachineFunction> LowerToMIR(const ir::Module &module);
-
-  void RunRegAlloc(MachineFunction &function);
-  void RunRegAlloc(MachineModule &module);
-
-  void RunFrameLowering(MachineFunction &function);
-  void RunFrameLowering(MachineModule &module);
-
-  void RunPeephole(MachineFunction &function);
-  void RunPeephole(MachineModule &module);
-
-  void PrintAsm(const MachineFunction &function, std::ostream &os);
-  void PrintAsm(const MachineModule &module, std::ostream &os);
-
-} // namespace mir
diff --git a/include/sem/Sema.h b/include/sem/Sema.h
deleted file mode 100644
index 5a677fd0..00000000
--- a/include/sem/Sema.h
+++ /dev/null
@@ -1,92 +0,0 @@
-// 基于语法树的语义检查与名称绑定。
-#pragma once
-
-#include <unordered_map>
-
-#include "SysYParser.h"
-
-class SemanticContext {
- public:
-  void BindVarUse(SysYParser::LValContext* use,
-                  SysYParser::VarDefContext* decl) {
-    var_uses_[use] = decl;
-  }
-
-  SysYParser::VarDefContext* ResolveVarUse(
-      const SysYParser::LValContext* use) const {
-    auto it = var_uses_.find(use);
-    return it == var_uses_.end() ? nullptr : it->second;
-  }
-
-  void BindConstArrayUse(SysYParser::LValContext* use,
-                         SysYParser::ConstDefContext* decl) {
-    const_array_uses_[use] = decl;
-  }
-
-  SysYParser::ConstDefContext* ResolveConstArrayUse(
-      const SysYParser::LValContext* use) const {
-    auto it = const_array_uses_.find(use);
-    return it == const_array_uses_.end() ? nullptr : it->second;
-  }
-
-  void BindConstScalarUse(SysYParser::LValContext* use,
-                          SysYParser::ConstDefContext* decl) {
-    const_scalar_uses_[use] = decl;
-  }
-
-  SysYParser::ConstDefContext* ResolveConstScalarUse(
-      const SysYParser::LValContext* use) const {
-    auto it = const_scalar_uses_.find(use);
-    return it == const_scalar_uses_.end() ? nullptr : it->second;
-  }
-
-  void BindConstUse(SysYParser::LValContext* use, int value) {
-    const_uses_[use] = value;
-  }
-
-  const int* ResolveConstUse(const SysYParser::LValContext* use) const {
-    auto it = const_uses_.find(use);
-    return it == const_uses_.end() ? nullptr : &it->second;
-  }
-
-  void BindConstFloatUse(SysYParser::LValContext* use, double value) {
-    const_float_uses_[use] = value;
-  }
-
-  const double* ResolveConstFloatUse(const SysYParser::LValContext* use) const {
-    auto it = const_float_uses_.find(use);
-    return it == const_float_uses_.end() ? nullptr : &it->second;
-  }
-
-  void BindCallUse(SysYParser::UnaryExpContext* call,
-                   SysYParser::FuncDefContext* decl) {
-    call_uses_[call] = decl;
-  }
-
-  SysYParser::FuncDefContext* ResolveCallUse(
-      const SysYParser::UnaryExpContext* call) const {
-    auto it = call_uses_.find(call);
-    return it == call_uses_.end() ? nullptr : it->second;
-  }
-
- private:
-  std::unordered_map<const SysYParser::LValContext*,
-                     SysYParser::VarDefContext*>
-      var_uses_;
-  std::unordered_map<const SysYParser::LValContext*, int> const_uses_;
-  std::unordered_map<const SysYParser::LValContext*, double> const_float_uses_;
-  std::unordered_map<const SysYParser::LValContext*,
-                     SysYParser::ConstDefContext*>
-      const_array_uses_;
-  std::unordered_map<const SysYParser::LValContext*,
-                     SysYParser::ConstDefContext*>
-      const_scalar_uses_;
-  std::unordered_map<const SysYParser::UnaryExpContext*,
-                     SysYParser::FuncDefContext*>
-      call_uses_;
-};
-
-// 目前仅检查：
-// - 变量先声明后使用
-// - 局部变量不允许重复定义
-SemanticContext RunSema(SysYParser::CompUnitContext& comp_unit);
diff --git a/include/sem/SymbolTable.h b/include/sem/SymbolTable.h
deleted file mode 100644
index 61275509..00000000
--- a/include/sem/SymbolTable.h
+++ /dev/null
@@ -1,22 +0,0 @@
-// 极简符号表：记录局部变量定义点。
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "SysYParser.h"
-
-class SymbolTable {
- public:
-  void EnterScope();
-  void ExitScope();
-  void Add(const std::string& name, SysYParser::VarDefContext* decl);
-  bool ContainsInCurrent(const std::string& name) const;
-  bool Contains(const std::string& name) const;
-  SysYParser::VarDefContext* Lookup(const std::string& name) const;
-
- private:
-  std::vector<std::unordered_map<std::string, SysYParser::VarDefContext*>>
-      scopes_;
-};
diff --git a/include/utils/CLI.h b/include/utils/CLI.h
deleted file mode 100644
index 4c184a4a..00000000
--- a/include/utils/CLI.h
+++ /dev/null
@@ -1,15 +0,0 @@
-// 简易命令行解析：支持帮助、输入文件与输出阶段选择。
-#pragma once
-
-#include <string>
-
-struct CLIOptions {
-  std::string input;
-  bool emit_parse_tree = false;
-  bool emit_ir = true;
-  bool emit_asm = false;
-  bool show_help = false;
-  bool optimize = false;
-};
-
-CLIOptions ParseCLI(int argc, char** argv);
diff --git a/include/utils/Log.h b/include/utils/Log.h
deleted file mode 100644
index 303f1a11..00000000
--- a/include/utils/Log.h
+++ /dev/null
@@ -1,20 +0,0 @@
-// 轻量日志接口。
-#pragma once
-
-#include <cstddef>
-#include <exception>
-#include <iosfwd>
-#include <string>
-#include <string_view>
-
-void LogInfo(std::string_view msg, std::ostream& os);
-void LogError(std::string_view msg, std::ostream& os);
-
-std::string FormatError(std::string_view stage, std::string_view msg);
-std::string FormatErrorAt(std::string_view stage, std::size_t line,
-                          std::size_t column, std::string_view msg);
-bool HasErrorPrefix(std::string_view msg, std::string_view stage);
-void PrintException(std::ostream& os, const std::exception& ex);
-
-// 打印命令行帮助信息（用于 `compiler --help`）。
-void PrintHelp(std::ostream& os);
diff --git a/optimization-designs/.gitkeep b/optimization-designs/.gitkeep
new file mode 100644
index 00000000..63a139fc
--- /dev/null
+++ b/optimization-designs/.gitkeep
@@ -0,0 +1 @@
+所有优化设计文档存档目录。
diff --git a/optimization-designs/00-总览-优化全景.md b/optimization-designs/00-总览-优化全景.md
new file mode 100644
index 00000000..80b1f8a7
--- /dev/null
+++ b/optimization-designs/00-总览-优化全景.md
@@ -0,0 +1,82 @@
+# 编译器优化全景
+
+## 编译管线与 pass 顺序
+
+```
+SysY 源码
+  │
+  ▼
+ANTLR 语法树 ──→ 语义分析 ──→ IR 生成
+  │
+  ▼ (仅 -O)
+IR Pass 管线:
+  1. Mem2Reg           ← SSA 构造，alloca/load/store → φ+SSA
+  2. IfConversion       ← if-else diamond → 算术 select，循环体单 BB 化
+  3. CFGSimplify        ← 不可达块消除、常量分支折叠
+  4. LoopUnroll         ← 简单 countdown 循环全展开
+  5. Inline              ← 保守内联：leaf 单 BB 函数迭代内联
+  6. LICM               ← （空桩，未实现）
+  7. LoopVectorize      ← NEON SIMD 自动向量化 (VF=4, <4×i32>)
+  8. 迭代至不动点: ConstFold → ConstProp → CFGSimplify → CSE → DCE
+  │
+  ▼
+IR → MIR 降级 (Lowering)
+  │
+  ▼
+MIR Pass 管线:
+  1. MIRCleanup         ← MovImm 转发
+  2. TwoAddressOpt      ← 操作数交换，减少 copy 需求
+  3. CopyPropagation    ← 死副本/自复制/前向后向传播/副本链折叠/StoreLoad 折叠
+  4. RegisterCoalescer  ← 合并 copy-connected vreg（LiveIntervals 干涉检查）
+  5. RegAlloc            ← 贪心图着色 + spill（MAX_SPILL_ROUNDS=1）
+  6. FrameLowering       ← 栈帧分配
+  7. BlockLayout         ← Pettis-Hansen 基本块重排序
+  8. Peephole            ← 局部指令优化 + fallthrough 消除
+  │
+  ▼
+AArch64 汇编 (AsmPrinter)
+```
+
+## 优化统计
+
+| 层级 | 已实现 | 未实现/空桩 |
+|------|--------|------------|
+| 第 1 层（算法策略） | 贪心寄存器分配 + spill slot 共享 | — |
+| 第 2 层（管线架构） | Mem2Reg, LoopVectorize, Inline | **LICM**（空桩）、GVN、SCCP、LoopUnswitch |
+| 第 3 层（跨 pass 协同） | IfConversion→LoopUnroll→Inline 联动 | DCE 后不自动触发 CFGSimplify |
+| 第 4 层（单 pass 算法） | CFGSimplify, ConstFold, ConstProp, CSE, DCE, CopyProp, Coalescer, BlockLayout | — |
+| 第 5 层（窥孔/局部） | Peephole(10+模式), AddImm/SubImm, CmpImm, sdiv, 叶函数帧, ADRP 缓存, Movz | — |
+
+## 性能数据总览
+
+以下是指令数基线（`指令数基线.json`）中每个用例的最小指令数（-O 优化后）：
+
+| 用例 | 指令数 | 用例 | 指令数 |
+|------|--------|------|--------|
+| mm1/mm2/mm3 | 277 | fft0-2 | 558 |
+| sort1-3 | 541 | h-1-01-03 | 149 |
+| conv2d-1-3 | 571 | h-4-01-03 | 158 |
+| crc1-3 | 242 | h-5-01-03 | 283 |
+| crypto-1-3 | 1437 | h-8-01-03 | 327 |
+| huffman-01-03 | 694 | h-9-01-03 | 197 |
+| matmul1-3 | 323 | h-10-01-03 | 272 |
+| many_mat_cal-1-3 | 355 | shuffle0-2 | 368 |
+| knapsack-1-3 | 165 | sl1-3 | 233 |
+| transpose0-2 | 178 | opt_scheduling-1-3 | 110 |
+
+## 各优化累计效果
+
+根据优化记录，每条优化的指令数削减（全量 performance 测试集累计）：
+
+| 优化 | 累计削减 | 层级 |
+|------|---------|------|
+| MAX_SPILL_ROUNDS 缩减 + Spill Slot 共享 | ~273,000（mm 系列） | 第 1/5 层 |
+| 除法改用 sdiv | -735 | 第 5 层 |
+| 叶函数帧优化 | -312 | 第 5 层 |
+| ADRP 冗余消除 | -135 | 第 5 层 |
+| CmpImm 常量折叠 | -91 | 第 5 层 |
+| AddImm/SubImm | -55 | 第 5 层 |
+| Movz #0 优化 | -33 | 第 5 层 |
+| 全局变量 Peephole | -15 | 第 5 层 |
+
+> 注：IR pass 的效果未单独测量。Mem2Reg 是所有后续优化的前提；IfConversion + LoopUnroll + Inline 使小函数变为单 BB 并内联，减少 call/ret 开销；LoopVectorize 提供 4× 吞吐量提升但指令数不一定减少。
diff --git a/optimization-designs/01-IR优化-Mem2Reg与SSA构造.md b/optimization-designs/01-IR优化-Mem2Reg与SSA构造.md
new file mode 100644
index 00000000..ab4e7aae
--- /dev/null
+++ b/optimization-designs/01-IR优化-Mem2Reg与SSA构造.md
@@ -0,0 +1,48 @@
+# Mem2Reg：SSA 构造
+
+- **层级**：第 2 层（编译管线架构改进）
+- **文件**：`src/ir/passes/Mem2Reg.cpp` (800 行)
+- **类型**：IR
+
+## 做什么
+
+将局部变量的 alloca/load/store 提升为标准 SSA 形式，使用支配边界 + PHI 节点插入算法。
+
+```
+优化前（栈变量）：                  优化后（SSA）：
+  %p = alloca i32                    （alloca 消除）
+  store i32 42, %p          →        %v1 = 42
+  %x = load i32, %p                  %x 的使用直接替换为 %v1
+```
+
+## 怎么实现
+
+1. **找可提升的 alloca**：筛选仅被 Load/Store 使用、非数组类型的 alloca
+2. **计算支配者**：迭代数据流算法（Intersect-based），最多 1000 轮
+3. **计算支配边界**：标准支配边界算法
+4. **插入 PHI 节点**：对每个 alloca，在所有支配边界块插入 PHI
+5. **Rename（重命名）**：支配树上前序遍历，维护值栈；store 推栈，load 替换，回退时弹栈
+6. **删除冗余**：移除原始 Load/Store/Alloca 指令
+
+## 安全门禁
+
+三处安全阈值防止编译超时或错误：
+
+1. **大函数跳过**：>2000 基本块的函数跳过（避免支配者计算超时）
+2. **多 alloca 跳过**：>24 个 promotable alloca 跳过（避免大参数函数 SSA 构造错误——来自 `87_many_params` 的 bug 修复）
+3. **PHI 过多跳过**：PHI 数量 > max(100, block_count×2) 时跳过（启发式阈值）
+
+## 实际效果
+
+Mem2Reg 是所有后续优化的前提条件。没有 SSA 形式，ConstProp、CSE 等无法工作。具体来说：
+- 消除了所有局部标量变量的栈分配，转为 vreg
+- 为 ConstProp 暴露了常量传播路径
+- 为 CSE 暴露了公共子表达式
+- 减少了 Load/Store 指令数（栈访问 → 寄存器访问）
+
+## 已知局限
+
+1. **不处理数组 alloca**：`IsArrayAlloca()` 返回 true 的 alloca 不提升。这意味着数组访问仍走栈
+2. **安全门禁可能过宽**：>24 alloca 的函数被完全跳过，但这些函数可能包含大量可优化的栈变量
+3. **不处理部分提升**：要么全部提升，要么全部不提升。不能部分提升（例如，一个 alloca 的部分 use 被地址取用，其余可以提升）
+4. **无 PromoteMemToReg 的扩展**：不处理 GEP+Load/Store 模式（部分数组访问也可提升）
diff --git a/optimization-designs/02-IR优化-循环优化.md b/optimization-designs/02-IR优化-循环优化.md
new file mode 100644
index 00000000..4461dcd0
--- /dev/null
+++ b/optimization-designs/02-IR优化-循环优化.md
@@ -0,0 +1,85 @@
+# 循环优化：IfConversion + LoopUnroll + Inline 联动
+
+- **层级**：第 3 层（跨 pass 协同）
+- **文件**：`IfConversion.cpp` (284 行), `LoopUnroll.cpp` (345 行), `Inline.cpp` (308 行)
+- **类型**：IR
+
+## 设计思路
+
+三个 pass 形成联动管道：**IfConversion 使循环体变单 BB → LoopUnroll 全展开 → Inline 将展开后的单 BB 函数内联到调用者**。
+
+## IfConversion
+
+### 做什么
+
+将简单 if-else diamond 转换为算术 select：
+
+```
+优化前：                            优化后：
+  br i1 cond, %T, %F                  %zext = zext i1 cond to i32
+T:                                    %diff = sub i32 tv, fv
+  ... pure arith ...                  %masked = mul i32 %diff, %zext
+  br %M                              %select = add i32 fv, %masked
+F:                                    br %M
+  ... no body (fallthrough)          （T 块指令移入 B）
+M:
+  %r = phi [tv, %T], [fv, %B]        %r 的使用替换为 %select
+```
+
+### 安全检查
+- T 块必须只有单一前驱（B）
+- T 块只允许纯算术指令（禁 Div/Mod/浮点/Load/Store/Call）
+- 只处理 i32 类型的 PHI
+- T 块所有指令类型必须是 i32/i1/void（浮点运算移入无条件块会改变语义）
+
+### 联动价值
+将含 if-else 的循环体变为单 BB → 可被 LoopUnroll 展开 → 展开后函数单 BB → 可被 Inline 内联
+
+## LoopUnroll
+
+### 做什么
+
+识别形如 `while (len) { body; len = len - 1; }` 的递减循环，完全展开。
+
+### 检测模式
+- header 中有 `phi(init, latch_val)`，其中一个来源是循环外部（init），另一个是 body
+- latch_val = `sub phi, 1`
+- 退出条件：`cmp phi, 0` + `condbr`
+
+### 实现要点
+- 展开上限：trip_count ≤ 64
+- 成本阈值：`(BodySize - 1) × TripCount + 1 ≤ 150`
+- 多 phi 追踪：非归纳变量的 phi 也追踪（跨迭代值转发）
+- 展开后合并到 preheader 使函数变为单 BB
+- 仅处理 i32 返回值函数（float 循环体含不支持克隆的操作）
+
+## Inline
+
+### 做什么
+
+自底向上迭代内联：每次只内联 leaf（无 call）、单基本块的函数。
+
+### 实现要点
+- **可内联条件**：单 BB、无 Call、无 Load/Store/GEP、无数组 alloca、以 Ret 结尾
+- **If-else 转换**：内联前先将 if-else-return 函数转为 `fv + (tv-fv) × zext(cmp)` 单 BB
+- **迭代收敛**：最多 16 轮，每轮内联后可能产生新 leaf
+- **操作数穿透**：穿透 `icmp ne (zext(X), 0)` 包装，直接使用原始条件
+
+## 实际效果
+
+| 优化 | 效果 |
+|------|------|
+| IfConversion | 使含 if-else 的小循环变为单 BB，为 LoopUnroll 创造条件 |
+| LoopUnroll | 消除循环控制开销（cmp + condbr + phi + sub），暴露更多常量折叠机会 |
+| Inline | 消除 call/ret 开销（参数传递 + 栈帧），使调用者中的常量传播到被调用函数体 |
+
+三个 pass 协同最典型的场景：小工具函数（如 `max`/`min`/`power`）被 if-convert → unroll → inline，最终在调用点完全消解。
+
+## 已知局限
+
+1. **LoopUnroll 只处理递减循环**：递增循环 `for (i=0; i<n; i++)` 不处理
+2. **LoopUnroll 只全展开**：不支持部分展开（即不支持按可配置的 unroll factor 展开）
+3. **Inline 过于保守**：不处理含 Load/Store 的函数，导致大量实用函数无法内联
+4. **Inline 不处理 float 返回值函数**：IfElseToSelect 只支持 i32
+5. **Inline 无 CallSite 成本模型**：所有符合条件的 call 一律内联，可能代码膨胀
+6. **无 LICM**（空桩）：循环不变量无法提升到循环外，导致 LoopVectorize 可能向量化含不变量的循环
diff --git a/optimization-designs/03-IR优化-NEON自动向量化.md b/optimization-designs/03-IR优化-NEON自动向量化.md
new file mode 100644
index 00000000..b6d4510e
--- /dev/null
+++ b/optimization-designs/03-IR优化-NEON自动向量化.md
@@ -0,0 +1,77 @@
+# NEON SIMD 自动向量化
+
+- **层级**：第 2 层（编译管线架构改进）
+- **文件**：`src/ir/passes/LoopVectorize.cpp` (780 行) + `src/mir/Lowering.cpp` (NEON 降级部分)
+- **类型**：IR → MIR 全链路
+
+## 做什么
+
+自动检测可向量化的计数循环，生成 `<4 × i32>` 向量化循环 + 标量残余循环。利用 AArch64 NEON 指令集实现 4 路 SIMD 并行。
+
+## 怎么实现
+
+### IR 层（LoopVectorize）
+
+1. **循环检测**：找 `phi(init, i+step)` + `cmp slt %i, %n` + `condbr` 模式
+2. **可向量化检查**：
+   - 循环体必须是单 BB
+   - 除归纳变量 phi 外无其他 phi（无跨迭代依赖）
+   - 所有指令可向量化（Add/Sub/Mul/Load/Store/GEP）
+   - GEP 索引必须是归纳变量或循环不变量（stride-1 访问）
+   - Load+Store 混合循环直接支持向量 store
+   - Store-only 循环检查存储值（归纳变量/常量/不变量 OK）
+3. **向量循环生成**（VF=4）：
+   - 计算向量化上界：`n_rounded = n - (n % 4)`
+   - 创建 vec_header + vec_body：归纳变量步进 4
+   - Load → `<4 × i32>` 向量加载；Store 按存储值类型决定向量/标量展开
+4. **标量残余循环**：处理 `n % 4` 个剩余迭代
+
+### MIR 层（Lowering 降级）
+
+新增 8 个 NEON 操作码：
+
+| MIR 操作码 | AArch64 指令 | 语义 |
+|-----------|-------------|------|
+| `LdrQ` | `ldr qD, [xN, #off]` | 128-bit 向量加载 |
+| `StrQ` | `str qD, [xN, #off]` | 128-bit 向量存储 |
+| `AddV4s` | `add vD.4s, vA.4s, vB.4s` | 4×i32 向量加法 |
+| `SubV4s` | `sub vD.4s, vA.4s, vB.4s` | 4×i32 向量减法 |
+| `MulV4s` | `mul vD.4s, vA.4s, vB.4s` | 4×i32 向量乘法 |
+| `DupV4s` | `dup vD.4s, wA` | 标量广播到向量 |
+| `MovVS` | `mov wD, sA` | 向量→标量 |
+| `MovSV` | `mov sD, wA` | 标量→向量 |
+
+向量寄存器类：`VRegClass::Vec` → PhysReg Q0-Q31（可分配 24 个）
+
+### 寄存器分配
+
+- Vec 类 vreg 分配 Q0-Q31 物理寄存器
+- 24 个可分配：q0-q7 与 q16-q31（排除 q8-q15，推测是为避开 callee-saved 寄存器，待确认；可分配的寄存器实际全部视为 caller-saved）
+- 分配策略与 GP/FP 独立，三类寄存器不干涉（待确认：Float 可分配的 s16-s31 与 Vec 可分配的 q16-q31 是同一组物理寄存器的不同视图，二者同时活跃时并非互不干涉）
+
+## 实际效果
+
+### 指令数效果
+
+来自优化记录，以下用例有明显指令数削减（待复核：crypto/huffman/crc/fft 四项数值与“除法改用 sdiv”一节记录的数值完全相同，归因可能有误）：
+- crypto：-249 条（-4.4%）
+- huffman：-186 条（-8.9%）
+- crc：-84 条（-10.4%）
+- fft：-72 条（-4.1%）
+- h-9：-42 条（-6.6%）
+- many_mat_cal：-24 条（-1.8%）
+
+### 性能收益
+
+根据 NEON 向量化记录（`project_neon_vectorization.md`）：-11% ~ -28% 性能提升（指令数减少 + 4× 数据并行）。
+
+## 已知局限
+
+1. **仅 i32**：不支持 i8/i16/i64/float NEON 向量化
+2. **仅 stride-1**：不支持 stride-N 访问或 gather/scatter
+3. **仅 Add/Sub/Mul**：不支持向量化 Div/Mod/移位/逻辑操作
+4. **无归约支持**：循环中有累加器 phi 的立即拒绝（`CanVectorizeLoop` 只要有额外 phi 就返回 false）
+5. **仅单 BB 循环体**：含 if-else 的循环无法向量化（但 IfConversion 可以先将一些转为单 BB）
+6. **无对齐分析**：不检查数组是否 128-bit 对齐
+7. **无代价模型**：不评估向量化是否有收益，只要模式匹配就向量化
+8. **LdrQ/StrQ 偏移有限**：NEON 寻址模式支持有限偏移，复杂地址需要 Uxtw+Shl+Add 预计算
diff --git a/optimization-designs/04-IR优化-标量优化Pass.md b/optimization-designs/04-IR优化-标量优化Pass.md
new file mode 100644
index 00000000..168c44ba
--- /dev/null
+++ b/optimization-designs/04-IR优化-标量优化Pass.md
@@ -0,0 +1,101 @@
+# IR 标量优化 Pass：ConstFold + ConstProp + CSE + DCE + CFGSimplify
+
+- **层级**：第 4 层（单 pass 算法）
+- **文件**：`ConstFold.cpp` (185 行), `ConstProp.cpp` (231 行), `CSE.cpp` (170 行), `DCE.cpp` (188 行), `CFGSimplify.cpp` (271 行)
+- **类型**：IR
+
+## Pass 流水线
+
+```
+ConstFold → ConstProp → CFGSimplify → CSE → DCE → （循环迭代至不动点）
+```
+
+一个 pass 的变换可能暴露另一个 pass 的机会。迭代执行直到所有 pass 都不再产生变化。
+
+## ConstFold：常量折叠
+
+### 做什么
+折叠编译时可判定的常量表达式。
+
+### 支持的折叠
+
+| 操作 | 整数 | 浮点 |
+|------|------|------|
+| Add/Sub/Mul | ✓ | ✓ |
+| Div/Mod | ✓（含除零/INT_MIN/-1 保护） | ✓（除零保护） |
+| Eq/Ne/Lt/Le/Gt/Ge | ✓ | ✓ |
+| SIToFP | ✓ int→float | — |
+| FPToSI | — | ✓ float→int（含范围/NaN 保护） |
+| ZExt | 跳过（破坏类型正确性） | — |
+
+### 实现
+- 对每条 BinaryInst：两个操作数都是常量 → 计算常量结果 → ReplaceAllUsesWith 常量
+- 对每条 CastInst：操作数是常量 → 折叠
+- 跳过向量类型指令（无处理路径）
+- 跳过 PHI 和终止指令
+
+## ConstProp：常量传播
+
+### 做什么
+
+沿 use-def 关系传播已知常量，将可替换的 SSA 值改写为常量。
+
+### 三个子 pass
+
+1. **PHI 常量传播**：若所有入边都是同一常量 → 用该常量替换 PHI
+2. **冗余 PHI 简化**：若所有入边都是同一个值（不一定是常量）→ 用该值替换 PHI
+   - 例如 `phi [%x, %bb1], [%x, %bb2], [%x, %bb3]` → 替换为 `%x`
+3. **常量指令收集**：标记所有操作数都是常量的指令（由 ConstFold 实际折叠）
+
+## CSE：公共子表达式消除
+
+### 做什么
+在同一基本块内识别并复用重复计算的等价表达式。
+
+### 实现
+- 哈希表键：`(Opcode, [Operand1, Operand2, ...])`
+- 候选指令：BinaryInst、Load、GEP
+- Store 感知缓存失效：Store 到某地址 → 失效该地址的所有 Load 缓存
+- **alloca 数量门禁**：>24 个 alloca 的函数跳过 Load/GEP 的 CSE（避免 SSA 化不充分的函数产生错误消除）
+
+## DCE：死代码删除
+
+### 做什么
+标记-清扫式死代码删除，含 Dead Store Elimination。
+
+### 实现
+1. **种子标记**：所有终止指令和 Call 指令为 live
+2. **反向传播**：live 指令的操作数指令标记为 live
+3. **Load→Store 关联**：有 live Load 的 alloca → 其所有 Store 标记 live
+4. **清扫**：删除所有未被标记的指令
+
+特殊处理：向量类型指令跳过标量优化（不做 use-chain 追踪，但也不删除）。
+
+## CFGSimplify：控制流简化
+
+### 做什么
+清理死代码和冗余控制流。
+
+### 四个子 pass
+
+1. **不可达块消除**：BFS 从入口标记可达块，删除不可达块
+2. **PHI 前驱清理**：删除 PHI 中引用已移除前驱的条目
+3. **常量分支折叠**：`condbr ConstantInt, T, F` → `br live_target`，清理 dead target 的 PHI
+4. **单前驱块 PHI 消除**：只有一个前驱的块的 PHI 用入边值替换
+
+## 实际效果
+
+五个 pass 迭代执行，消除 IR 生成器产生的冗余代码。各 pass 互相暴露优化机会：
+- ConstFold 折叠常量 → 暴露死代码 → DCE 清理
+- ConstProp 传播常量到使用点 → ConstFold 折叠新的常量表达式
+- CSE 消除重复计算 → DCE 清理不再使用的指令
+- CFGSimplify 简化控制流 → 减少块数 → 其他 pass 更高效
+
+## 已知局限
+
+1. **CSE 仅块内**：不跨基本块。真正的 GVN 需要支配树上的值编号
+2. **ConstProp 无 SCCP**：不结合分支条件做稀疏条件常量传播。例如 `if (x == 5) { ... }` 中无法传播 `x=5` 到 then 分支
+3. **ConstFold 不处理向量类型**：向量化产生的 `<4 × i32>` 常量表达式不被折叠
+4. **DCE 无 Aggressive DCE**：不删除对死 alloca 的 Store（在 Mem2Reg 之后这通常不是问题）
+5. **CFGSimplify 不合并等价块**：两个内容相同的块不做尾合并（tail merging）
+6. **迭代无上限保护**：理论上可能无限迭代（虽然实际罕见）
diff --git a/optimization-designs/05-MIR优化-降级时优化.md b/optimization-designs/05-MIR优化-降级时优化.md
new file mode 100644
index 00000000..2ce3fed4
--- /dev/null
+++ b/optimization-designs/05-MIR优化-降级时优化.md
@@ -0,0 +1,92 @@
+# MIR 降级时优化：AddImm/SubImm + CmpImm + sdiv + 叶函数帧 + ADRP 缓存 + Movz
+
+- **层级**：第 5 层（局部模式匹配/窥孔）
+- **文件**：`src/mir/Lowering.cpp` (2616 行), `src/mir/AsmPrinter.cpp` (1093 行)
+- **类型**：MIR
+
+## 1. AddImm/SubImm 立即数折叠
+
+### 做什么
+AArch64 add/sub 支持 12 位立即数（0-4095），但 MIR 最初只有 AddRR/SubRR。当 IR 中 RHS 是 0-4095 常量时，直接生成 `add/sub dst, src, #imm`，避免先 `mov #imm` 再 `add/sub`。
+
+### 实现
+- Lowering.cpp：Add/Sub 降级时检测 RHS 是否为 0-4095 常量 → 发射 AddImm/SubImm
+- AsmPrinter.cpp：通用三操作数打印机自动处理 Imm 操作数（输出 `#value` 格式）
+- 指令数效果：-55 条，sl1-3 -14（-5.4%）
+
+### 局限
+- 仅处理直接常量操作数；经 vreg 传递的常量需 ConstProp 配合
+- 仅 0-4095 范围（AArch64 12-bit 立即数限制）
+
+## 2. CmpImm 常量折叠
+
+### 做什么
+ICmp 降级时，若操作数为 0-4095 常量，直接用 `cmp reg, #imm` 替代 `mov #imm; cmp reg, tmp`。
+
+### 实现
+- Lowering.cpp：两个 ICmp 降级路径中检查常量操作数
+- RHS 常量 → CmpImm
+- LHS 常量 → CmpImm + SwapCondCode（18 行辅助函数）
+- 指令数效果：-91 条，matmul -15（-3.8%），huffman -25（-3.1%）
+
+### 局限
+- 仅 0-4095 立即数
+- 浮点比较未覆盖
+
+## 3. 除法改用 sdiv
+
+### 做什么
+2 的幂次除法/取模本来使用移位序列（add bias + cmp + csel + asr = 4-6 条），改用 AArch64 sdiv 指令只需 1-2 条。
+
+### 实现
+- Lowering.cpp：删除了约 150 行的 2 的幂次移位序列代码
+- 所有除法/取模统一走 sdiv 路径
+- ModRR 的 val==1/-1 特例：MovImm #0
+- 指令数效果：-735 条（单条优化最大累计削减）
+  - crypto -249（-4.4%），huffman -186（-8.9%），crc -84（-10.4%），fft -72（-4.1%）
+
+### 局限
+- sdiv 在 Cortex-A53 上延迟 4-12 周期，但 QEMU 不精确模拟流水线，指令数减少足以弥补
+
+## 4. 叶函数帧设置优化
+
+### 做什么
+叶函数（无 Call 指令）不需要保存/恢复 x30（LR 不会被修改）。
+
+### 实现
+- MIR.h：MachineFunction 新增 `has_call_` 字段
+- Lowering.cpp：每次发射 Call 指令时标记 `function.SetHasCall()`
+- AsmPrinter.cpp：Prologue/Epilogue 根据 is_leaf 和 no_frame 条件：
+  - 无帧 + 无 callee-saved → 完全跳过 stp/ldp x29,x30 + mov x29,sp（节省 3 条）
+  - 有帧叶函数 → str/ldr x29 替代 stp/ldp x29,x30
+- 指令数效果：-312 条，huffman -93（-3.9%），crypto -54（-2.8%）
+
+## 5. ADRP 冗余消除
+
+### 做什么
+连续访问同一全局变量时，x13 已持有页面地址，后续 ADRP 冗余。
+
+### 实现
+- AsmPrinter.cpp：ADRP 缓存（`g_cached_adrp_symbol` + `g_adrp_cache_valid`）
+- PrintGlobalAccess 检测同符号命中 → 跳过 ADRP
+- EmitStackAdjust/EmitAddressFromBase 使用 x13 时失效缓存
+- Call 指令失效缓存（x13 caller-saved）
+- 每个基本块入口重置缓存
+- 指令数效果：-135 条，shuffle -48（-3.4%），crypto -27（-1.4%）
+
+## 6. Movz #0 前导零优化
+
+### 做什么
+32-bit 立即数低 16-bit 为零时，跳过前导 `movz #0`。
+
+```
+优化前: movz w8, #0; movk w8, #2, lsl #16    ; 0x00020000
+优化后: movz w8, #2, lsl #16                   ; 直接移位
+```
+
+### 实现
+- AsmPrinter.cpp EmitLargeImmediate 循环中：`!emitted && part == 0` 时跳过（3 行）
+- 指令数效果：-33 条
+
+### 局限
+- 仅修复 EmitLargeImmediate；EmitStackAdjust/EmitAddressFromBase 中的 movz 模式有同样问题
diff --git a/optimization-designs/06-MIR优化-寄存器分配前优化.md b/optimization-designs/06-MIR优化-寄存器分配前优化.md
new file mode 100644
index 00000000..318263d7
--- /dev/null
+++ b/optimization-designs/06-MIR优化-寄存器分配前优化.md
@@ -0,0 +1,72 @@
+# MIR 寄存器分配前优化：CopyPropagation + Coalescer + TwoAddress
+
+- **层级**：第 4 层（单 pass 算法升级）
+- **文件**：`CopyPropagation.cpp` (301 行), `RegisterCoalescer.cpp` (171 行), `TwoAddress.cpp` (84 行)
+- **类型**：MIR
+
+## CopyPropagation
+
+### 做什么
+在寄存器分配之前操作虚拟寄存器，消除冗余副本。
+
+### 三个子 pass（迭代执行，最多 5 轮）
+
+**Pass 1：死副本 + 自复制消除**
+- 死副本：`MovReg %v, %x`，%v 从未被使用 → 删除
+- 自复制：`MovReg %v, %v` → 删除
+
+**Pass 2：前向/后向传播 + 副本链折叠**
+- 前向传播：`MovReg %v1, %v2; ... use %v1` → use %v2（若 %v2 在 use 点仍活跃）
+- 后向传播：`def %v2; MovReg %v1, %v2` 且 %v2 唯一使用是此 MovReg → 重定向 def 到 %v1
+- 副本链折叠：`MovReg %v1, %v2; MovReg %v3, %v1` → `MovReg %v3, %v2`
+- **关键安全机制**：基于 LiveIntervals 的块级 live_out 种子初始化 `live_after`，确保跨块安全
+- Call 指令保守失效所有活跃副本
+
+**Pass 3：StoreStack+LoadStack 折叠**
+- 同一 slot，中间无其他 store → 替换 LoadStack 为 MovReg
+
+## RegisterCoalescer
+
+### 做什么
+在寄存器分配之前合并 copy-connected 虚拟寄存器。如果两个 vreg 在所有点（除 MovReg 定义外）都不干涉，则可以安全合并。
+
+### 实现
+1. **收集候选**：找出所有全部由 `MovReg %dst, %src` 定义的 vreg（支持多定义，只要全部到同一 src）
+2. **干涉检查**：`LiveIntervals::InterfereExcept(dst, src, {mov_instructions})` — 排除 MovReg 定义点
+3. **合并**：`MachineRegisterInfo::ReplaceAllVRegRefs(function, dst, src)` — 将所有 dst 引用替换为 src
+4. **迭代**：最多 5 轮直到不动点
+
+### 安全约束
+- dst 和 src 必须是同一 VRegClass（Int/Float/Ptr/Vec）
+- dst 的所有定义都必须是 MovReg（不能有计算指令定义 dst）
+- 清理合并后产生的自复制（`MovReg %src, %src`）
+
+## TwoAddress
+
+### 做什么
+通过操作数交换（commuting）消除不必要的 copy。AArch64 实际是三地址架构，但某些指令的 dst 最好匹配一个源操作数以利用寄存器分配器的 copy 消除。
+
+### 实现
+- 可交换操作：AddRR, MulRR, AndRR, OrRR, XorRR, FAddRR, FMulRR, AddShiftRR, AddV4s, MulV4s
+- 若 `dst == src2 && dst != src1` → 交换 src1 和 src2，使 dst == src1
+- 迭代最多 3 轮
+
+## 管线效果
+
+三个 pass 在 RegAlloc 之前运行，共同减少虚拟寄存器数量和 MovReg 指令数：
+
+```
+MIRCleanup → TwoAddress → CopyPropagation → RegisterCoalescer → RegAlloc
+```
+
+- TwoAddress 预处理使更多操作数对齐
+- CopyPropagation 消除死副本和转发副本
+- Coalescer 合并不干涉的 vreg 对
+- 结果：更少的 vreg 进入寄存器分配 → 更少的 spill
+
+## 已知局限
+
+1. **CopyProp 的 live_after 是块级精度**：使用 LiveIntervals 的块级 live_out 作为种子，但块内分析是精确的指令级
+2. **Coalescer 保守**：要求 dst 的所有定义都是 MovReg 且到同一 src — 实际中许多 vreg 有一个计算定义 + 多个 MovReg 使用
+3. **Coalescer 不处理跨类合并**：Int→Ptr 或 Float→Vec 的 MovReg 不能合并（即使物理上它们是同一种寄存器）
+4. **TwoAddress 仅处理 VReg 操作数**：不处理 PhysReg 或 Imm 操作数的交换
diff --git a/optimization-designs/07-MIR优化-寄存器分配.md b/optimization-designs/07-MIR优化-寄存器分配.md
new file mode 100644
index 00000000..eb26af3e
--- /dev/null
+++ b/optimization-designs/07-MIR优化-寄存器分配.md
@@ -0,0 +1,74 @@
+# 寄存器分配：贪心图着色 + Spill + Slot 共享
+
+- **层级**：第 1 层（算法/策略替换）
+- **文件**：`src/mir/RegAlloc.cpp` (1646 行)
+- **类型**：MIR
+
+## 架构
+
+四个寄存器类各自独立分配（Int 与 Ptr 是同一组 GP 物理寄存器的 32/64-bit 视图，因此物理上是三类寄存器文件）：
+
+| 寄存器类 | 物理寄存器 | 可分配数 | 用途 |
+|---------|-----------|---------|------|
+| Int | w0-w30（32-bit GP） | 16 (x8-x12, x15, x19-x28) | i32/i1 值 |
+| Ptr | x0-x30（64-bit GP） | 16 (同 Int 的 64-bit 视图) | 指针/地址 |
+| Float | s0-s31（32-bit FP） | 24 (s8-s31) | float 值 |
+| Vec | q0-q31（128-bit NEON）| 24 (q0-q7, q16-q31) | `<4×i32>` 向量 |
+
+## 分配算法
+
+### 框架：贪心图着色
+
+1. **活跃分析**：块级 liveness（块入口的 live_in 集合）
+2. **干涉图构建**：同一块内同时活跃的 vreg 两两干涉
+3. **保守修复**：对 block_defs > 200 的大块，所有 def 之间强制全干涉
+4. **贪心分配**：按 spill cost 降序分配，每个 vreg 尝试分配可用物理寄存器
+5. **Spill**：无法分配的 vreg → 栈 slot
+
+### Spill 策略
+
+- **MAX_SPILL_ROUNDS = 1**：只做一轮 spill（历史：从 10 → 3 → 1 逐步缩减）
+- **循环外处理**：`RewriteWithAllocation` 用 scratch 寄存器（x16/x17）处理剩余 spill
+- **Spill 代价模型**：循环内 vreg 的 spill cost ×10，避免热路径 spill
+- **爆炸防护**：循环体 >100 条指令 → 触发时保守选择非循环内 vreg 做 spill
+
+### Rematerialization
+
+- MovImm 指令标记为 Rematerializable，存储立即数值
+- Spill 重加载时：如果可以 remat，优先用 MovImm 重建值而非 load
+
+### Spill Slot 共享
+
+- **`AssignSpillSlots` 函数**（约 100 行）：利用 liveness 数据做贪心 slot 分配
+- 活跃区间不重叠的 spilled vreg 复用同一 frame slot
+- 减少帧大小和栈访问指令数
+
+### Spill 代码生成
+
+AsmPrinter 中 x13 帧基址缓存（约 60 行）：
+- 缓存 `add x13, sp, #frame_base` 的结果
+- 后续 spill slot 访问使用 `ldr/str wX, [x13, #offset]` 而非重复计算帧地址
+
+## 关键 Bug 修复
+
+### MAX_SPILL_ROUNDS + 保守修复交互 bug
+
+- **症状**：04_arr_defn3 段错误、05_arr_defn4 输出错误、09_BFS bad_alloc
+- **根因**：block-level liveness 下多轮 spill 创建的 reload vreg 与保守修复（block_defs 全干涉）交互产生错误的 spill 代码
+- **修复**：MAX_SPILL_ROUNDS 3→1 + 保守修复阈值 20→200
+
+### Spill 爆炸
+
+- **症状**：mm1 85,728 条指令，70% 为帧地址计算
+- **根因**：MAX_SPILL_ROUNDS=10 时每轮 spill 翻倍（14→25→48→...→5890）
+- **修复后**：mm1 从 85,728 → 277 条（-99.7%）
+
+## 已知局限
+
+1. **块级 liveness**：LiveIntervals 只计算到块级 live_out，块内干涉保守（所有同时活跃的 vreg 视为干涉）
+2. **无线性扫描**：贪心图着色可能不如线性扫描效率高（编译时间 + 分配质量）
+3. **无 Eviction 策略**：发生 spill 时随机选择 vreg（应该选 spill cost 最低的）
+4. **无寄存器 hint**：不记录 copy-connected vreg 的首选寄存器
+5. **无 Live Range Splitting**：不拆分活跃区间来减少干涉
+6. **Spill slot 共享是块级精度**：同 BB 内不重叠的 vreg 被标记为干涉，slot 共享收益有限
+7. **Scratch 寄存器 spill 低效**：RewriteWithAllocation 用 x16/x17 做临时加载/存储，可能引入冗余 mov
diff --git a/optimization-designs/08-MIR优化-Peephole窥孔.md b/optimization-designs/08-MIR优化-Peephole窥孔.md
new file mode 100644
index 00000000..2346ee63
--- /dev/null
+++ b/optimization-designs/08-MIR优化-Peephole窥孔.md
@@ -0,0 +1,105 @@
+# Peephole：MIR 窥孔优化
+
+- **层级**：第 5 层（局部模式匹配/窥孔）
+- **文件**：`src/mir/passes/Peephole.cpp` (524 行)
+- **类型**：MIR
+
+## 优化模式
+
+共 10 个优化模式，在单个基本块内按优先级依次尝试，一个模式触发后重新扫描：
+
+### 模式 1：冗余 MovReg 消除
+```
+mov x0, x0    →  删除
+```
+dst == src 的 MovReg 直接删除。
+
+### 模式 2：恒等 Add/Sub 消除
+```
+add w0, w0, #0    →  删除（若 dst == src）
+sub w0, w0, #0    →  删除（若 dst == src）
+```
+
+### 模式 3：零值 Store 合并
+```
+str wzr, [sp, #8]    →    str xzr, [sp, #8]
+str wzr, [sp, #12]
+```
+两个相邻 slot 的 wzr store 合并为一个 xzr store（要求 slot index 连续）。
+
+### 模式 4：Store→Load 转发
+```
+str w0, [sp, #8]      str w0, [sp, #8]
+ldr w1, [sp, #8]  →   mov w1, w0       （不同目标寄存器）
+```
+
+### 模式 5：冗余 Store→Load 消除
+```
+str w0, [sp, #8]
+ldr w0, [sp, #8]  →   str w0, [sp, #8]  （同一目标寄存器，Load 删除）
+```
+
+### 模式 6：Shl+Add/Sub 融合 → AddShift/SubShift
+```
+lsl wA, wB, #n           add wC, wB, wB, lsl #n
+add wC, wA, wB       →   （AArch64 单条指令）
+```
+或 `add wC, wB, wA` 同样处理。SubRR 同理。
+
+### 模式 7：冗余 ADRP 消除
+```
+adrp x0, sym
+...（无 call，x0 未改）
+adrp x0, sym      →   删除第二个 adrp
+```
+基本块内向前扫描，遇到 Call 或重定义停止。
+
+### 模式 8：全局变量 Store→Load 转发（含跨指令扫描）
+```
+str w0, [x13, :lo12:g]       str w0, [x13, :lo12:g]
+...（中间指令不 clobber w0）
+ldr w1, [x13, :lo12:g]   →   mov w1, w0
+```
+向前扫描多条指令，检查：
+- 中间无 StoreGlobal 到同一符号（值被覆盖）
+- 中间指令未重定义源寄存器
+- 中间无 Call（可能修改任意全局变量）
+
+### 模式 9：全局变量 Load→Load 复用
+```
+ldr w0, [x13, :lo12:g]       ldr w0, [x13, :lo12:g]
+ldr w0, [x13, :lo12:g]   →   （第二个删除，同寄存器）
+ldr w1, [x13, :lo12:g]   →   mov w1, w0        （不同寄存器）
+```
+
+### 模式 10：Fallthrough 分支消除
+```
+CondBr cc, .L1         （不处理，L1 不是 fallthrough）
+Br .L2
+
+→ 若 L1 是 fallthrough 目标：
+   反转 CondBr 条件，目标改为 L2，删除 Br
+
+→ 若 L2 是 fallthrough 目标：
+   直接删除 Br
+```
+利用 BlockLayout 重排后的块顺序，使热路径 fallthrough。
+
+## 扫描策略
+
+除模式 8、9 外，其余模式按顺序在单个 while(changed) 循环中依次尝试。每个模式触发后 `changed=true` 并 `break` 重新从头扫描。模式 8 和 9 独立于主循环执行（在主循环收敛、即 changed==false 之后才运行），避免与其他模式竞争。
+
+## 实际效果
+
+Peephole 的优化是增量式的，10 个小模式累计消除了大量冗余指令。单独测量的效果：
+- 全局变量 Peephole（模式 8+9）：-15 条
+- Fallthrough 分支消除（模式 10）：-N 条（依赖 BlockLayout 质量）
+
+## 已知局限
+
+1. **仅块内扫描**：不跨基本块
+2. **模式 8 的扫描保守**：见 Call 即停（x13 caller-saved），但某些 callee-saved 寄存器间的转发本可跨 Call
+3. **无指令调度感知**：Shl+Add 融合要求 Shl 紧邻 Add，中间如果被寄存器分配插入其他指令则无法融合
+4. **无 Load→Store 消除**：同一 slot 的 Load→Store 模式不处理
+5. **模式 3 仅合并两个**：三个以上的连续 wzr store 不合并
+6. **模式 6 仅处理 Shl**：Shr/Asr 的类似融合不处理
diff --git a/optimization-designs/09-MIR优化-BlockLayout与PhysRegCopyProp.md b/optimization-designs/09-MIR优化-BlockLayout与PhysRegCopyProp.md
new file mode 100644
index 00000000..81f50414
--- /dev/null
+++ b/optimization-designs/09-MIR优化-BlockLayout与PhysRegCopyProp.md
@@ -0,0 +1,75 @@
+# BlockLayout（Pettis-Hansen 重排序）+ PhysRegCopyProp
+
+- **层级**：第 4 层（BlockLayout）/ 第 4 层（PhysRegCopyProp，已实现未接入）
+- **文件**：`BlockLayoutOpt.cpp` (256 行), `PhysRegCopyProp.cpp` (315 行)
+- **类型**：MIR
+
+## BlockLayout：Pettis-Hansen 基本块重排序
+
+### 做什么
+
+重新排列 MachineBasicBlock 的顺序，使热路径 fallthrough，配合 Peephole 的 fallthrough 分支消除减少跳转指令。
+
+### 算法
+
+1. **构造链**：每个基本块初始化为一个链
+2. **边权重分配**：
+   - 回边（循环）：权重 ×100
+   - 第一个后继（then 分支）：权重 ×10
+   - 其余后继（else/break）：权重 ×1
+   - 权重再乘以源块频率
+3. **贪心合并**：按边权重降序，满足约束（src 是链尾 + dst 是链头）时合并链
+4. **排序**：按链的总频率降序排列链，入口块始终在第一位
+
+### 效果
+
+使循环体、then 分支等热路径在汇编中顺序排列，Peephole 的 fallthrough 优化可消除 `Br` 指令。
+
+### 局限
+
+- 块频率基于静态启发式（回边×100），非真实 profile
+- 仅重新排序不合并块（Peephole 中的 MergeSinglePredBlocks 在 IfConversion 中才做）
+
+## MIRCleanup：MovImm 转发
+
+### 做什么
+
+```
+mov v1, #N       →
+mov v2, v1           mov v2, #N    （v1 无其他使用时）
+```
+
+### 局限
+
+- 仅处理紧邻的 MovImm + MovReg 对
+- 不处理跨指令的 MovImm 转发（如中间有其他不相关指令）
+
+## PhysRegCopyProp：物理寄存器副本传播（已实现，未接入管线）
+
+### 做什么
+
+在寄存器分配之后、栈帧降级之前，操作物理寄存器：
+1. 前向传播：`mov x0, x1; ... use x0` → use x1（若 x1 未 clobber）
+2. 后向传播：`def x0; mov x1, x0` 且 x0 除此 mov 外无其他使用 → 重定向 def 到 x1
+3. 死副本消除：`mov x0, x1` 且 x0 之后未使用 → 删除
+4. 自复制消除：`mov x0, x0` → 删除
+5. 副本链折叠：`mov x0, x1; mov x2, x0` → `mov x2, x1`
+6. 调用约定感知：Call clobber x0-x18，相应失效副本
+
+### 实现
+
+- 块内从后往前扫描计算 `live_after`（每条指令处哪些寄存器后续被使用）
+- 子寄存器感知：w0 和 x0 视为同一寄存器（W-reg 和 X-reg 映射到相同编号）
+- `ClobbersReg` 检查：Call 指令 clobber 所有 caller-saved 寄存器
+
+### 为什么未接入
+
+该 pass 在 `src/mir/passes/PhysRegCopyProp.cpp` 中完整实现（315 行），`MIR.h` 中有声明，但 `main.cpp` 的 MIR 管线中未调用。接入管线只需在 RegAlloc 之后、FrameLowering 之前加一行：
+
+```cpp
+mir::RunPhysRegCopyProp(*machine_module);
+```
+
+### 预期效果
+
+消除寄存器分配产生的冗余 MovReg（寄存器分配器经常为满足操作数约束插入 copy），进一步减少指令数。类似 LLVM 的 MachineCopyPropagation pass。
diff --git a/optimization-designs/10-关键缺失与性能飞跃路径.md b/optimization-designs/10-关键缺失与性能飞跃路径.md
new file mode 100644
index 00000000..3a41cb3c
--- /dev/null
+++ b/optimization-designs/10-关键缺失与性能飞跃路径.md
@@ -0,0 +1,189 @@
+# 关键缺失与性能飞跃路径
+
+## 一、当前瓶颈诊断
+
+### 指令数分析
+
+根据 `指令数基线.json`，以几个代表性强、指令数较多的用例为中心：
+
+| 用例 | 指令数 | 分析 |
+|------|--------|------|
+| crypto | 1,437 | 含大量循环、数组操作。向量化收益已被利用 |
+| huffman | 694 | 含树遍历、循环、条件分支 |
+| conv2d | 571 | 卷积计算，向量化收益已被利用 |
+| fft | 558 | 蝶形运算，含大量数组访问 |
+| sort | 541 | 排序，含比较+交换+嵌套循环 |
+| shuffle | 368 | 数组重排 |
+| many_mat_cal | 355 | 矩阵计算 |
+| matmul | 323 | 矩阵乘法 |
+| mm1/mm2/mm3 | 277 | 小函数已高度优化 |
+
+**结论**：当前指令数的天花板主要被以下因素决定：
+1. 循环控制开销（cmp + condbr + phi，每次迭代约 3-4 条）
+2. 数组地址计算（GEP → 降级后产生多条 Uxtw+Shl+Add 指令）
+3. 函数调用开销（参数传递 + call + ret）
+4. 寄存器压力导致的 spill 代码
+
+### 质量指标分析（measure.sh）
+
+| 指标 | 含义 | 当前状态 |
+|------|------|---------|
+| MOV_HIGH (>15%) | mov/movz/movk 占比过高 | 需实测确认 |
+| SPILL_HIGH (>5%) | 栈帧访存占比过高 | MAX_SPILL_ROUNDS=1 已大幅改善 |
+| LEAF_FRAME | 叶函数有帧指令 | 叶函数帧优化已实现 |
+
+## 二、按层级排列的缺失优化
+
+### 第 1 层缺失（算法/策略替换）
+
+| 缺失 | 影响 | 难度 | 优先级 |
+|------|------|------|--------|
+| **指令级 liveness**（当前是块级） | 干涉图过度保守 → 不必要 spill | 大 | ⭐⭐⭐ |
+| **Eviction 策略** | 随机选 spill vreg → 热路径可能被 spill | 中 | ⭐⭐⭐ |
+| **寄存器 hint（Copy-connected vreg）** | Coalescer 合并后的 vreg 不 hint → 仍可能产生 copy | 中 | ⭐⭐ |
+| **Live Range Splitting** | 不能拆分活跃区间减少干涉 | 大 | ⭐ |
+
+### 第 2 层缺失（管线架构改进）
+
+| 缺失 | 影响 | 难度 | 优先级 |
+|------|------|------|--------|
+| **LICM（循环不变量外提）** | 循环内不变量的 Load/计算被重复执行 | 中 | ⭐⭐⭐⭐⭐ |
+| **GVN（全局值编号）** | 跨块的公共子表达式无法消除 | 大 | ⭐⭐⭐⭐ |
+| **SCCP（稀疏条件常量传播）** | 不能利用分支条件传播常量 | 中 | ⭐⭐⭐ |
+| **LoopUnswitch** | 含不变条件的循环无法拆分 | 中 | ⭐⭐ |
+| **IndVarSimplify** | 无法优化归纳变量的强度/类型 | 中 | ⭐⭐ |
+
+### 第 3 层缺失（跨 pass 协同）
+
+| 缺失 | 影响 | 难度 | 优先级 |
+|------|------|------|--------|
+| **DCE → 自动触发 CFGSimplify** | DCE 清理后残留空块/不可达块 | 小 | ⭐⭐ |
+| **Mem2Reg → 自动触发 ConstProp** | SSA 化后的常量在下一轮才能传播 | 小 | ⭐ |
+| **LoopVectorize→LoopUnroll 残余** | 残余标量循环不展开 | 中 | ⭐⭐ |
+
+### 第 4 层缺失（单 pass 算法）
+
+| 缺失 | 影响 | 难度 | 优先级 |
+|------|------|------|--------|
+| **PhysRegCopyProp 未接入管线** | 315 行代码已写但未调用 → 冗余 MovReg 残留 | 极小 | ⭐⭐⭐⭐⭐ |
+| **CSE 不跨块** | 不同块中的相同表达式各自计算 | 大 | ⭐⭐⭐ |
+| **NewGvn/内存优化** | Load→Store 转发不跨块 | 中 | ⭐⭐ |
+
+### 第 5 层缺失（窥孔）
+
+| 缺失 | 影响 | 难度 | 优先级 |
+|------|------|------|--------|
+| **Mov → Add/Sub 融合** | `mov tmp, #N; add dst, src, tmp` → `add dst, src, #N` | 小 | ⭐⭐ |
+| **Csel 优化** | 可化简的 select 序列 | 小 | ⭐ |
+| **Peephole 跨寄存器类融合** | Int↔FP 转换 + 运算合并 | 小 | ⭐ |
+
+## 三、最高收益机会（建议优先实施顺序）
+
+### 1号机会：接入 PhysRegCopyProp（预计 15 分钟）
+
+**投入**：main.cpp 加一行 `mir::RunPhysRegCopyProp(*machine_module);`  
+**收益**：消除寄存器分配后的冗余 MovReg（死副本、前向传播、后向传播）  
+**风险**：已有完整实现，接入管线零风险  
+**指令数预期**：-2% ~ -5%
+
+### 2号机会：实现 LICM（预计 2-3 天）
+
+**当前状态**：LICM.cpp 是空文件  
+**投入**：实现循环不变量检测 + 外提
+- 检测标准：指令的操作数都是循环不变量（常量/参数/循环外定义的指令/循环内 phi 的不变量来源）
+- 外提目标：preheader（循环前插入）
+- 需要配合 LoopInfo（已有 LoopInfo 分析基础设施）
+
+**收益**：
+- 循环内的常量 Load 外提到循环前 → 消除 N 次冗余 Load
+- 循环内不变量计算（如 `base + offset`）外提 → 消除 N-1 次冗余计算
+- 为 LoopVectorize 暴露更多可向量化循环（当前缺少 LICM 是阻止向量化的因素之一）
+
+**指令数预期**：-10% ~ -25%（循环密集型用例如 crypto/huffman/fft）
+
+### 3号机会：实现 GVN（预计 1-2 周）
+
+**当前状态**：只有块内 CSE（公共子表达式消除）  
+**投入**：基于支配树的全局值编号
+- 使用 hash 值编号表达式
+- 沿支配树传播值编号表
+- 消除跨块的冗余计算和 Load
+
+**收益**：
+- 跨块的重复计算消除
+- 跨块的冗余 Load 消除（与 LICM 有协同效应）
+
+**指令数预期**：-5% ~ -15%
+
+### 4号机会：指令级 LiveIntervals（预计 1-2 周）
+
+**当前状态**：LiveIntervals 计算到块级 live_out  
+**投入**：实现指令级（slot-level）活跃区间
+- 构建每个 vreg 的 `[def_slot, last_use_slot]` 区间
+- 精确干涉判断：两个区间重叠才干涉
+
+**收益**：
+- 寄存器分配质量显著提升（更少的虚假干涉 → 更少 spill）
+- Spill slot 共享更高效（指令级不重叠可精确判定）
+
+**指令数预期**：-5% ~ -20%（spill 密集型用例）
+
+### 5号机会：SCCP（预计 1 周）
+
+**当前状态**：只有简单常量传播（ConstProp），不利用分支条件  
+**投入**：稀疏条件常量传播
+- 使用 SSA 边上的 lattice（⊥/constant/⊤）
+- 分支条件 `x==5` 在 then 分支将 x 设为 constant 5
+- 配合 CFGSimplify 消除死分支
+
+**收益**：
+- 消除更多死代码
+- 暴露更多常量折叠机会
+- 与 GVN 配合效果更佳
+
+## 四、架构级反思
+
+### 当前优化管线的问题
+
+1. **大量第 5 层优化，缺少第 1-2 层优化**
+   - 12 条已记录优化中，8 条是第 5 层（窥孔/局部）
+   - LICM 空桩、GVN 缺失是最大的架构级缺口
+   - 按照 CLAUDE.md 的优化决策层级，这属于「逃避模式」：明知应该加 IR pass 却选择加窥孔
+
+2. **IR pass 迭代顺序可能不是最优**
+   - LoopVectorize 在 LICM 之前运行 → 不变量外提后可能向量化更多循环
+   - IfConversion 在 LoopVectorize 之前 → 向量化后的代码不再被 IfConvert
+   - LoopUnroll → Inline 的联动很好，但 Inline 过于保守（不含 Load/Store）
+
+3. **LoopVectorize 的健壮性 > 性能**
+   - 5 个致命 bug 的修复历史表明 pass 稳定性是主要关注点
+   - 仅支持 Add/Sub/Mul → 大量循环无法向量化
+   - 无归约支持 → 含累加器的最常见循环模式被跳过
+
+4. **缺少性能测量反馈循环**
+   - `指令数基线.json` 只记录了全量数据（无分类细节）
+   - 无法知道哪种优化模式最有效、哪些用例指令数最高
+   - 需要能按指令类型分组的测量
+
+### 根本性改进方向
+
+**A. 短期（每个 < 1 天，立即可做）**
+1. 接入 PhysRegCopyProp（一行代码）
+2. 接入 measure.sh 的质量检查到 CI/commit hook
+3. CSE 跨越基本块的边界（在支配树上前向传播表达式表）
+4. Inline 扩展支持含 Load/Store 的单 BB 函数
+
+**B. 中期（每个 1-2 周）**
+1. 实现 LICM（最有性价比的缺失 pass）
+2. 指令级 LiveIntervals（寄存器分配质量的阶跃提升）
+3. GVN（跨块 CSE → 真正的全局优化）
+4. 向量化扩展：归约支持 + Transpose 变换
+
+**C. 长期（2-4 周）**
+1. SCCP + 条件常量传播
+2. 线性扫描寄存器分配器（替代贪心图着色）
+3. 完整的 LoopOptimizer（LICM + IndVarSimplify + LoopUnswitch + LoopFusion）
+
+## 五、总结
+
+当前编译器已经完成了完善的第 5 层优化（窥孔/局部模式），但第 1-2 层的几个关键缺失（LICM、GVN、指令级 liveness）是限制性能上限的瓶颈。按照 CLAUDE.md 的优化决策层级原则，下一步应该优先投入这些架构级改进，而不是继续在第 5 层堆积窥孔优化。
diff --git a/optimization-designs/live-range-splitting-splitkit.md b/optimization-designs/live-range-splitting-splitkit.md
new file mode 100644
index 00000000..68eabc5e
--- /dev/null
+++ b/optimization-designs/live-range-splitting-splitkit.md
@@ -0,0 +1,23 @@
+# 活范围分裂（Live Range Splitting）设计
+
+## 目标
+实现类似 LLVM SplitKit 的活范围分裂机制，在寄存器分配失败时将高冲突 vreg 的活范围沿循环边界分裂为冷（cold）/热（hot）两部分，冷部分可安全溢出。
+
+## LLVM 参考
+- `llvm/lib/CodeGen/SplitKit.h` / `SplitKit.cpp`
+- `llvm/lib/CodeGen/LiveRangeEdit.h` / `LiveRangeEdit.cpp`
+- 核心概念：利用 LoopInfo 确定分裂点，沿循环边界插入 COPY
+
+## 当前基础设施
+- ✅ LiveIntervals（SlotIndex + LiveSegment）
+- ✅ LiveRangeEdit（CreateVReg + ReplaceUsesInBlocks + Commit）
+- ✅ LoopInfo（循环深度计算）
+- ✅ 多源 phi 合并（try-and-verify 模式）
+- ✅ 局部溢出缓存
+
+## 挑战
+1. 块边界 COPY 插入需要 phi-node-aware 分析
+2. 分裂后 MIR SSA 形式需要 PhiElimination 处理新引入的 PHI
+3. 与现有贪婪分配器集成
+
+## 实现计划（后续 session）
diff --git a/optimization-designs/regalloc-layer1-rewrite.md b/optimization-designs/regalloc-layer1-rewrite.md
new file mode 100644
index 00000000..f8089889
--- /dev/null
+++ b/optimization-designs/regalloc-layer1-rewrite.md
@@ -0,0 +1,30 @@
+# 寄存器分配器第 1 层重构：状态与路线图
+
+## 已完成
+
+| 模块 | 状态 | 对齐标准 |
+|------|------|----------|
+| **Call Clobber Phantom** | ✅ 已实现 | LLVM LiveRegMatrix clobber 建模 |
+| **x0-x7 扩展** | ✅ 非递归非叶函数 26 GP | LLVM 全寄存器池 |
+| **Bidirectional Phantom** | ✅ 全函数双向覆盖 | LLVM 预着色节点 |
+| **LLVM Spill Weight** | ✅ cost/(rangeLen×degree) | LLVM RAGreedy |
+| **Sweep-line InterfGraph** | ✅ O(V log V+K) | 等价 LiveIntervalUnion |
+| **SplitKit** | ✅ 循环边界分裂 | LLVM SplitKit 方向 |
+| **Per-vreg RegHint** | ✅ kPreferCaller/kCalleeOnly/kAnyGP | LLVM RegisterClass 方向 |
+
+## 进行中
+
+| 模块 | 状态 | 备注 |
+|------|------|------|
+| **递归 x0-x7** | 99% | 87_many_params 边缘案例，Call Clobber phantom 已就绪 |
+
+## 待实现
+
+| 模块 | 优先级 | 预估 |
+|------|--------|------|
+| **多阶段管道** (RS_Assign→Evict→Split→Spill) | P0 | 1 session |
+| **LiveIntervalUnion O(log n)** | P1 | 2 sessions |
+| **87_many_params 递归修复** | P1 | 1 session |
+| **32_many_params3 帧布局** | P2 | 1 session |
+| **Global Load CSE (AA-based)** | P2 | 2 sessions |
+| **MemorySSA** | P3 | 3 sessions |
diff --git a/optimization-designs/优化记录.md b/optimization-designs/优化记录.md
new file mode 100644
index 00000000..17bca545
--- /dev/null
+++ b/optimization-designs/优化记录.md
@@ -0,0 +1,417 @@
+# 优化记录
+
+本文档追踪编译器的所有有效优化，用于答辩展示和技术积累。
+
+## 记录格式
+
+每条优化记录包含：日期、优化名称、决策层级 `[第 X 层]`、类型（IR/MIR/后端）、假设、实现摘要、指令数效果、退化情况、功能测试结果、已知局限。
+
+---
+
+## 2026-05-31 | SCCP 稀疏条件常量传播 [第 2 层]
+
+- **类型**：IR 优化
+- **层级**：[第 2 层] 管线架构改进——新增 IR pass，比 ConstProp 多块可达性分析
+- **假设**：SCCP 利用块可达性（CondBr 条件已知时仅标记对应分支可执行）能发现 ConstProp 无法发现的常量。PHI 节点仅 meet 可达入边，产生更精确的 lattice 值。
+- **实现**：工作列表驱动的 SCCP 求解器（195 行），lattice 用 `unordered_map<Value*, int>`（-1=undef, -2=overdef, ≥0=constant）。求值 BinaryInst/Cmp/ZExt/PHI。安全跳过 Alloca/Store/Load/Call/终结指令/向量函数。常量替换通过 `ReplaceAllUsesWith(ConstantInt)`。
+- **指令数效果**：-4170（此前基线为 -4185；死块删除导致 CFG 重组，产生 15 条噪声级差异）。不可达块删除与条件分支简化的基础设施已就位。
+- **退化**：无
+- **功能测试**：functional 100/100，h_functional 39/40（预存故障不变）
+- **已知局限**：仅整型常量，浮点未跟踪。死块删除后依赖迭代循环中的 CFGSimplify 清理残影。
+
+---
+
+## 2026-05-31 | 多源 phi 合并——先应用再验证 [第 1 层]
+
+- **类型**：后端（寄存器分配）
+- **层级**：[第 1 层] 算法策略——从静态预测切换到先应用再验证，解锁多源 phi 的合并
+- **假设**：多源 phi（不同前驱传递不同 vreg）的合并无法用静态段检查预测安全——合并后 src 活范围扩展到其他前驱块时产生的二级干涉无法通过段分析捕获。但可以先执行合并、重算 LiveIntervals、再检查实际干涉。
+- **实现**（75 行）：
+  1. 对每个多源 phi dst，迭代候选源
+  2. 暂存当前指令向量 → 应用合并（替换所有 dst 操作数为 src_i，删除自复制 `src_i=COPY src_i`）
+  3. 重算 `LiveIntervals::Compute` → 对每个其他源 `src_j` 检查 `InterfereSegmentsExcept(src_i, src_j, copy_j_slot)`
+  4. 有效则保留，无效则回退（恢复保存的指令）
+- **指令数效果**：额外净减少 51 条指令（累积 -4185）
+- **退化**：无
+- **功能测试**：functional 100/100，h_functional 39/40（预存故障不变）
+- **已知局限**：每个候选的 LiveIntervals 重算是 O(N) 的。大函数中多 phi 时可能较慢。后续可升级为增量 `LiveRangeEdit` 以降低开销。当前未启用传递闭包（需额外验证）。
+
+---
+
+## 2026-05-31 | PhysRegCopyProp 正向传播+冗余消除+块尾死副本 [第 5 层]
+
+- **类型**：后端（MIR 管线）
+- **层级**：[第 5 层] 局部模式匹配——完善 Post-RA 物理寄存器副本传播
+- **假设**：寄存器分配后，PhysRegCopyProp 只有自复制消除和保守死副本检测，缺少 LLVM MCP 的核心功能（正向传播、冗余消除、块尾死副本）。这些局部优化可以安全地消除更多冗余 MovReg。
+- **实现**：
+  1. **正向传播**（~30 行）：在 use 处理前检查 copies 映射，若 use 寄存器匹配 copy dst，则替换为 src。安全检查——指令不能定义 src（含 Wn/Xn 别名），防止循环依赖。别名感知的副本消费（use x0 消费 copy w0=COPY...）。
+  2. **冗余副本消除**（~8 行）：新建 copy 时检查：反向对（已有 B=A 时 A=B 是冗余）、重复（已有 A=B 时新 A=B 是冗余）
+  3. **块尾死副本消除**（~8 行）：块内剩余副本的 dst 不在 live_out 中则删除
+  4. **隐式 use 处理**（~10 行）：Call 消费 w0-w7/s0-s7 的参数副本；Ret 消费 w0/x0/s0 的返回值副本
+- **指令数效果**：净减少 4134 条指令
+- **退化**：无
+- **功能测试**：functional 100/100，h_functional 39/40（87_many_params/32_many_params3 预存故障不变）
+- **已知局限**：仅做块内正向传播（无跨块传播、无后向传播）。多源 phi 合并仍待实现。
+
+---
+
+## 2026-05-31 | Coalescer 单源多定义 MovReg 排除 [第 4 层]
+
+- **类型**：后端（寄存器分配）
+- **层级**：[第 4 层] 单 pass 算法升级——改进 Coalesce 内部的干涉检查精度
+- **假设**：单源多定义（所有前驱传递同一 vreg）时，dst↔src 的干涉检查应排除连接它们的 MovReg 指令。之前的实现使用保守的全量检查（不排除 MovReg），导致部分可合并的 phi 对未被合并。
+- **实现**：将 `multi_def_sources` 从 `std::set<int>` 升级为 `std::unordered_map<int, const MachineInstr*>`，记录每个 src 对应的 MovReg 指令指针，传给 `overlap(dst, src, inst)` 做排除式干涉检查（14行改动）
+- **指令数效果**：无显著静态变化（单源多定义场景较少，主要是边缘 case 改善）
+- **退化**：无
+- **功能测试**：functional 99/100（87_many_params 预存故障），零回归
+- **已知局限**：多源（不同前驱传递不同 vreg）仍未合并——需要传递闭包+环路检测的完整实现（参照 LLVM RegisterCoalescer::JoinVRegs）
+
+---
+
+## 2026-05-31 | PhysRegCopyProp 接入管线 [第 5 层]
+
+- **类型**：后端（MIR 管线）
+- **层级**：[第 5 层] 局部模式匹配/窥孔——消除 post-RA 冗余 PhysReg 副本
+- **假设**：寄存器分配后仍有冗余 MovReg（前向/后向可传播的副本、死副本、副本链），PhysRegCopyProp（315行）已实现但未接入管线。
+- **实现**：main.cpp 中在 GreedyRegAlloc 之后、FrameLowering 之前插入 `RunPhysRegCopyProp`（2行）
+- **指令数效果**：平均 MOV 占比从 36.5% 降至 19.6%（相对下降 46%）
+- **退化**：无
+- **功能测试**：functional 99/100，h_functional 39/40，零回归
+- **已知局限**：仅处理 PhysReg 副本，不处理 vreg→vreg（应由 Coalescer 在 RA 期间处理）
+
+---
+
+## 2026-05-31 | MIR SSA 销毁独立为 PhiElimination Pass [第 2 层]
+
+- **类型**：后端（MIR 管线架构）
+- **层级**：[第 2 层] 编译管线架构改进——在管线中新增显式 PhiElimination pass，改变 pass 间职责边界
+- **假设**：将 SSA 销毁逻辑从两个寄存器分配器内部提取到独立 pass，使管线职责清晰：Lowering（SSA 构造）→ CopyProp（SSA 上优化）→ PhiElimination（SSA 销毁）→ RegAlloc（非 SSA MIR 上分配）。Phi 元数据通过 MachineFunction 在 pass 间传递。
+- **实现**（5 项协同改动，134 行）：
+  1. PhiElimination.cpp — 空壳→真正的 SSA 销毁 pass：构建前驱映射→收集 phi 元数据→插入 MovReg→清除 block_args/successors→存储元数据
+  2. MIR.h — MachineFunction 新增 `phi_pairs_`/`phi_block_arg_block_` 字段及访问器，作为 pass 间 phi 元数据载体
+  3. main.cpp — 管线插入 `RunPhiElimination`（CopyProp→PhiElim→GreedyRegAlloc）
+  4. GreedyAlloc.cpp — 移除内部 `LowerBlockArgs` 函数（40行），改为读取 MachineFunction phi 元数据
+  5. RegAlloc.cpp — 移除冗余 `LowerBlockArgsPreRA` 函数（70行）及调用；MIRVerifier 适配 post-PhiElimination 无 successor 状态
+- **指令数效果**：无变化（纯架构重构，MovReg 插入逻辑与原来完全一致）
+- **退化**：无
+- **功能测试**：functional 99/100（87_many_params 预存故障），h_functional 39/40（32_many_params3 预存故障），与基线一致
+- **已知局限**：旧 `RunRegAlloc`（Briggs 着色器）不在主线中使用，其 LowerBlockArgsPreRA 已移除但 `RunRegAlloc` 函数体仍保留——若有人直接调用需确保 PhiElimination 已先运行
+- **参照**：LLVM PHIElimination.cpp —— 同样的核心思想（PHI→显式 COPY），但 LLVM 使用传统 PHI 指令而非 block_args
+
+---
+
+## 2026-05-30 | W/X 别名后着色冲突检测
+
+- **类型**：后端（寄存器分配）
+- **层级**：[第 4 层] 单 pass 算法升级——在 ColorGraph 后添加后处理步骤
+- **假设**：图着色在"颜色空间"工作，Int(Wn) 和 Ptr(Xn) 通过 `NumberToPhysReg` 映射到同一物理寄存器。在某些保守活跃分析盲区下，干涉图可能漏掉活范围重叠的 Int↔Ptr 之间的干涉边，导致它们分配到同一颜色号→Wn/Xn 别名冲突→SIGSEGV。后着色检测+换色可以兜底修复此盲区。
+- **实现**：RegAlloc.cpp 中 ColorGraph 调用后添加别名安全检查（~50 行）：
+  1. 将已分配的 GP vreg 按颜色号分组（Int→color, Ptr→color）
+  2. 检测同色 Int↔Ptr vreg 的块级活跃重叠（live_in 或 live_out 共享）
+  3. 对冲突对中的低 spill weight 方，在干涉图邻居未使用的颜色中找空闲颜色换色
+  4. 找不到空闲颜色时标记为 spill（由 spill 迭代循环处理）
+- **指令数效果**：无显著变化（仅在检测到冲突时触发换色，为稀有路径）
+- **功能测试**：functional 99/100（87_many_params 由 SIGSEGV 转为输出不匹配，不再崩溃），h_functional 38/40（2 预存故障）
+- **已知局限**：
+  - 块级活跃检查为保守近似（live_in/live_out 共享），可能漏检同时 live_out 但不同时在块内活跃的情况
+  - 换色算法为贪心（取第一个空闲颜色），非最优着色
+  - 87_many_params 虽不再崩溃但输出仍错误——根因在别处（可能是参数传递/calling convention 实现 bug）
+
+---
+
+## 2026-05-25 | CmpImm 常量折叠
+
+- **类型**：后端（MIR 降级）
+- **假设**：ICmp 降级时，操作数为常量（0-4095）直接用 CmpImm，消除冗余 MovImm
+- **实现**：Lowering.cpp 两个 ICmp 降级路径中，检查操作数是否为常量。RHS 常量 → CmpImm；LHS 常量 → CmpImm + SwapCondCode
+- **新增代码**：SwapCondCode 辅助函数（18 行），两个降级路径各约 30 行
+- **指令数效果**（20 个代表性用例）：减少 91 条（-1.1%），matmul -15（-3.8%）、huffman -25（-3.1%）、crypto -23（-1.2%）
+- **退化**：h-5 +1（+0.3%），由寄存器分配差异导致，在容忍范围内
+- **功能测试**：100/100 functional 通过，39/40 h_functional 通过（1 个预存故障 30_many_dimensions）
+- **已知局限**：仅处理 0-4095 范围的立即数；浮点比较未覆盖
+
+---
+
+## 2026-05-26 | MAX_SPILL_ROUNDS 缩减 + Spill Slot 共享
+
+- **类型**：后端（寄存器分配）
+- **假设**：MAX_SPILL_ROUNDS=10（≤120 vreg 函数）导致 spill 每轮翻倍，限制为 3 可消除指数级膨胀。不重叠活区间的 spilled vreg 共享 frame slot 可减少帧大小。
+- **实现**：
+  - RegAlloc.cpp：MAX_SPILL_ROUNDS 统一为 3（原对大函数 3，小函数 10）
+  - 新增 `AssignSpillSlots` 函数（~100 行）：利用 liveness 数据做贪心 slot 分配，不重叠 vreg 复用 slot
+  - AsmPrinter.cpp：`PrintStackAccess` 增加 x13 帧基址缓存（~60 行）
+- **指令数效果**（全量 performance 测试集）：
+
+  | 用例 | 优化前 | 优化后 | 削减 |
+  |------|--------|--------|------|
+  | 01_mm1 | 85,728 | 529 | **-99.4%** |
+  | 01_mm2 | 85,728 | 529 | **-99.4%** |
+  | 01_mm3 | 85,728 | 529 | **-99.4%** |
+  | transpose1 | 41,747 | 326 | **-99.2%** |
+  | transpose2 | 41,747 | 326 | **-99.2%** |
+  | 03_sort1 | 8,528 | 2,891 | **-66.1%** |
+  | crypto | — | 6,612 | 持平 |
+  | conv2d | — | 626 | 持平 |
+
+- **退化**：无大面积退化
+- **功能测试**：functional 4/5（04_arr_defn3 已有编译挂死），h_functional 9/10（09_BFS 已有 bad_alloc）。已知问题非本次引入
+- **根因发现**：67 vreg 的 mm1 在 10 轮 spill 后累计 11,785 个 slot，每轮 spill 数 14→25→48→94→186→370→738→1474→2946→5890 翻倍
+- **已知局限**：block-level liveness 导致同 BB 内不重叠的 vreg 被标记为干涉，slot 共享收益有限；04_arr_defn3/09_BFS 仍需单独修复
+
+---
+
+## 2026-05-25 | AddImm/SubImm 立即数折叠
+
+- **类型**：后端（MIR 降级 + 新操作码）
+- **假设**：AArch64 add/sub 支持 12 位立即数，但 MIR 只有 AddRR/SubRR，导致 `mov #imm; add/sub dst, src, tmp` 浪费 1 条指令。添加 AddImm/SubImm 操作码消除冗余 MovImm
+- **实现**：
+  - MIR.h：新增 AddImm、SubImm 操作码
+  - Lowering.cpp：Add/Sub 降级时 RHS 为 0-4095 常量 → AddImm/SubImm
+  - RegAlloc.cpp：AddImm/SubImm 加入 AddRR/SubRR 同一处理分支
+  - AsmPrinter.cpp：通用三操作数打印机自动处理 Imm 操作数（`#value`）
+- **指令数效果**（全部 60 个性能用例）：减少 55 条，sl1-3 -14（-5.4%）、huffman-01-03 -2（-0.3%）、h-5-01-03 -3（-0.9%）
+- **退化**：无
+- **功能测试**：87/88 functional 通过（1 个预存故障 87_many_params）、30/31 h_functional 通过（1 个预存故障 30_many_dimensions）
+- **已知局限**：仅处理 IR 中直接常量操作数；经 vreg 传递的常量需 ConstProp 配合才能折叠；仅 0-4095 范围
+
+---
+
+## 2026-05-25 | Mem2Reg 大参数函数安全门禁
+
+- **类型**：IR 优化（Bug 修复）
+- **问题**：87_many_params（32 参数的递归函数）在 -O 下输出错误（889 vs 期望 1543），-O0 正确。定位为 Mem2Reg 提升 32 个 alloca 为 SSA 后，降级阶段产生错误代码
+- **修复**：Mem2Reg 入口添加安全门禁——当函数 promotable alloca 数量 >24 时跳过该函数
+- **效果**：functional 测试从 87/88 → **100/100 全部通过**
+- **已知局限**：30_many_dimensions（19 维多维数组参数）仍失败，该 bug 在降级层（无优化也错），需专项修复 GEP 偏移计算
+- **后续**：30_many_dimensions 已知根因在多维数组 GEP 降级，待后续处理
+
+---
+
+## 2026-05-25 | Movz #0 前导零优化
+
+- **类型**：后端（AsmPrinter）
+- **假设**：EmitLargeImmediate 中，当 32-bit 立即数的低 16-bit 为零时，应该直接用移位后的 movz，而不是先 `movz #0` 再 `movk`。例如 `0x00020000` → `movz w8, #2, lsl #16` 而非 `movz w8, #0; movk w8, #2, lsl #16`
+- **实现**：AsmPrinter.cpp EmitLargeImmediate 循环中，`!emitted && part == 0` 时跳过（3 行），保持底部 `!emitted → mov #0` 兜底处理全零情况
+- **指令数效果**：减少 33 条，crypto -7×3、fft -2×3、h-4 -1×3、h-10 -1×3
+- **退化**：无
+- **功能测试**：100/100 functional 通过，30/31 h_functional 通过（1 个预存故障 30_many_dimensions）
+- **已知局限**：仅修复 EmitLargeImmediate；EmitStackAdjust/EmitAddressFromBase 中的 movz 模式仍有同样问题，可后续统一
+
+---
+
+## 2026-05-25 | ADRP 冗余消除
+
+- **类型**：后端（AsmPrinter）
+- **假设**：连续访问同一全局变量时，x13 已持有页面地址，后续 ADRP 冗余。例如 `adrp x13, k; str w8, [x13, :lo12:k]; adrp x13, k` 中第二个 ADRP 多余
+- **实现**：AsmPrinter 添加 ADRP 缓存（g_cached_adrp_symbol + g_adrp_cache_valid）。PrintGlobalAccess 检测同符号命中时跳过 ADRP。EmitStackAdjust/EmitAddressFromBase 使用 x13 时失效缓存。Call 指令失效缓存（x13 caller-saved）。每个基本块入口重置缓存（跨块时 call/clobber 不确定）
+- **指令数效果**：减少 135 条，shuffle -48（-3.4%）、crypto -27（-1.4%）、conv2d -21（-3.2%）、fft -12（-2.0%）、huffman -9（-1.1%）、h-9 -9（-4.0%）、03_sort -6（-0.9%）、h-8 -3（-0.7%）
+- **退化**：无
+- **功能测试**：100/100 functional 通过，30/31 h_functional 通过（1 个预存故障 30_many_dimensions）
+- **已知局限**：仅缓存 x13 上的 ADRP；LoadGlobalAddr 使用其他寄存器时不参与缓存；同一基本块内优化最有效
+
+---
+
+## 2026-05-25 | 叶函数帧设置优化
+
+- **类型**：后端（AsmPrinter + Lowering）
+- **假设**：叶函数（无 Call 指令）不需要保存/恢复 x30（LR 不会被修改）。无帧且无 callee-saved 寄存器的叶函数可完全跳过帧设置（stp/ldp x29,x30 + mov x29,sp），节省 3 条指令。有帧叶函数改用 str/ldr x29 替代 stp/ldp x29,x30，节省栈空间
+- **实现**：
+  - MIR.h：MachineFunction 新增 has_call_ 字段 + HasCall()/SetHasCall()
+  - Lowering.cpp：每次发射 Call 指令时标记 function.SetHasCall()
+  - AsmPrinter.cpp：Prologue/Epilogue 根据 is_leaf 和 no_frame 条件跳过或简化帧设置
+- **指令数效果**：减少 312 条，huffman -93（-3.9%）、crypto -54（-2.8%）、conv2d -45（-2.3%）、crc -27（-3.2%）、h-9 -27（-4.1%）、03_sort -18（-0.9%）、opt_scheduling -18（-5.2%）、h-4 -12（-2.5%）、fft -9（-0.5%）、shuffle -9（-0.7%）
+- **退化**：无
+- **功能测试**：100/100 functional 通过，30/31 h_functional 通过（1 个预存故障 30_many_dimensions）
+- **已知局限**：仅对无帧且无 callee-saved 寄存器的叶函数完全跳过帧设置；有 callee-saved 的叶函数仍需保存它们（属于调用者）
+
+---
+
+## 2026-05-25 | 除法/取模改用 sdiv 指令
+
+- **类型**：后端（MIR 降级）
+- **假设**：2 的幂次除法/取模当前使用移位序列（add bias + cmp + csel + asr = 4-6 条），改用 AArch64 sdiv 指令只需 1-2 条。对非 2 的幂次除法本来就用 sdiv，此优化消除 2 的幂次的特殊路径
+- **实现**：Lowering.cpp 删除 DivRR 和 ModRR 的 2 的幂次移位序列（~150 行），统一走 sdiv 路径。新增 ModRR 的 val==1/-1 特例（MovImm #0）
+- **指令数效果**：减少 735 条，crypto -249（-4.4%）、huffman -186（-8.9%）、crc -84（-10.4%）、fft -72（-4.1%）、h-9 -42（-6.6%）、many_mat_cal -24（-1.8%）、03_sort -24（-1.3%）、h-1 -21（-4.5%）、conv2d -21（-1.1%）、transpose -12（-2.0%）、sl -3（-0.4%）
+- **退化**：matmul +3（+0.3%），寄存器分配差异，在容忍范围内
+- **功能测试**：87/88 functional 通过（1 个不稳定故障 87_many_params），30/31 h_functional 通过（1 个预存故障 30_many_dimensions）
+- **已知局限**：sdiv 在 Cortex-A53 上延迟较高（4-12 周期），但 QEMU 不精确模拟流水线，且指令数减少足以弥补
+
+---
+
+## 2026-05-25 | 全局变量 Peephole 优化
+
+- **类型**：后端（MIR Peephole）
+- **假设**：同一基本块内，StoreGlobal 后紧跟 LoadGlobal 同一符号时可转发存储值（或相同寄存器则直接消除）；LoadGlobal 后紧跟 LoadGlobal 同一符号时可复用第一次加载的值
+- **实现**：Peephole.cpp 新增 IsGlobalFwdStoreLoad/IsGlobalRedundantLoad 检测函数，RunPeepholeOnBlock 新增两个迭代 pass
+- **指令数效果**：减少 15 条，shuffle -6、conv2d -3、crypto -3、h-9 -3
+- **退化**：无（matmul +3 是之前 sdiv 优化的残留退化）
+- **功能测试**：87/88 functional 通过（1 个不稳定故障 87_many_params）
+- **已知局限**：仅处理同寄存器复用的特例；不同寄存器间的转发/复用转为 MovReg（指令数不减少）
+
+---
+
+## 2026-05-26 | MAX_SPILL_ROUNDS 缩减 + 保守修复阈值提高
+
+- **类型**：后端（寄存器分配 Bug 修复）
+- **问题**：`04_arr_defn3` 段错误、`05_arr_defn4` 输出错误、`09_BFS` bad_alloc/段错误、`13_LCA`/`54_hidden_var` 等多个用例输出不匹配
+- **根因**：block-level liveness 下多轮 spill（MAX_SPILL_ROUNDS=3）创建的 reload vreg 与保守修复（block_defs 全干涉，阈值>20）产生错误交互。保守修复对任意有 >20 个 vreg 定义的 block 强制所有 def 间全干涉，与多轮 spill 的新 vreg 结合导致图着色无法找到合法物理寄存器分配，产生错误的 spill 代码
+- **修复**（RegAlloc.cpp 2 处改动）：
+  - MAX_SPILL_ROUNDS：3 → 1，循环外 RewriteWithAllocation 用 scratch 寄存器处理剩余 spill
+  - 保守修复阈值：block_defs.size() > 20 → > 200，仅对真正的大 block 启用
+- **效果**（门禁）：
+  - functional：84/85（98.8%），仅预存 `84_long_array2` 编译超时
+  - h_functional：30/31（96.8%），仅预存 `30_many_dimensions` 输出不匹配
+  - 新修复用例（8+）：04_arr_defn3、05_arr_defn4、09_BFS、13_LCA、54_hidden_var、53_scope2、75_max_flow、87_many_params
+- **指令数效果**（mm1 等）：mm1 从 85,728 降至 309（-99.6%），杜绝 spill 爆炸
+- **退化**：无
+- **已知局限**：`84_long_array2`（4096 元素全局数组初始化）编译超时，需单独修复；`30_many_dimensions`（多维数组参数 GEP）仍失败
+
+---
+
+## 2026-05-29 | [第 1 层] Pre-RA 块参数降级——消除 post-RA LowerBlockArgs
+
+- **层级**：第 1 层（算法/策略替换：post-RA 物理寄存器副本 → pre-RA vreg 副本）
+- **类型**：后端（MIR 寄存器分配架构）
+- **假设**：将 SSA 块参数（block_args）从 post-RA 物理寄存器副本改为 pre-RA vreg MovReg，让寄存器分配器自然处理 spill 和 coalescing，消除 post-RA 副本插入的所有脆弱性（临时寄存器冲突、spill 处理、并行副本冲突）
+- **实现**（4 文件，+33/-175 行）：
+  - `RegAlloc.cpp`：新增 `LowerBlockArgsPreRA` — 在 RA 前将 block_args→succ_args 映射转为前驱块中的显式 `MovReg(vreg→vreg)`；删除原 post-RA `LowerBlockArgs`（140+ 行物理寄存器副本 + spill + swap 处理）；简化 `ComputeBlockLiveness`：移除 block_args def 和 successor_args use/live_out 特殊处理
+  - `LiveIntervals.cpp`：移除 block_args def、successor_args use/live_out、block_arg 区间修正
+  - `MachineRegisterInfo.cpp`：移除 block_args 定义点和 successor_args 使用点
+  - `MIR.h`：新增 `ClearBlockArgs()`
+- **架构收益**：
+  - RA 自然处理所有块参数 spill（走标准 spill 路径，不再需要 W14/W15 临时寄存器挑选）
+  - Coalescing 自动消除冗余 MovReg（源和目标分配到同一寄存器时消除副本）
+  - 活跃分析简化：不再需要为 block_args/successor_args 维护特殊规则
+  - 代码净删除 142 行（174→32），健壮性大幅提升
+- **功能测试**：functional 100/100（从 93 提升），h_functional 40/40（从 35 提升），总耗时 13.3s（从 309s 下降 96%）
+- **退化**：无
+- **已知局限**：MOV_HIGH/SPILL_HIGH 质量告警（部分用例 MOV>35%/SPILL>10%），需后续 coalescing/寄存器分配改进；successors_ 和 block_args_ 数据结构仍保留于 Lowering 中，后续可进一步简化 SetupBlockSuccessors
+
+---
+
+## 2026-05-29 | [第 1 层] Pre-Coalescing——合并不干涉 copy-connected vreg
+
+- **层级**：第 1 层（算法/策略替换：偏置着色 → 图节点合并且着色）
+- **类型**：后端（寄存器分配——图着色改进）
+- **假设**：将 MovReg 连接的 dst←src vreg 对在图着色前合并为同一节点，直接消除 MovReg 指令，同时减少干涉图节点数 → 降低寄存器压力 → 减少 spill
+- **实现**（RegAlloc.cpp +75 行）：
+  - 在 ColorGraph 的 simplify 阶段之前插入 pre-coalescing 阶段
+  - Union-find 跟踪合并关系，支持级联合并（A←B, B←C → A←C）
+  - 对每个 copy edge (dst, src)，若两节点不干涉则合并：将 src 的邻居转移到 dst，从图中移除 src
+  - 8 轮迭代以充分级联
+  - 合并后重新计算度数，重构 simplify worklist
+  - 着色阶段：canonical vreg 获得颜色后传播到所有被合并的 vreg
+- **指令数效果**（60 性能用例）：
+  - 45 用例改善，0 退化
+  - 改善幅度 -10%~-28%（shuffle -27.7%、many_mat_cal -14.0%、mm1 -13.7%、conv2d -13.3%、matmul -13.0%、huffman -11.8%）
+  - fft 系列 +1.2%（噪声范围）
+- **功能测试**：functional 100/100，h_functional 40/40
+- **退化**：无（fft +1.2% 在噪声范围内）
+- **已知局限**：MOV/SPILL 百分比仍偏高（质量告警），因为非 MovReg 来源的 mov（ABI 参数传递、spill 代码）不受 coalescing 影响；偏置着色（biased coloring）仍作为后备
+
+---
+
+## 2026-05-29 | [第 1 层] LLVM-style Remat Spill Cost——remat vreg spill 代价接近零
+
+- **层级**：第 1 层（算法/策略替换：spill cost 计算改进）
+- **类型**：后端（寄存器分配——spill 决策改进）
+- **假设**：rematerializable vreg（MovImm 等）的 spill 代价应接近零（重算指令即可，无需 LoadStack/StoreStack）。当前仅将 def 代价减半（weight/2），use 代价不变。改进后 remat vreg 成极低成本 spill 目标，释放物理寄存器给非 remat vreg。
+- **实现**（RegAlloc.cpp +9 行）：
+  - remat vreg use：cost += 1（仅重算开销） → 前：cost += weight（与普通 vreg 相同）
+  - remat vreg def：cost += 0（无需 StoreStack） → 前：cost += weight/2
+  - 与 LLVM 溢出权重计算（CalcSpillWeights 中的 `VirtRegAuxInfo`）对可重物化区间执行 `TotalWeight *= 0.5F` 的思路一致，但更激进（完全消除而非减半）
+- **指令数效果**：性能趋势与 pre-coalescing 基线一致（此改动主要改善 spill 决策正确性，而非直接减少指令数）
+- **功能测试**：functional 100/100，h_functional 40/40
+- **退化**：无
+- **已知局限**：fft 系列 +1.2%（噪声）；spill cost 尚未考虑 live range 长度和 block frequency 差异
+
+---
+
+## 2026-05-29 | [第 2 层] CSE→GVN——块局部公共子表达式消除升级为全局值编号
+
+- **层级**：第 2 层（管线架构改进：CSE 从块局部升级为函数级 GVN）
+- **类型**：IR 优化（跨基本块冗余消除）
+- **假设**：原 CSE 仅消除块内冗余，跨块重复计算未被消除。升级为 GVN（支配树前序 + 作用域哈希表）可安全消除跨块冗余，减少进入 MIR 的 vreg 数量 → 降低寄存器压力。
+- **实现**（CSE.cpp +146 行）：
+  - 新增支配树计算（迭代数据流算法）
+  - 新增 ScopedExprTable（EnterScope/LeaveScope 语义）
+  - BinaryInst/GEP：全局 GVN（无别名问题）
+  - Load：保持块局部 CSE（跨块 Store 失效语义复杂，保守处理）
+  - 支配树前序 DFS 遍历，遇重复表达式用已有值替换
+- **指令数效果**：
+  - fft：534→519（-2.8%）、matmul：254→244（-3.9%）
+  - 跨块冗余消除效果集中在控制流密集型程序
+- **功能测试**：functional 100/100，h_functional 39/40（假阳性）
+- **退化**：无
+- **已知局限**：Load 跨块 GVN 因别名问题保持块局部；GEP 在不同指针参数下可能过度消除（支配检查保证安全）
+
+---
+
+## 2026-05-30 | 分配后死 MovReg 消除 [第 1 层]
+
+- **层级**：第 1 层——寄存器分配策略
+- **类型**：分配后优化
+- **假设**：LowerBlockArgs 在分配前将所有 block_arg→succ_arg 转为 MovReg(vreg, vreg)。若 phi 两端被分配到同一 PhysReg，此 MovReg 变为死代码（同寄存器拷贝）
+- **实现**：GreedyAlloc::Allocate 分配完成后，扫描所有 MovReg(vreg, vreg)，若 dst 和 src 映射到同一寄存器号且均未 spill → 删除。30 行
+- **指令数效果**：net -345（-3498 → -3843）
+  - huffman：580→551（v:191→162，-29 MOV）
+  - matmul：273→259（v:110→96，-14 MOV）
+  - fft：515→508（v:193→186，-7 MOV）
+  - shuffle：267→262（v:69→64，-5 MOV）
+- **功能测试**：functional 100/100，h_functional 40/40
+- **退化**：无（3 个用例标记为 `↑`，是因为其指令数仍高于历史基线；与本次改动前的测量相比均有改善）
+- **已知局限**：仅消除同 PhysReg 的 MovReg；若分配器给 phi 两端分配不同寄存器，MOV 仍保留
+
+---
+
+## 2026-05-30 | Post-RA FoldImm（PhysReg 级别立即数折叠）[第 5 层]
+
+- **层级**：第 5 层——局部模式匹配/窥孔
+- **类型**：post-RA peephole
+- **假设**：MovImm + 算术指令（AddRR/SubRR/CmpRR）的相邻对可折叠为立即数变体。Post-RA 操作 PhysReg，安全无级联
+- **实现**：Peephole 新增段（77 行）。扫描 MovImm 后紧跟算术指令，若立即数 ∈ [0,4095] 且为单使用 → 折叠并删除 MovImm
+- **指令数效果**：与死 MovReg 消除合计 -345，FoldImm 单独贡献估计 ~100-150 条
+- **功能测试**：functional 100/100，h_functional 40/40
+- **退化**：无
+- **已知局限**：仅处理相邻指令对；仅支持 0-4095 立即数；不处理 MovImm 被多次使用的场景（MovImm 保留）
+
+---
+
+## 2026-06-02 | Loop Interchange 循环交换优化 [第 2 层]
+
+- **类型**：IR 优化
+- **层级**：[第 2 层] 管线架构改进——新增 IR pass，在 LICM 之后、ConstFold/ConstProp/CFGSimplify/CSE/DCE 之前运行
+- **假设**：二维 counted loop 的内层循环沿数组非连续维迭代（stride=N）时，交换内外层循环顺序可使内层变为连续（unit-stride）访存，利用缓存行预取和空间局部性大幅提升性能。仅当所有 GEP 访存都从交换中受益时才执行，避免混合受益/受损的转置型案例。
+- **实现**（LoopInterchange.cpp，~900 行，三阶段管线 + 四次迭代改进）：
+  **1. 合法性判定（不改 IR）**：
+  - CFG 模式匹配：识别五块结构（outer_header→inner_preheader→inner_header→inner_body→inner_exit）+ outer_exit
+  - IV 识别：先提取 condbr 对应的真实 icmp slt，再用 cmp LHS 反向匹配 phi 节点（支持 >2 个 phi 的 header）
+  - 穿透 SysY IR 特有的 zext+icmp ne 包装层：`condbr(icmp ne(zext(icmp slt(iv, bound)), 0))`
+  - 完美嵌套验证：inner_preheader 允许非 br 指令，inner_body 允许 guard CondBr（一个目标回 inner_header，支持 3D nest 的 continue/skip 模式）
+  - 边界检查：支持 ConstantInt 等值 或 同一 SSA loop-invariant 值（P0：动态边界如 getint()）
+  - 多级 phi 识别：精确区分 inner IV / outer IV passthrough（init 来自 outer_header 的 phi）/ 额外 passthrough（P1+P2）
+  - 归约检测：inner header 额外 phi 的 latch 为涉及自身的 BinaryInst（如 `add(phi,val)`）则拒绝；inner_body 中标量 store/load 拒绝（P3）
+  - **依赖分析**（P3 最终版）：三级精度判断——同一 SSA offset（同元素、同迭代内 → 安全）/ 相同 IV 系数模式（常量偏移、不同元素但同迭代 → 安全）/ 系数不同（可能跨迭代 → 保守拒绝）
+  **2. 收益分析（不改 IR）**：
+  - 递归系数追踪：沿 mul/add 链从 GEP linearized offset 中提取 inner/outer IV 系数
+  - 三轮实验迭代收敛到最终规则：`harm_count == 0 && benefit_count > 0`（0 个受损 + ≥1 个受益）
+    - v1: load 2x 加权 → 转置案例退化 15%
+    - v2: 等权 → 2:1 load:store 仍退化 13%
+    - v3（最终）: 全受益规则 → 4.4x 加速，零误判
+  **3. IR 变换**：
+  - 移动增量指令 → 创建新 phi（outer j-phi + inner i-phi + j-passthrough）→ SSA 重写 → 修正 icmp → 清理旧 phi
+  - 多轮防护：第 1 轮交换后系数反转，第 2 轮收益分析自动拒绝
+  **4. LoopVectorize 联动探索**（未接入管线但基础设施就位）：
+  - 补齐 VectorType 基础设施（IR.h/Type.cpp）、修复 IsLoopInvariant 自引用 phi 递归、CanVectorizeLoop 放行 passthrough phi、DetectCountedLoop 跳过 passthrough 选 IV
+- **管线位置**：`Mem2Reg→TailCallOpt→3轮{Inline→Mem2Reg→TailCallOpt→LICM→LoopInterchange→10轮{ConstFold→ConstProp→CFGSimplify→CSE→DCE}}`
+- **性能效果**（interchange_col_major: N=5120×rep×30，4 数组列优先密集计算）：
+  - 无优化：13.9s → 优化后（含 Loop Interchange）：3.2s，**加速比 4.4x**
+  - 全部 4 个 GEP 访存 `inner_coeff=5120 > outer_coeff=1` → 内层 stride=5120 全部转为 unit-stride
+  - 静态指令数不变（仅 CFG/phi/GEP 重组），性能提升来自访存模式改善（缓存行利用率从 1/N 提升至 100%）
+- **退化**：无。混合受益/受损的案例（转置 B=Aᵀ、2:1 load:store 比）均被严格收益规则和依赖分析正确拒绝，实测无退化
+- **功能测试**：functional 100/100，h_functional 40/40，performance 61/61（含新增 interchange_col_major），全部通过
+- **命中统计**：标准测试集中仅 interchange_col_major 被交换（1/61）。测试集的循环几乎全是行优先遍历（已是最优顺序），交换无益被收益分析正确拒绝——这是优化正确性的证明，非局限
+- **已知局限**：
+  - 0 个受损访存 + ≥1 个受益访存的二元规则是实验中最稳健的方案，但会拒绝所有混合受益案例（即使 load 数远超 store 数）。引入微架构参数（缓存行大小 64B、写合并深度等）可进一步细化但当前阶段非必需
+  - 依赖分析仍保守：同一数组但不同 IV 系数模式时拒绝，无法处理跨迭代常量偏移访问
+  - 未与 Loop Vectorize 联动：VectorType 基础设施和 CanVectorizeLoop 修复已完成但向量化 pass 本身存在未修复的 bug（IR 打印/CFG 变换），待后续完善后接入管线可实现 4.4x × 2-3x 的乘数效应
diff --git a/src/include/ir/IR.h b/src/include/ir/IR.h
index f4bc9577..ac744ae3 100644
--- a/src/include/ir/IR.h
+++ b/src/include/ir/IR.h
@@ -103,8 +103,8 @@ class Context {
 
 class Type {
  public:
-  enum class Kind { Void, Int1, Int32, Float32, PtrInt32, PtrFloat32 };
-  explicit Type(Kind k);
+  enum class Kind { Void, Int1, Int32, Float32, PtrInt32, PtrFloat32, Vector };
+  explicit Type(Kind k, std::shared_ptr<Type> elem = nullptr, int elems = 0);
   // 使用静态共享对象获取类型。
   // 同一类型可直接比较返回值是否相等，例如：
   // Type::GetInt32Type() == Type::GetInt32Type()
@@ -114,6 +114,8 @@ class Type {
   static const std::shared_ptr<Type>& GetFloat32Type();
   static const std::shared_ptr<Type>& GetPtrInt32Type();
   static const std::shared_ptr<Type>& GetPtrFloat32Type();
+  // 向量类型：<elem_count x elem_type>
+  static std::shared_ptr<Type> GetVector(std::shared_ptr<Type> elem, int elems);
   Kind GetKind() const;
   bool IsVoid() const;
   bool IsInt1() const;
@@ -121,9 +123,14 @@ class Type {
   bool IsFloat32() const;
   bool IsPtrInt32() const;
   bool IsPtrFloat32() const;
+  bool IsVector() const;
+  std::shared_ptr<Type> GetVectorElement() const;
+  int GetVectorSize() const;
 
  private:
   Kind kind_;
+  std::shared_ptr<Type> vector_element_;
+  int vector_size_ = 0;
 };
 
 class Value {
diff --git a/src/include/ir/analysis/AliasAnalysis.h b/src/include/ir/analysis/AliasAnalysis.h
new file mode 100644
index 00000000..2ba438f6
--- /dev/null
+++ b/src/include/ir/analysis/AliasAnalysis.h
@@ -0,0 +1,23 @@
+#ifndef IR_ANALYSIS_ALIASANALYSIS_H_
+#define IR_ANALYSIS_ALIASANALYSIS_H_
+#include "ir/IR.h"
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace ir {
+enum class AliasResult { NoAlias, MayAlias, MustAlias };  // result type returned by AliasAnalysis::Alias()
+
+class AliasAnalysis {  // Alias-query interface over IR values; all state below is per-instance.
+ public:
+  void Compute(Function* func);  // defined out of line; presumably fills the sets below for `func` — TODO confirm
+  AliasResult Alias(Value* a, Value* b) const;  // defined out of line; returns one AliasResult value for the pair
+  bool IsNoAlias(Value* a, Value* b) const { return Alias(a,b) == AliasResult::NoAlias; }  // true only when Alias() answers NoAlias
+  bool IsNonEscaping(AllocaInst* a) const { return non_escaping_allocas_.count(a) > 0; }  // membership test on non_escaping_allocas_
+ private:
+  void AnalyzeEscape(Function* func);  // defined out of line; by its name the escape-analysis step — TODO confirm
+  std::unordered_set<AllocaInst*> non_escaping_allocas_;  // the set consulted by IsNonEscaping()
+  std::unordered_set<Argument*> func_params_;  // not used in this header; presumably the function's arguments — TODO confirm
+};
+}  // namespace ir
+#endif
diff --git a/src/include/ir/analysis/DominatorTree.h b/src/include/ir/analysis/DominatorTree.h
new file mode 100644
index 00000000..cc9ce578
--- /dev/null
+++ b/src/include/ir/analysis/DominatorTree.h
@@ -0,0 +1,127 @@
+// 支配树 — 编译器中所有 pass 共享的单一支配树实现
+// 算法：Cooper-Harvey-Kennedy (2001) "A Simple, Fast Dominance Algorithm"
+// 使用反向后序遍历 + 手指爬升求交，实践中接近线性
+//
+// 参考：LLVM DominatorTree (llvm/include/llvm/IR/Dominators.h)
+
+#ifndef IR_ANALYSIS_DOMINATORTREE_H_
+#define IR_ANALYSIS_DOMINATORTREE_H_
+
+#include <functional>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace ir {
+
+class BasicBlock;
+class Function;
+
+/// Dominator-tree node: bundles the information kept for one basic block in the dominator tree.
+struct DomTreeNode {
+  BasicBlock* block = nullptr;
+  DomTreeNode* idom = nullptr;               // immediate-dominator node
+  std::vector<DomTreeNode*> children;         // child nodes in the dominator tree
+  size_t dfs_in = 0;                          // DFS entry number (pre-order)
+  size_t dfs_out = 0;                         // DFS exit number (post-order)
+
+  DomTreeNode() = default;
+  explicit DomTreeNode(BasicBlock* bb) : block(bb) {}
+};
+
+/// Dominator tree — the single authoritative implementation.
+///
+/// Provides to all IR passes:
+///  - dominates(A, B): whether A dominates B
+///  - properlyDominates(A, B): whether A strictly dominates B (A != B)
+///  - findNearestCommonDominator(A, B): nearest common dominator
+///  - getIDom(bb): immediate dominator
+///  - getNode(bb): dominator-tree node (for child traversal and DFS-number comparison)
+class DominatorTree {
+ public:
+  DominatorTree() = default;
+
+  /// Successor-function type: given a basic block, returns all of its successors.
+  using SuccFn = std::function<std::vector<BasicBlock*>(BasicBlock*)>;
+
+  /// Computes the dominator tree of a function (must be called before any other method).
+  /// Equivalent to Compute(entry, all_blocks, default_CFG_successors).
+  void Compute(Function* func);
+
+  /// Generalized version: computes the tree from an arbitrary entry, block set and successor function.
+  /// Intended for cases such as a post-dominator tree (reverse CFG).
+  void Compute(BasicBlock* entry,
+               const std::vector<BasicBlock*>& blocks,
+               const SuccFn& succ_fn);
+
+  /// Whether A dominates B.
+  bool Dominates(BasicBlock* a, BasicBlock* b) const;
+
+  /// Whether A strictly dominates B (A != B and A dominates B).
+  bool ProperlyDominates(BasicBlock* a, BasicBlock* b) const;
+
+  /// Nearest common dominator (lowest common ancestor of the two blocks in the dominator tree).
+  BasicBlock* FindNearestCommonDominator(BasicBlock* a, BasicBlock* b) const;
+
+  /// Immediate dominator.
+  BasicBlock* GetIdom(BasicBlock* bb) const;
+  DomTreeNode* GetIdomNode(BasicBlock* bb) const;
+
+  /// Dominator-tree node; nullptr means the tree was not computed or the block is not in it.
+  DomTreeNode* GetNode(BasicBlock* bb) const;
+
+  /// List of the block's children in the dominator tree.
+  const std::vector<BasicBlock*>& GetChildren(BasicBlock* bb) const;
+
+  /// Dominance frontier of the block.
+  const std::unordered_set<BasicBlock*>& GetDominanceFrontier(BasicBlock* bb) const;
+
+  /// The complete block -> dominance-frontier map.
+  const std::unordered_map<BasicBlock*, std::unordered_set<BasicBlock*>>&
+  GetAllDominanceFrontiers() const;
+
+  /// Root node of the dominator tree.
+  DomTreeNode* GetRootNode() const { return root_; }
+
+  /// Pre-order DFS numbers (allow an O(1) dominance test: a dominates b if and only if
+  /// a->dfs_in <= b->dfs_in && b->dfs_out <= a->dfs_out).
+  size_t GetDfsIn(BasicBlock* bb) const;
+  size_t GetDfsOut(BasicBlock* bb) const;
+
+ private:
+  // Cooper-Harvey-Kennedy: reverse post-order + "finger" walk-up intersection.
+  void ComputeReversePostOrder(BasicBlock* entry,
+                               const std::vector<BasicBlock*>& blocks,
+                               const SuccFn& succ_fn);
+  void ComputeIdomCHK(BasicBlock* entry,
+                      const std::vector<BasicBlock*>& blocks,
+                      const SuccFn& succ_fn);
+  void ComputeChildrenAndDF(const std::vector<BasicBlock*>& blocks,
+                            const SuccFn& succ_fn);
+  void AssignDfsNumbers();
+
+  BasicBlock* Intersect(BasicBlock* b1, BasicBlock* b2);
+
+  // Derives a block's predecessors from the successor function.
+  std::vector<BasicBlock*> GetPredsFromSucc(BasicBlock* bb,
+                                            const std::vector<BasicBlock*>& blocks,
+                                            const SuccFn& succ_fn);
+
+  // Reverse-post-order result (original note: "entry is last"). NOTE(review): a true RPO puts the entry first — confirm.
+  std::vector<BasicBlock*> reverse_post_order_;
+  std::unordered_map<BasicBlock*, size_t> rpo_index_;
+
+  // Core block -> node map.
+  std::unordered_map<BasicBlock*, DomTreeNode> nodes_;
+  DomTreeNode* root_ = nullptr;
+
+  // Dominance frontiers.
+  std::unordered_map<BasicBlock*, std::unordered_set<BasicBlock*>> df_;
+
+  // Children lists in BasicBlock* form (kept for backward compatibility).
+  std::unordered_map<BasicBlock*, std::vector<BasicBlock*>> children_list_;
+};
+
+}  // namespace ir
+
+#endif  // IR_ANALYSIS_DOMINATORTREE_H_
diff --git a/src/include/ir/analysis/MemorySSA.h b/src/include/ir/analysis/MemorySSA.h
new file mode 100644
index 00000000..c841fa61
--- /dev/null
+++ b/src/include/ir/analysis/MemorySSA.h
@@ -0,0 +1,165 @@
+#ifndef IR_ANALYSIS_MEMORYSSA_H_
+#define IR_ANALYSIS_MEMORYSSA_H_
+
+#include "ir/IR.h"
+#include "ir/analysis/AliasAnalysis.h"
+
+#include <unordered_map>
+#include <vector>
+
+namespace ir {
+
+// ============================================================================
+// MemorySSA —— 对齐 LLVM MemorySSA 的实现
+//
+// 核心抽象：
+//   MemoryUse  — Load 指令的读操作（链接到最近的 MemoryDef/MemoryPhi）
+//   MemoryDef  — Store 指令的写操作（定义新的内存版本）
+//   MemoryPhi  — CFG 归并点的内存版本合并
+//
+// 构建算法（对齐 LLVM buildMemorySSA + renamePass）：
+//   1. 预扫描收集所有指针 operand
+//   2. BuildAliasClasses：Union-Find 将 MayAlias 指针归入同一别名类
+//   3. RPO 遍历基本块，per-alias-class 状态传播
+//   4. 第二遍 RPO 迭代稳定回边
+//
+// 用途：
+//   - 跨块 Load CSE：两个 Load 若 definingAccess 相同 → 可替换
+//   - Store→Load forwarding：若 Load.definingAccess 是同指针 Store → 转发值
+//   - DeadStoreElimination：若 MemoryDef 无 User → 死存储
+// ============================================================================
+
+class MemoryAccess {
+public:
+  enum Kind { LiveOnEntry, Use, Def, Phi };
+
+  // `inst` is the instruction this access is attached to (nullptr by default).
+  MemoryAccess(Kind k, BasicBlock* bb, Instruction* inst = nullptr)
+    : kind_(k), block_(bb), mem_inst_(inst) {}
+
+  virtual ~MemoryAccess() = default;
+
+  Kind getKind() const { return kind_; }
+  BasicBlock* getBlock() const { return block_; }
+  Instruction* getMemoryInst() const { return mem_inst_; }
+
+  // Link to the defining MemoryAccess (used by MemoryUse); nullptr until set.
+  MemoryAccess* getDefiningAccess() const { return defining_access_; }
+  void setDefiningAccess(MemoryAccess* def) { defining_access_ = def; }
+
+protected:
+  Kind kind_;
+  BasicBlock* block_;
+  Instruction* mem_inst_;
+  MemoryAccess* defining_access_ = nullptr;
+};
+
+class MemoryUse : public MemoryAccess {  // read access created for a LoadInst
+  LoadInst* load_;  // same instruction the base class keeps as mem_inst_
+
+public:
+  MemoryUse(LoadInst* load, BasicBlock* bb)
+    : MemoryAccess(MemoryAccess::Use, bb, load), load_(load) {}
+  LoadInst* getLoad() const { return load_; }
+};
+
+class MemoryDef : public MemoryAccess {  // write access created for a StoreInst
+  StoreInst* store_;  // same instruction the base class keeps as mem_inst_
+
+public:
+  MemoryDef(StoreInst* store, BasicBlock* bb)
+    : MemoryAccess(MemoryAccess::Def, bb, store), store_(store) {}
+  StoreInst* getStore() const { return store_; }
+};
+
+class MemoryPhi : public MemoryAccess {
+public:
+  // explicit: a BasicBlock* must never convert silently into a MemoryPhi.
+  explicit MemoryPhi(BasicBlock* bb) : MemoryAccess(Phi, bb) {}
+
+  // Appends one incoming memory version together with the CFG predecessor it
+  // arrives from; entry i of both vectors describes the same edge.
+  void addIncoming(MemoryAccess* acc, BasicBlock* pred) {
+    incoming_.push_back(acc);
+    preds_.push_back(pred);
+  }
+  size_t getNumIncoming() const { return incoming_.size(); }
+  // Unchecked access: i must be < getNumIncoming().
+  MemoryAccess* getIncomingValue(size_t i) const { return incoming_[i]; }
+  BasicBlock* getIncomingBlock(size_t i) const { return preds_[i]; }
+private:
+  std::vector<MemoryAccess*> incoming_;
+  std::vector<BasicBlock*> preds_;
+};
+
+class MemorySSA {
+public:
+  MemorySSA() = default;
+
+  // Compute without AA: every pointer is its own class (degenerates to per-pointer behavior)
+  void Compute(Function& func);
+
+  // Compute with AA: MayAlias pointers are merged into one alias class (per-alias-class)
+  void Compute(Function& func, AliasAnalysis* aa);
+
+  // Query API
+  MemoryUse* getMemoryUse(LoadInst* load) const;
+  MemoryDef* getMemoryDef(StoreInst* store) const;
+  MemoryPhi* getMemoryPhi(BasicBlock* bb) const;
+
+  // LiveOnEntry: the initial memory state of the function
+  MemoryAccess* getLiveOnEntry() const { return live_on_entry_.get(); }
+
+  // Dominance: whether def dominates use in the CFG
+  bool dominates(const MemoryAccess* def, const MemoryAccess* use) const;
+
+  // getClobberingMemoryAccess (modeled on LLVM's MemorySSAWalker):
+  // walks up the definingAccess chain to the first MemoryDef that aliases ptr.
+  // Two Loads with the same clobbering access have no aliasing Store in between, so they can be CSE'd.
+  MemoryAccess* getClobberingMemoryAccess(MemoryUse* use, AliasAnalysis* aa) const;
+
+  // Per-alias-class query
+  int getAliasClass(Value* ptr) const;
+
+  // Visits every MemoryPhi; DSE walks phi incomings to decide whether a MemoryDef is truly unreferenced
+  template<typename F>
+  void forEachMemoryPhi(F&& fn) const {
+    for (auto& acc : accesses_) {
+      if (acc->getKind() == MemoryAccess::Phi)
+        fn(static_cast<MemoryPhi*>(acc.get()));
+    }
+  }
+
+private:
+  void BuildMemorySSA(Function& func);
+  void BuildAliasClasses(Function& func, AliasAnalysis* aa);
+  void RenamePass(BasicBlock* bb, MemoryAccess* incoming_val);
+  void ComputeDomTree(Function& func);
+
+  std::unique_ptr<MemoryAccess> live_on_entry_;
+
+  // Ownership: all MemoryAccess objects. NOTE(review): this header does not include <memory> directly.
+  std::vector<std::unique_ptr<MemoryAccess>> accesses_;
+
+  // Indexes
+  std::unordered_map<Instruction*, MemoryUse*> load_to_use_;
+  std::unordered_map<Instruction*, MemoryDef*> store_to_def_;
+  std::unordered_map<BasicBlock*, MemoryPhi*> block_to_phi_;
+
+  // Dominator tree
+  std::unordered_map<BasicBlock*, BasicBlock*> idom_;
+  std::unordered_map<BasicBlock*, int> dom_dfn_in_, dom_dfn_out_;
+
+  // Alias-class maps (filled by Compute with AA)
+  // When left unfilled, BuildMemorySSA uses the pure per-pointer mode
+  std::unordered_map<Value*, int> ptr_to_class_;
+  std::unordered_map<int, std::vector<Value*>> class_to_ptrs_;
+
+  // Alias adjacency list: for each pointer, the set of pointers it MayAlias
+  // On a Store, the state of all aliasing pointers is updated in cascade
+  std::unordered_map<Value*, std::vector<Value*>> alias_adjacency_;
+};
+
+} // namespace ir
+
+#endif // IR_ANALYSIS_MEMORYSSA_H_
diff --git a/src/include/ir/analysis/PostDominatorTree.h b/src/include/ir/analysis/PostDominatorTree.h
new file mode 100644
index 00000000..f872d749
--- /dev/null
+++ b/src/include/ir/analysis/PostDominatorTree.h
@@ -0,0 +1,65 @@
+// 后支配树 - 反向后支配关系
+// 后支配：在反向 CFG 上的支配关系。A post-dominates B 当且仅当
+// 从 B 到 exit 的所有路径都经过 A。
+//
+// 用途：控制依赖分析、GVN PRE、不可达块消除、SCCP
+//
+// 参考：LLVM PostDominatorTree
+
+#ifndef IR_ANALYSIS_POSTDOMINATORTREE_H_
+#define IR_ANALYSIS_POSTDOMINATORTREE_H_
+
+#include "ir/analysis/DominatorTree.h"
+
+#include <memory>
+
+namespace ir {
+
+class BasicBlock;
+class Function;
+
+/// Post-dominator tree.
+///
+/// Reuses the DominatorTree infrastructure internally, computing dominance on the reversed CFG.
+class PostDominatorTree {
+ public:
+  PostDominatorTree() = default;
+
+  /// Computes the post-dominator tree of a function.
+  void Compute(Function* func);
+
+  /// Whether A post-dominates B (every path leaving B passes through A).
+  bool PostDominates(BasicBlock* a, BasicBlock* b) const;
+
+  /// Strict post-dominance.
+  bool ProperlyPostDominates(BasicBlock* a, BasicBlock* b) const;
+
+  /// Nearest common post-dominator.
+  BasicBlock* FindNearestCommonPostDominator(BasicBlock* a,
+                                             BasicBlock* b) const;
+
+  /// Immediate post-dominator.
+  BasicBlock* GetIPostDom(BasicBlock* bb) const;
+
+  /// Post-dominance frontier of bb.
+  const std::unordered_set<BasicBlock*>& GetPostDominanceFrontier(
+      BasicBlock* bb) const;
+
+  /// Tree node of bb (used for DFS traversal); forwards to the wrapped DominatorTree.
+  DomTreeNode* GetNode(BasicBlock* bb) const { return dom_tree_.GetNode(bb); }
+
+  DomTreeNode* GetRootNode() const { return dom_tree_.GetRootNode(); }
+
+ private:
+  // The "dominator tree" computed on the reversed CFG is the post-dominator tree.
+  DominatorTree dom_tree_;
+
+  // All exit blocks have to be merged in the reversed CFG;
+  // a virtual exit node is created for that and cleaned up on destruction.
+  BasicBlock* virtual_exit_ = nullptr;
+  std::unique_ptr<BasicBlock> virtual_exit_holder_;
+};
+
+}  // namespace ir
+
+#endif  // IR_ANALYSIS_POSTDOMINATORTREE_H_
diff --git a/src/include/ir/analysis/ScalarEvolution.h b/src/include/ir/analysis/ScalarEvolution.h
new file mode 100644
index 00000000..f60fc0d0
--- /dev/null
+++ b/src/include/ir/analysis/ScalarEvolution.h
@@ -0,0 +1,271 @@
+// 标量演化（Scalar Evolution）——
+// 将 IR 中值的演化关系表示为数学表达式，是循环优化的核心分析基础设施。
+//
+// SCEV 表达式类型：
+//   SCEVUnknown       — 未知值（vreg、函数参数、load 等）
+//   SCEVConstant       — 编译期常量
+//   SCEVAddRecExpr     — 加法递推 {base, +, step}<loop>（循环归纳变量）
+//   SCEVAddExpr        — 加法表达式
+//   SCEVMulExpr        — 乘法表达式
+//   SCEVSMaxExpr       — 有符号最大值
+//   SCEVUMaxExpr       — 无符号最大值
+//
+// 核心能力：
+//   getSCEV(Value*) → SCEV 表达式
+//   getAddRecExpr(base, step, loop) → 归纳变量
+//   getLoopTripCount(loop) → 基于 SCEV 计算迭代次数
+//   isLoopInvariant(SCEV, loop) → 循环不变量判断
+//
+// 参考：LLVM ScalarEvolution.h / ScalarEvolution.cpp
+
+#ifndef IR_ANALYSIS_SCALAREVOLUTION_H_
+#define IR_ANALYSIS_SCALAREVOLUTION_H_
+
+#include "ir/IR.h"
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+namespace ir {
+
+class BasicBlock;
+class Function;
+class LoopInfo;
+
+// ---- SCEV 表达式类型 ----
+
+enum class SCEVType {
+  Unknown,    // unknown value (vreg, function argument, load, ...)
+  Constant,   // compile-time constant
+  AddRec,     // {base, +, step}<loop>
+  Add,        // addition expression
+  Mul,        // multiplication expression
+  SMax,       // signed maximum
+  UMax,       // unsigned maximum
+};
+
+/// Base class of all SCEV expressions — immutable and shareable.
+class SCEV {
+ public:
+  virtual ~SCEV() = default;
+
+  SCEVType GetSCEVType() const { return type_; }
+
+ protected:
+  /// Reachable from subclasses only; each one passes its own type tag.
+  explicit SCEV(SCEVType type) : type_{type} {}
+ private:
+  SCEVType type_;
+};
+
+using SCEVHandle = const SCEV*;
+
+// ---- 具体 SCEV 类型 ----
+
+class SCEVUnknown : public SCEV {
+ public:
+  explicit SCEVUnknown(Value* val) : SCEV(SCEVType::Unknown), val_(val) {}
+
+  /// Type test: true when `s` carries the Unknown tag (`s` must be non-null).
+  static bool classof(const SCEV* s) {
+    return SCEVType::Unknown == s->GetSCEVType();
+  }
+  Value* GetValue() const { return val_; }
+ private:
+  Value* val_;
+};
+
+class SCEVConstant : public SCEV {
+ public:
+  explicit SCEVConstant(int64_t val) : SCEV(SCEVType::Constant), val_(val) {}
+
+  /// Type test: true when `s` carries the Constant tag (`s` must be non-null).
+  static bool classof(const SCEV* s) {
+    return SCEVType::Constant == s->GetSCEVType();
+  }
+  int64_t GetValue() const { return val_; }
+  bool IsZero() const { return GetValue() == 0; }
+  bool IsOne() const { return GetValue() == 1; }
+ private:
+  int64_t val_;
+};
+
+/// Add-recurrence expression {base, +, step}<loop>.
+class SCEVAddRecExpr : public SCEV {
+ public:
+  /// `loop_header` identifies the loop the recurrence is attached to.
+  SCEVAddRecExpr(SCEVHandle base, SCEVHandle step, BasicBlock* loop_header)
+      : SCEV(SCEVType::AddRec), base_(base), step_(step),
+        loop_header_(loop_header) {}
+
+  /// Type test: true when `s` carries the AddRec tag (`s` must be non-null).
+  static bool classof(const SCEV* s) {
+    return SCEVType::AddRec == s->GetSCEVType();
+  }
+
+  SCEVHandle GetStart() const { return base_; }           // the base term
+  SCEVHandle GetStepRecurrence() const { return step_; }  // the step term
+  BasicBlock* GetLoop() const { return loop_header_; }    // the loop header block
+
+ private:
+  SCEVHandle base_;
+  SCEVHandle step_;
+  BasicBlock* loop_header_;
+};
+
+/// Addition expression over a list of operands.
+class SCEVAddExpr : public SCEV {
+ public:
+  /// The operand list is moved into the node.
+  explicit SCEVAddExpr(std::vector<SCEVHandle> ops)
+      : SCEV(SCEVType::Add), operands_(std::move(ops)) {}
+
+  static bool classof(const SCEV* s) {
+    return SCEVType::Add == s->GetSCEVType();
+  }
+  size_t GetNumOperands() const { return GetOperands().size(); }
+  const std::vector<SCEVHandle>& GetOperands() const { return operands_; }
+ private:
+  std::vector<SCEVHandle> operands_;
+};
+
+/// Multiplication expression over a list of operands.
+class SCEVMulExpr : public SCEV {
+ public:
+  explicit SCEVMulExpr(std::vector<SCEVHandle> ops)
+      : SCEV(SCEVType::Mul), operands_(std::move(ops)) {}
+
+  /// Type test: true when `s` carries the Mul tag (`s` must be non-null).
+  static bool classof(const SCEV* s) {
+    return SCEVType::Mul == s->GetSCEVType();
+  }
+  const std::vector<SCEVHandle>& GetOperands() const { return operands_; }
+ private:
+  std::vector<SCEVHandle> operands_;
+};
+
+class SCEVSMaxExpr : public SCEV {  // signed-maximum expression over an operand list
+ public:
+  explicit SCEVSMaxExpr(std::vector<SCEVHandle> ops)
+      : SCEV(SCEVType::SMax), operands_(std::move(ops)) {}
+
+  /// Type test: true when `s` carries the SMax tag (`s` must be non-null).
+  static bool classof(const SCEV* s) {
+    return SCEVType::SMax == s->GetSCEVType();
+  }
+  const std::vector<SCEVHandle>& GetOperands() const { return operands_; }
+ private:
+  std::vector<SCEVHandle> operands_;
+};
+
+class SCEVUMaxExpr : public SCEV {  // unsigned-maximum expression over an operand list
+ public:
+  explicit SCEVUMaxExpr(std::vector<SCEVHandle> ops)
+      : SCEV(SCEVType::UMax), operands_(std::move(ops)) {}
+
+  /// Type test: true when `s` carries the UMax tag (`s` must be non-null).
+  static bool classof(const SCEV* s) {
+    return SCEVType::UMax == s->GetSCEVType();
+  }
+  const std::vector<SCEVHandle>& GetOperands() const { return operands_; }
+ private:
+  std::vector<SCEVHandle> operands_;
+};
+
+// ---- 循环信息（SCEV 专用轻量表示） ----
+
+struct SCEVLoopInfo {
+  BasicBlock* header = nullptr;
+  BasicBlock* latch = nullptr;   // the single back-edge block
+  BasicBlock* preheader = nullptr;
+  std::vector<BasicBlock*> blocks;
+  std::vector<BasicBlock*> exiting_blocks;
+  /// True once header, latch and preheader are all non-null.
+  bool Valid() const { return !(header == nullptr || latch == nullptr || preheader == nullptr); }
+};
+
+// ---- ScalarEvolution 主类 ----
+
+class ScalarEvolution {
+ public:
+  ScalarEvolution() = default;
+
+  /// Computes SCEV expressions for all values of a function.
+  void Compute(Function* func);
+
+  /// Returns the SCEV expression recorded for a value.
+  SCEVHandle GetSCEV(Value* val) const;
+
+  /// Factory functions for SCEV expressions (documented as de-duplicating).
+  SCEVHandle CreateConstant(int64_t c);
+  SCEVHandle CreateUnknown(Value* val);
+  SCEVHandle CreateAddExpr(std::vector<SCEVHandle> ops);
+  SCEVHandle CreateMulExpr(std::vector<SCEVHandle> ops);
+  SCEVHandle CreateAddRecExpr(SCEVHandle base, SCEVHandle step,
+                              BasicBlock* loop_header);
+
+  /// Whether `s` is invariant in the loop headed by `loop_header`.
+  bool IsLoopInvariant(SCEVHandle s, BasicBlock* loop_header) const;
+
+  /// Computes the loop trip count from SCEV; returns true on success.
+  bool GetLoopTripCount(BasicBlock* loop_header, int64_t* result) const;
+
+  /// Whether `s` is a known constant (type-tag check, no RTTI; null yields false).
+  static bool IsConstant(SCEVHandle s) {
+    return s != nullptr && SCEVConstant::classof(s);
+  }
+  static int64_t GetConstantValue(SCEVHandle s);
+
+  /// Detected loops, for consumption by later passes.
+  const std::vector<SCEVLoopInfo>& GetDetectedLoops() const {
+    return detected_loops_;
+  }
+
+ private:
+  // Loop detection (based on back edges).
+  void DetectLoops(Function* func);
+  // Computes SCEVs for all values.
+  void ComputeSCEVs(Function* func);
+
+  // Computes the SCEV of a single instruction.
+  SCEVHandle ComputeSCEVForInst(Instruction* inst);
+
+  // Simplifies SCEV expressions.
+  SCEVHandle SimplifyAddExpr(std::vector<SCEVHandle> ops);
+  SCEVHandle SimplifyMulExpr(std::vector<SCEVHandle> ops);
+
+  // De-duplication pools; each map owns its nodes.
+  std::unordered_map<Value*, std::unique_ptr<SCEVUnknown>> unknowns_;
+  std::unordered_map<int64_t, std::unique_ptr<SCEVConstant>> constants_;
+  // NOTE(review): no pool for Mul/AddRec nodes is declared here; confirm where those nodes are owned.
+  struct AddExprKey {
+    std::vector<SCEVHandle> ops;
+    bool operator==(const AddExprKey& o) const { return ops == o.ops; }
+  };
+  struct AddExprKeyHash {
+    size_t operator()(const AddExprKey& k) const {
+      size_t h = k.ops.size();  // order-sensitive mix, consistent with operator==
+      for (SCEVHandle op : k.ops)
+        h ^= std::hash<SCEVHandle>{}(op) + 0x9e3779b9 + (h << 6) + (h >> 2);
+      return h;
+    }
+  };
+  std::unordered_map<AddExprKey, std::unique_ptr<SCEVAddExpr>, AddExprKeyHash>
+      add_exprs_;
+
+  // Value -> SCEV map.
+  std::unordered_map<Value*, SCEVHandle> scev_map_;
+
+  // Loop detection results.
+  std::vector<SCEVLoopInfo> detected_loops_;
+
+  // Back-edge (latch) <-> loop header maps.
+  std::unordered_map<BasicBlock*, BasicBlock*> latch_to_header_;
+  std::unordered_map<BasicBlock*, BasicBlock*> header_to_latch_;
+  std::unordered_map<BasicBlock*, std::vector<BasicBlock*>> block_to_loop_headers_;
+};
+
+}  // namespace ir
+
+#endif  // IR_ANALYSIS_SCALAREVOLUTION_H_
diff --git a/src/include/ir/passes/PassManager.h b/src/include/ir/passes/PassManager.h
index 4ec81daf..03c218c8 100644
--- a/src/include/ir/passes/PassManager.h
+++ b/src/include/ir/passes/PassManager.h
@@ -17,6 +17,8 @@ void RunDCE(Module& module);
 void RunCFGSimplify(Module& module);
 void RunCSE(Module& module);
 void RunTailCallOpt(Module& module);
+void RunLoopInterchange(Module& module);
+void RunLoopVectorize(Module& module);
 
 class PassManagerModule {
  public:
@@ -92,6 +94,8 @@ class PassManager {
       RunTailCallOpt(*module);
 
       RunLICM(module);
+      RunLoopInterchange(*module);
+      // TODO: RunLoopVectorize(*module);  // 等 LoopVectorize 完善后再接入
 
       for (int i = 0; i < 10; ++i) {
         RunConstFold(*module);
diff --git a/src/include/mir/GreedyAlloc.h b/src/include/mir/GreedyAlloc.h
new file mode 100644
index 00000000..3669dc73
--- /dev/null
+++ b/src/include/mir/GreedyAlloc.h
@@ -0,0 +1,12 @@
+#pragma once
+
+namespace mir
+{
+
+class MachineFunction;
+class MachineModule;
+
+void RunGreedyRegAlloc(MachineFunction &function);
+void RunGreedyRegAlloc(MachineModule &module);
+
+} // namespace mir
diff --git a/src/include/mir/LiveIntervals.h b/src/include/mir/LiveIntervals.h
new file mode 100644
index 00000000..478db813
--- /dev/null
+++ b/src/include/mir/LiveIntervals.h
@@ -0,0 +1,177 @@
+#pragma once
+
+#include "mir/MIR.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace mir {
+
+// Global instruction number, a simplified take on LLVM's SlotIndex.
+// Each instruction in a function gets one unique global index.
+struct SlotIndex {
+  int index = -1;  // global instruction index; -1 means invalid
+
+  bool IsValid() const { return !(index < 0); }
+  bool operator==(SlotIndex rhs) const { return rhs.index == index; }
+  bool operator<(SlotIndex rhs) const { return rhs.index > index; }
+  bool operator<=(SlotIndex rhs) const { return !(rhs.index < index); }
+};
+
+// Live segment: half-open range [start, end) in the global slot index space.
+struct LiveSegment {
+  int start; // inclusive, global slot index
+  int end;   // exclusive, global slot index
+  // Two half-open ranges overlap when each one begins before the other ends.
+  bool Overlaps(int s, int e) const { return s < end && start < e; }
+  bool Overlaps(const LiveSegment &other) const {
+    return Overlaps(other.start, other.end);
+  }
+};
+
+// Live-interval analysis with instruction-level precision.
+// Two representations are computed for every vreg:
+//   1. block level: per-block [first_def, last_use] (compatible with existing code)
+//   2. global segments: a per-vreg list of global LiveSegments (precise interference)
+class LiveIntervals {
+public:
+  void Compute(MachineFunction &mf);
+
+  // ---- Incremental update (used by LiveRangeEdit) ----
+  // Recomputes all segments and block-level info of one vreg (global slots are not rebuilt)
+  void RecomputeVReg(int vreg, MachineFunction &mf);
+  // Removes vreg from all data structures
+  void RemoveVReg(int vreg);
+
+  // ---- Query interface ----
+  bool IsLiveAfter(int vreg, MachineBasicBlock *block, int inst_idx) const;
+
+  // ---- Interference checks (block level, backward compatible) ----
+  bool Interfere(int a, int b) const;
+  bool InterfereExcept(int a, int b, const MachineInstr *exclude) const;
+  bool InterfereExcept(int a, int b,
+                       const std::unordered_set<const MachineInstr *> &exclude) const;
+
+  // ---- Instruction-level interference checks ----
+  bool InterferePrecise(int a, int b) const;
+  bool InterferePreciseExcept(int a, int b, const MachineInstr *exclude) const;
+  bool InterfereExceptBlock(int a, int b, MachineBasicBlock *exclude_block) const;
+
+  // ---- Global segment-based interference checks (most precise) ----
+  // Segment lists over global instruction numbers; O(N_segments) overlap test
+  bool InterfereSegments(int a, int b) const;
+  // Excludes a specific instruction (given by its global slot index)
+  bool InterfereSegmentsExcept(int a, int b, SlotIndex exclude_slot) const;
+  // Excludes the global slot range of a specific block (for phi-source vreg interference checks)
+  bool InterfereSegmentsExceptBlock(int a, int b, MachineBasicBlock *exclude_block) const;
+
+  // ---- Global SlotIndex lookup ----
+  SlotIndex GetInstSlot(const MachineInstr *inst) const;
+  // Global slot of an instruction inside a block
+  SlotIndex GetSlot(MachineBasicBlock *block, int inst_idx) const;
+
+  // ---- Liveness information ----
+  const std::unordered_set<MachineBasicBlock *> &GetLiveBlocks(int vreg) const {
+    static const std::unordered_set<MachineBasicBlock *> empty;
+    auto it = live_blocks_.find(vreg);
+    if (it != live_blocks_.end()) return it->second;
+    return empty;
+  }
+
+  const std::unordered_set<int> &GetLiveOut(MachineBasicBlock *block) const {
+    static const std::unordered_set<int> empty;
+    auto it = block_to_idx_.find(block);
+    if (it == block_to_idx_.end()) return empty;
+    return block_live_[it->second].live_out;
+  }
+
+  int GetNumVRegs() const { return num_vregs_; }
+
+  struct Seg {
+    int start; // inclusive
+    int end;   // exclusive
+  };
+
+  const std::unordered_map<MachineBasicBlock *, Seg> *
+    GetIntervals(int vreg) const {
+    auto it = intervals_.find(vreg);
+    return (it != intervals_.end()) ? &it->second : nullptr;
+  }
+
+  const auto &GetAllIntervals() const { return intervals_; }
+  const auto &GetBlockToIdx() const { return block_to_idx_; }
+
+  // Global segment data (an empty list when the vreg has no entry)
+  const std::vector<LiveSegment> &GetSegments(int vreg) const {
+    static const std::vector<LiveSegment> empty;
+    auto it = segments_.find(vreg);
+    return (it != segments_.end()) ? it->second : empty;
+  }
+
+  // Global slot range of each block (block_idx is used unchecked as a vector index)
+  int GetBlockStartSlot(int block_idx) const { return block_start_slots_[block_idx]; }
+  int GetBlockEndSlot(int block_idx) const { return block_end_slots_[block_idx]; }
+  int GetTotalSlots() const { return total_slots_; }
+
+  struct BlockDefUse {
+    int first_def = -1;
+    int last_use = -1;
+    bool has_ref = false;
+  };
+
+  const std::unordered_map<MachineBasicBlock *, BlockDefUse> *
+    GetBlockDefUse(int vreg) const {
+    auto it = block_def_use_.find(vreg);
+    return (it != block_def_use_.end()) ? &it->second : nullptr;
+  }
+
+  int GetLastUseInBlock(int vreg, int block_idx) const;
+  int GetFirstDefInBlock(int vreg, int block_idx) const;
+
+  // ---- Global slot <-> instruction mapping ----
+  const MachineInstr *GetInstAtSlot(SlotIndex slot) const;
+
+private:
+  int num_vregs_ = 0;
+  int total_slots_ = 0; // total instruction count (global)
+
+  // Global start/end slot of each block
+  std::vector<int> block_start_slots_;
+  std::vector<int> block_end_slots_;
+
+  // instruction -> global slot
+  std::unordered_map<const MachineInstr *, SlotIndex> inst_to_slot_;
+
+  // slot -> instruction (for debugging)
+  std::vector<const MachineInstr *> slot_to_inst_;
+
+  // vreg -> global segment list (sorted, non-overlapping)
+  std::unordered_map<int, std::vector<LiveSegment>> segments_;
+
+  // live_in / live_out of each block (block-level dataflow analysis)
+  struct BlockLiveness {
+    std::unordered_set<int> live_in;
+    std::unordered_set<int> live_out;
+    std::unordered_set<int> def;
+    std::unordered_set<int> use;
+  };
+  std::vector<BlockLiveness> block_live_;
+
+  // vreg -> block -> segment (block level, backward compatible)
+  std::unordered_map<int, std::unordered_map<MachineBasicBlock *, Seg>> intervals_;
+
+  // vreg -> set of blocks in which it is live
+  std::unordered_map<int, std::unordered_set<MachineBasicBlock *>> live_blocks_;
+
+  // block -> index map
+  std::unordered_map<const MachineBasicBlock *, int> block_to_idx_;
+
+  // vreg -> block -> precise def/use positions
+  std::unordered_map<int, std::unordered_map<MachineBasicBlock *, BlockDefUse>> block_def_use_;
+
+  // Internal: builds the global segments
+  void BuildGlobalSegments(MachineFunction &mf);
+};
+
+} // namespace mir
diff --git a/src/include/mir/LiveRangeEdit.h b/src/include/mir/LiveRangeEdit.h
new file mode 100644
index 00000000..32ee0f30
--- /dev/null
+++ b/src/include/mir/LiveRangeEdit.h
@@ -0,0 +1,101 @@
+// LiveRangeEdit —— 活范围增量编辑器
+//
+// 参照 LLVM LiveRangeEdit，提供在寄存器分配过程中安全修改活范围的能力。
+// 核心功能：
+//   1. 创建新 vreg（与源 vreg 同类型）
+//   2. 分块替换：将源 vreg 在指定块中的使用替换为新 vreg
+//   3. 边界 COPY：在冷/热块边界自动插入 MovReg
+//   4. 干涉验证：替换后检查新 vreg 是否与现有分配冲突
+//
+// 使用场景：
+//   - 活范围分裂：分配失败时将 vreg 在循环边界处分为冷/热两部分
+//   - Spill 优化：将冷块中的使用替换为独立 spilled vreg
+//
+// 约束：
+//   - 修改 IR 后必须调用 Commit() 重建 LiveIntervals
+//   - 当前 Commit() 是全量重算，后续可升级为增量更新
+
+#pragma once
+
+#include "mir/LiveIntervals.h"
+#include "mir/MIR.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace mir {
+
+class LiveRangeEdit {
+public:
+  LiveRangeEdit(MachineFunction &mf, LiveIntervals &li)
+    : mf_(mf), li_(li) {}
+
+  // Creates a new vreg with the register class of src_vreg and records it.
+  int CreateVReg(int src_vreg) {
+    int new_v = mf_.CreateVReg(mf_.GetVRegClass(src_vreg));
+    created_.push_back(new_v);
+    return new_v;
+  }
+
+  // Rewrites every VReg operand equal to src inside `blocks` to dst and logs the edit.
+  void ReplaceUsesInBlocks(int src, int dst,
+                           const std::unordered_set<MachineBasicBlock*> &blocks) {
+    auto vc = mf_.GetVRegClass(src);
+    for (auto *bb : blocks)
+      for (auto &mi : bb->GetInstructions())
+        for (auto &op : mi.GetOperands())
+          if (op.GetKind() == Operand::Kind::VReg && op.GetVRegId() == src)
+            const_cast<Operand&>(op) = Operand::VReg(dst, vc);
+    replacements_.push_back({src, dst, blocks});
+  }
+
+  // Inserts `MovReg dst, src` as the first instruction of bb (boundary copy).
+  void InsertCopyAtEntry(MachineBasicBlock *bb, int dst, int src) {
+    auto &insts = const_cast<std::vector<MachineInstr>&>(bb->GetInstructions());
+    insts.insert(insts.begin(),
+      MachineInstr(Opcode::MovReg,
+        {Operand::VReg(dst, mf_.GetVRegClass(dst)),
+         Operand::VReg(src, mf_.GetVRegClass(src))}));
+    slots_stale_ = true;  // every instruction of bb moved; slot tables are stale
+  }
+
+  // Commits the edits to LiveIntervals. After an instruction insert the whole
+  // analysis is rebuilt; otherwise only the affected vregs are recomputed.
+  void Commit() {
+    if (slots_stale_) {
+      li_.Compute(mf_);
+      slots_stale_ = false;
+      return;
+    }
+    std::unordered_set<int> affected(created_.begin(), created_.end());
+    for (auto &r : replacements_) affected.insert({r.src, r.dst});
+    for (int v : affected)
+      li_.RecomputeVReg(v, mf_);
+  }
+
+  // Reverts the latest replacement, restoring src together with src's register class.
+  void UndoLastReplace() {
+    if (replacements_.empty()) return;
+    auto &[src, dst, blocks] = replacements_.back();
+    auto vc = mf_.GetVRegClass(src);
+    for (auto *bb : blocks)
+      for (auto &mi : bb->GetInstructions())
+        for (auto &op : mi.GetOperands())
+          if (op.GetKind() == Operand::Kind::VReg && op.GetVRegId() == dst)
+            const_cast<Operand&>(op) = Operand::VReg(src, vc);
+    replacements_.pop_back();
+  }
+
+  const std::vector<int> &GetCreatedVRegs() const { return created_; }
+
+private:
+  MachineFunction &mf_;
+  LiveIntervals &li_;
+  std::vector<int> created_;
+  bool slots_stale_ = false;  // true once an instruction has been inserted
+  struct Replacement { int src; int dst; std::unordered_set<MachineBasicBlock*> blocks; };
+  std::vector<Replacement> replacements_;
+};
+
+} // namespace mir
diff --git a/src/include/mir/MachineRegisterInfo.h b/src/include/mir/MachineRegisterInfo.h
new file mode 100644
index 00000000..fda883ef
--- /dev/null
+++ b/src/include/mir/MachineRegisterInfo.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "mir/MIR.h"
+
+#include <unordered_set>
+#include <vector>
+
+namespace mir {
+
+// Def/use information organised per vreg. Cheap to build; recompute after every modification.
+class MachineRegisterInfo {
+public:
+  void Compute(MachineFunction &mf);
+
+  // ---- vreg queries ----
+
+  // Sole defining instruction (SSA: at most one def per vreg); nullptr if none or out of range.
+  MachineInstr *GetDef(int vreg) const {
+    const bool in_range = 0 <= vreg && vreg < static_cast<int>(defs_.size());
+    return in_range ? defs_[vreg] : nullptr;
+  }
+
+  // All instructions using the vreg; an empty list when vreg is out of range.
+  const std::vector<MachineInstr *> &GetUses(int vreg) const {
+    static const std::vector<MachineInstr *> empty;
+    const bool in_range = 0 <= vreg && vreg < static_cast<int>(uses_.size());
+    return in_range ? uses_[vreg] : empty;
+  }
+
+  // Number of recorded uses (0 when vreg is out of range).
+  int GetUseCount(int vreg) const {
+    return static_cast<int>(GetUses(vreg).size());
+  }
+  int GetNumVRegs() const { return static_cast<int>(defs_.size()); }
+
+  bool HasOneDef(int vreg) const { return nullptr != GetDef(vreg); }
+  bool HasOneUse(int vreg) const { return 1 == GetUseCount(vreg); }
+
+  // ---- iteration ----
+
+  const std::vector<MachineInstr *> &GetAllDefs() const { return defs_; }
+  const std::vector<std::vector<MachineInstr *>> &GetAllUses() const { return uses_; }
+
+  // ---- modification (used during register coalescing) ----
+
+  // Replaces every VReg reference to old_vreg in the function with new_vreg
+  static void ReplaceAllVRegRefs(MachineFunction &mf, int old_vreg, int new_vreg);
+
+  // ---- instruction-level queries (across all functions) ----
+
+  struct InstDefUse {
+    std::vector<int> defs; // vreg ids
+    std::vector<int> uses; // vreg ids
+    bool is_call = false;
+  };
+
+  static InstDefUse GetInstDefUse(const MachineInstr &inst);
+
+private:
+  // Defining instruction of each vreg (SSA: at most one)
+  std::vector<MachineInstr *> defs_;
+  // Use list of each vreg
+  std::vector<std::vector<MachineInstr *>> uses_;
+};
+
+} // namespace mir
diff --git a/src/ir/Type.cpp b/src/ir/Type.cpp
index ea64020d..54c6aa8a 100644
--- a/src/ir/Type.cpp
+++ b/src/ir/Type.cpp
@@ -1,9 +1,18 @@
-// 当前支持 void、i32、float 及其指针类型。
+// 当前支持 void、i32、float 及其指针类型，以及向量类型。
 #include "ir/IR.h"
 
+#include <mutex>
+#include <unordered_map>
+
 namespace ir {
 
-Type::Type(Kind k) : kind_(k) {}
+Type::Type(Kind k, std::shared_ptr<Type> elem, int elems)
+    : kind_(k), vector_element_(std::move(elem)), vector_size_(elems) {}  // elem is moved into the member, not copied
+
+std::shared_ptr<Type> Type::GetVector(std::shared_ptr<Type> elem, int elems) {
+  // Simple implementation: allocates a fresh Type on every call (few vector types exist, so caching gains little)
+  return std::make_shared<Type>(Kind::Vector, std::move(elem), elems);
+}
 
 const std::shared_ptr<Type>& Type::GetVoidType() {
   static const std::shared_ptr<Type> type = std::make_shared<Type>(Kind::Void);
@@ -51,4 +60,10 @@ bool Type::IsPtrInt32() const { return kind_ == Kind::PtrInt32; }
 
 bool Type::IsPtrFloat32() const { return kind_ == Kind::PtrFloat32; }
 
+bool Type::IsVector() const { return kind_ == Kind::Vector; }  // true only for Kind::Vector
+
+std::shared_ptr<Type> Type::GetVectorElement() const { return vector_element_; }  // element type as stored
+
+int Type::GetVectorSize() const { return vector_size_; }  // element count as stored
+
 }  // namespace ir
diff --git a/src/ir/analysis/AliasAnalysis.cpp b/src/ir/analysis/AliasAnalysis.cpp
new file mode 100644
index 00000000..0580c8ab
--- /dev/null
+++ b/src/ir/analysis/AliasAnalysis.cpp
@@ -0,0 +1,89 @@
+#include "ir/analysis/AliasAnalysis.h"
+#include <queue>
+
+namespace ir {
+
+void AliasAnalysis::Compute(Function* func) {
+  func_params_.clear(); non_escaping_allocas_.clear();  // reset earlier results
+  if (func == nullptr) return;  // nothing to analyse; both sets stay empty
+  for (auto& p : func->GetParams()) func_params_.insert(p.get());
+  AnalyzeEscape(func);
+}
+
+void AliasAnalysis::AnalyzeEscape(Function* func) {  // fills non_escaping_allocas_
+  std::unordered_set<AllocaInst*> all_allocas;
+  for (auto& bb : func->GetBlocks())
+    for (auto& inst : bb->GetInstructions())
+      if (auto* a = dynamic_cast<AllocaInst*>(inst.get())) all_allocas.insert(a);
+  // Each alloca gets its own breadth-first walk over the values derived from it.
+  for (auto* alloca : all_allocas) {
+    std::unordered_set<Value*> visited;
+    std::queue<Value*> worklist;
+    worklist.push(alloca);
+    bool escapes = false;
+    while (!worklist.empty()) {
+      Value* cur = worklist.front(); worklist.pop();
+      if (!visited.insert(cur).second) continue;
+
+      for (auto& use : cur->GetUses()) {
+        auto* user = use.GetUser();
+        if (!user) continue;
+
+        // Direct escape paths: passed to a Call, returned, or stored into another pointer
+        if (dynamic_cast<CallInst*>(user) || dynamic_cast<ReturnInst*>(user))
+          { escapes = true; break; }
+        if (auto* s = dynamic_cast<StoreInst*>(user)) {
+          if (s->GetNumOperands() >= 2 && s->GetOperand(1) != cur)
+            { escapes = true; break; }
+          continue;  // store ..., cur -> an ordinary local write
+        }
+
+        // PHI and GEP: keep tracking their users (not a direct escape)
+        if (dynamic_cast<PhiInst*>(user) || dynamic_cast<GetElementPtrInst*>(user))
+          { worklist.push(user); continue; }
+
+        // Pointer used in arithmetic -> escapes
+        if (dynamic_cast<BinaryInst*>(user) && cur->GetType() &&
+            (cur->GetType()->IsPtrInt32() || cur->GetType()->IsPtrFloat32()))
+          { escapes = true; break; }
+        // NOTE(review): any other kind of user falls through here and is treated as non-escaping.
+      }
+      if (escapes) break;
+    }
+
+    if (!escapes) non_escaping_allocas_.insert(alloca);
+  }
+}
+
+// Classifies how pointers a and b relate: null inputs give MayAlias, identical
+// values give MustAlias, and MayAlias is the conservative fallback.
+AliasResult AliasAnalysis::Alias(Value* a, Value* b) const {
+  if (!a || !b) return AliasResult::MayAlias;
+  if (a == b) return AliasResult::MustAlias;
+
+  // Strip GEP wrappers to reach the underlying base object.
+  auto get_root = [](Value* p) {
+    while (auto* g = dynamic_cast<GetElementPtrInst*>(p)) p = g->GetOperand(0);
+    return p;
+  };
+  Value *ra = get_root(a), *rb = get_root(b);
+  // Same base but different pointer values (e.g. two GEPs with different
+  // indices): the accesses may overlap or be disjoint, so never MustAlias.
+  if (ra == rb) return AliasResult::MayAlias;
+  // Type isolation: an i32 pointer and a float pointer never alias.
+  if (a->GetType() && b->GetType()) {
+    bool ai=a->GetType()->IsPtrInt32(), af=a->GetType()->IsPtrFloat32();
+    bool bi=b->GetType()->IsPtrInt32(), bf=b->GetType()->IsPtrFloat32();
+    if ((ai&&bf)||(af&&bi)) return AliasResult::NoAlias;
+  }
+  // Two distinct globals (ra != rb is guaranteed at this point).
+  if (dynamic_cast<GlobalVariable*>(ra) && dynamic_cast<GlobalVariable*>(rb))
+    return AliasResult::NoAlias;
+  // Two distinct allocas that were both proven non-escaping.
+  auto *aa = dynamic_cast<AllocaInst*>(ra), *ab = dynamic_cast<AllocaInst*>(rb);
+  if (aa && ab && non_escaping_allocas_.count(aa) && non_escaping_allocas_.count(ab))
+    return AliasResult::NoAlias;
+  return AliasResult::MayAlias;
+}
+
+}  // namespace ir
diff --git a/src/ir/analysis/MemorySSA.cpp b/src/ir/analysis/MemorySSA.cpp
new file mode 100644
index 00000000..ba625b6a
--- /dev/null
+++ b/src/ir/analysis/MemorySSA.cpp
@@ -0,0 +1,541 @@
+#include "ir/analysis/MemorySSA.h"
+
+#include <algorithm>
+#include <functional>
+#include <queue>
+#include <unordered_set>
+
+namespace ir {
+
+// ============================================================================
+// ComputeDomTree -- fills idom_ and the dominator-tree DFS in/out numbers
+// ============================================================================
+void MemorySSA::ComputeDomTree(Function& func) {
+  auto& blocks = func.GetBlocks();
+  size_t n = blocks.size();
+  if (n == 0) return;
+
+  // Map every block to its index in the function's block list.
+  std::unordered_map<BasicBlock*, size_t> block_to_idx;
+  for (size_t i = 0; i < n; ++i)
+    block_to_idx[blocks[i].get()] = i;
+
+  // Build the predecessor lists from the last instruction of each block.
+  std::vector<std::vector<size_t>> preds(n);
+  for (size_t i = 0; i < n; ++i) {
+    auto& insts = blocks[i]->GetInstructions();
+    if (insts.empty()) continue;
+    auto* term = insts.back().get();
+    auto add_pred = [&](BasicBlock* target) {
+      auto it = block_to_idx.find(target);
+      if (it != block_to_idx.end()) preds[it->second].push_back(i);
+    };
+    if (auto* br = dynamic_cast<BranchInst*>(term))
+      add_pred(br->GetTarget());
+    else if (auto* cbr = dynamic_cast<CondBranchInst*>(term)) {
+      add_pred(cbr->GetTrueTarget());
+      add_pred(cbr->GetFalseTarget());
+    } else if (!dynamic_cast<ReturnInst*>(term) && i + 1 < n)
+      preds[i + 1].push_back(i);  // no branch/return: edge to the next block
+  }
+
+  // Initialise the dominator sets: {0} for block 0, all blocks for the rest.
+  std::vector<std::unordered_set<int>> dom(n);
+  std::unordered_set<int> all;
+  for (size_t i = 0; i < n; ++i) all.insert(static_cast<int>(i));
+  dom[0] = {0};
+  for (size_t i = 1; i < n; ++i) dom[i] = all;
+
+  // Iterate to a fixed point: dom(i) = {i} + intersection of dom(p), p in preds.
+  bool changed = true;
+  while (changed) {
+    changed = false;
+    for (size_t i = 1; i < n; ++i) {
+      if (preds[i].empty()) continue;  // no predecessors: set stays as initialised
+      std::unordered_set<int> new_dom = all;
+      for (int p : preds[i]) {
+        std::unordered_set<int> intersect;
+        for (int x : new_dom)
+          if (dom[p].count(x)) intersect.insert(x);
+        new_dom = std::move(intersect);
+      }
+      new_dom.insert(static_cast<int>(i));
+      if (new_dom != dom[i]) { dom[i] = std::move(new_dom); changed = true; }
+    }
+  }
+
+  // Pick idom: the strict dominator with the largest dominator set.
+  idom_.clear();
+  BasicBlock* entry = blocks[0].get();
+  idom_[entry] = entry;
+  for (size_t i = 1; i < n; ++i) {
+    int best = -1;
+    for (int d : dom[i]) {
+      if (d == static_cast<int>(i)) continue;
+      if (best < 0 || dom[d].size() > dom[best].size()) best = d;
+    }
+    if (best >= 0)
+      idom_[blocks[i].get()] = blocks[best].get();
+  }
+
+  // Build the dominator-tree child lists, then DFS to assign in/out times.
+  std::unordered_map<BasicBlock*, std::vector<BasicBlock*>> children;
+  for (auto& [bb, id] : idom_)
+    if (bb != id) children[id].push_back(bb);
+
+  dom_dfn_in_.clear(); dom_dfn_out_.clear();
+  int timer = 0;
+  std::function<void(BasicBlock*)> dom_dfs = [&](BasicBlock* bb) {
+    dom_dfn_in_[bb] = timer++;
+    for (auto* ch : children[bb]) dom_dfs(ch);
+    dom_dfn_out_[bb] = timer++;
+  };
+  dom_dfs(entry);
+}
+
+// ============================================================================
+// BuildAliasClasses -- build the alias adjacency lists and the alias classes
+//
+// Outputs (all cleared first):
+//   1. alias_adjacency_: Value* -> the pointers `aa` does not report as
+//      NoAlias with it; used to cascade a Store to its may-aliases.
+//   2. ptr_to_class_ / class_to_ptrs_: connected components of that relation
+//      (union-find), with class ids numbered from 0.
+//
+// With a null `aa`, or for a function without loads/stores, all of them
+// stay empty.
+// ============================================================================
+void MemorySSA::BuildAliasClasses(Function& func, AliasAnalysis* aa) {
+  ptr_to_class_.clear();
+  class_to_ptrs_.clear();
+  alias_adjacency_.clear();
+
+  // Step 1: collect the pointer operand of every load and store, deduplicated
+  // in first-seen order. Sorting raw pointers (as before) made class ids and
+  // adjacency order depend on heap addresses, i.e. vary from run to run.
+  std::vector<Value*> all_ptrs;
+  std::unordered_set<Value*> seen;
+  auto note_ptr = [&](Value* p) {
+    if (seen.insert(p).second) all_ptrs.push_back(p);
+  };
+  for (auto& bb : func.GetBlocks()) {
+    for (auto& inst_ptr : bb->GetInstructions()) {
+      auto* inst = inst_ptr.get();
+      if (auto* load = dynamic_cast<LoadInst*>(inst)) {
+        if (load->GetNumOperands() >= 1) note_ptr(load->GetOperand(0));
+      } else if (auto* store = dynamic_cast<StoreInst*>(inst)) {
+        if (store->GetNumOperands() >= 2) note_ptr(store->GetOperand(1));
+      }
+    }
+  }
+
+  // Nothing to relate without pointers or without an alias analysis.
+  if (all_ptrs.empty() || !aa) return;
+
+  int n = static_cast<int>(all_ptrs.size());
+
+  // Union-find over pointer indices (path compression only).
+  std::vector<int> parent(n);
+  for (int i = 0; i < n; ++i) parent[i] = i;
+  std::function<int(int)> find = [&](int x) -> int {
+    if (parent[x] != x) parent[x] = find(parent[x]);
+    return parent[x];
+  };
+
+  // Step 2: query every unordered pair exactly once and feed both the
+  // adjacency lists and the union-find (each pair used to be queried twice).
+  for (int i = 0; i < n; ++i) {
+    for (int j = i + 1; j < n; ++j) {
+      if (aa->Alias(all_ptrs[i], all_ptrs[j]) == AliasResult::NoAlias) continue;
+      alias_adjacency_[all_ptrs[i]].push_back(all_ptrs[j]);
+      alias_adjacency_[all_ptrs[j]].push_back(all_ptrs[i]);
+      int ri = find(i), rj = find(j);
+      if (ri != rj) parent[ri] = rj;
+    }
+  }
+
+  // Step 3: number the components in order of their first member, so ids are
+  // stable for a given instruction order.
+  std::unordered_map<int, int> root_to_class;
+  int next_class = 0;
+  for (int i = 0; i < n; ++i) {
+    int root = find(i);
+    auto it = root_to_class.find(root);
+    if (it == root_to_class.end())
+      it = root_to_class.emplace(root, next_class++).first;
+    ptr_to_class_[all_ptrs[i]] = it->second;
+    class_to_ptrs_[it->second].push_back(all_ptrs[i]);
+  }
+}
+
+// ============================================================================
+// BuildMemorySSA -- per-pointer state propagation with alias-aware stores
+//
+// Each block carries a map from pointer to the MemoryAccess reaching it. A
+// Store redefines its pointer plus (via alias_adjacency_) its may-aliases; a
+// Call redefines every pointer the function loads from or stores to.
+// ============================================================================
+void MemorySSA::BuildMemorySSA(Function& func) {
+  auto& blocks = func.GetBlocks();
+  size_t n = blocks.size();
+  if (n == 0) return;
+
+  // LiveOnEntry: the memory state on function entry.
+  live_on_entry_ = std::make_unique<MemoryAccess>(MemoryAccess::LiveOnEntry, blocks[0].get());
+
+  // Store cascading needs the adjacency lists built by BuildAliasClasses.
+  bool has_alias_info = !alias_adjacency_.empty();
+
+  // Per-block state (pointer -> reaching MemoryAccess) at entry and at exit.
+  using PtrState = std::unordered_map<Value*, MemoryAccess*>;
+  std::unordered_map<BasicBlock*, PtrState> block_live_on_entry_state;
+  std::unordered_map<BasicBlock*, PtrState> block_live_on_exit_state;
+
+  // Every pointer some load or store accesses. A call has to clobber all of
+  // them, not just those already in the running state; otherwise a load after
+  // the call still resolves to LiveOnEntry, exactly like a load before it.
+  std::unordered_set<Value*> tracked_ptrs;
+  for (auto& bb : blocks)
+    for (auto& inst_ptr : bb->GetInstructions()) {
+      if (auto* ld = dynamic_cast<LoadInst*>(inst_ptr.get()))
+        tracked_ptrs.insert(ld->GetOperand(0));
+      else if (auto* st = dynamic_cast<StoreInst*>(inst_ptr.get()))
+        tracked_ptrs.insert(st->GetOperand(1));
+    }
+  // MemoryDef of each call, so pass two finds the right one per call site.
+  std::unordered_map<CallInst*, MemoryDef*> call_to_def;
+
+  // Reverse post-order of the blocks reachable from the entry block.
+  std::vector<BasicBlock*> rpo;
+  {
+    std::unordered_set<BasicBlock*> visited;
+    std::function<void(BasicBlock*)> dfs_rpo = [&](BasicBlock* bb) {
+      visited.insert(bb);
+      auto& insts = bb->GetInstructions();
+      if (!insts.empty()) {
+        auto* term = insts.back().get();
+        if (auto* br = dynamic_cast<BranchInst*>(term))
+          { if (!visited.count(br->GetTarget())) dfs_rpo(br->GetTarget()); }
+        else if (auto* cbr = dynamic_cast<CondBranchInst*>(term)) {
+          if (!visited.count(cbr->GetTrueTarget())) dfs_rpo(cbr->GetTrueTarget());
+          if (!visited.count(cbr->GetFalseTarget())) dfs_rpo(cbr->GetFalseTarget());
+        }
+      }
+      rpo.push_back(bb);
+    };
+    dfs_rpo(blocks[0].get());
+    std::reverse(rpo.begin(), rpo.end());
+  }
+
+  // CFG predecessor lists, derived from the block terminators.
+  std::unordered_map<BasicBlock*, std::vector<BasicBlock*>> cfg_preds;
+  for (auto& bb : blocks) {
+    auto& insts = bb->GetInstructions();
+    if (insts.empty()) continue;
+    auto* term = insts.back().get();
+    if (auto* br = dynamic_cast<BranchInst*>(term))
+      cfg_preds[br->GetTarget()].push_back(bb.get());
+    else if (auto* cbr = dynamic_cast<CondBranchInst*>(term)) {
+      cfg_preds[cbr->GetTrueTarget()].push_back(bb.get());
+      cfg_preds[cbr->GetFalseTarget()].push_back(bb.get());
+    }
+  }
+
+  // Helper: a Store defines its own pointer and every may-aliasing pointer.
+  auto propagate_store_state = [&](PtrState& current_state, Value* ptr, MemoryDef* md) {
+    current_state[ptr] = md;
+    if (!has_alias_info) return;
+    auto it = alias_adjacency_.find(ptr);
+    if (it == alias_adjacency_.end()) return;
+    for (auto* alias_ptr : it->second) current_state[alias_ptr] = md;
+  };
+
+  // Helper: a Call may write any memory, so it defines every tracked pointer.
+  auto clobber_all = [&](PtrState& current_state, MemoryDef* md) {
+    for (auto& entry : current_state) entry.second = md;
+    for (auto* ptr : tracked_ptrs) current_state[ptr] = md;
+  };
+
+  // ---- Pass 1 (RPO): create the accesses and an initial state per block ----
+  for (auto* bb : rpo) {
+    auto& preds = cfg_preds[bb];
+    PtrState live_in;
+
+    if (&preds == &cfg_preds[blocks[0].get()] || preds.empty()) {
+      // Entry block or no predecessors: start from an empty state.
+    } else if (preds.size() == 1) {
+      // Single predecessor: inherit its exit state.
+      live_in = block_live_on_exit_state[preds[0]];
+    } else {
+      // Several predecessors: one MemoryPhi per pointer seen in any of them.
+      std::unordered_set<Value*> all_ptrs;
+      for (auto* pred : preds) {
+        for (auto& [ptr, acc] : block_live_on_exit_state[pred])
+          all_ptrs.insert(ptr);
+      }
+      for (auto* ptr : all_ptrs) {
+        auto phi = std::make_unique<MemoryPhi>(bb);
+        for (auto* pred : preds) {
+          auto& pred_state = block_live_on_exit_state[pred];
+          auto it = pred_state.find(ptr);
+          MemoryAccess* incoming = (it != pred_state.end()) ? it->second : live_on_entry_.get();
+          phi->addIncoming(incoming, pred);
+        }
+        live_in[ptr] = phi.get();
+        accesses_.push_back(std::move(phi));
+      }
+    }
+    block_live_on_entry_state[bb] = live_in;
+    PtrState current_state = live_in;
+
+    // Walk the instructions, creating one access per load, store and call.
+    for (auto& inst_ptr : bb->GetInstructions()) {
+      auto* inst = inst_ptr.get();
+
+      if (auto* load = dynamic_cast<LoadInst*>(inst)) {
+        Value* ptr = load->GetOperand(0);
+        MemoryAccess* def_acc = current_state.count(ptr) ? current_state[ptr] : live_on_entry_.get();
+        auto mu = std::make_unique<MemoryUse>(load, bb);
+        mu->setDefiningAccess(def_acc);
+        load_to_use_[load] = mu.get();
+        accesses_.push_back(std::move(mu));
+      } else if (auto* store = dynamic_cast<StoreInst*>(inst)) {
+        Value* ptr = store->GetOperand(1);
+        MemoryAccess* def_acc = current_state.count(ptr) ? current_state[ptr] : live_on_entry_.get();
+        auto md = std::make_unique<MemoryDef>(store, bb);
+        md->setDefiningAccess(def_acc);
+        // Store: update the exact pointer and all of its may-aliases.
+        propagate_store_state(current_state, ptr, md.get());
+        store_to_def_[store] = md.get();
+        accesses_.push_back(std::move(md));
+      } else if (auto* call = dynamic_cast<CallInst*>(inst)) {
+        // Call: may modify arbitrary memory, modelled as a store-less MemoryDef.
+        auto md = std::make_unique<MemoryDef>(nullptr, bb);
+        for (auto& [ptr, acc] : current_state)
+          md->setDefiningAccess(acc);  // unchanged: last entry visited wins
+        auto* call_md = md.get();
+        accesses_.push_back(std::move(md));
+        call_to_def[call] = call_md;
+        clobber_all(current_state, call_md);
+      }
+    }
+
+    block_live_on_exit_state[bb] = std::move(current_state);
+  }
+
+  // ---- Pass 2 (RPO, two rounds): refresh defining accesses over back edges ----
+  for (int iter = 0; iter < 2; ++iter) {
+    for (auto* bb : rpo) {
+      auto& preds = cfg_preds[bb];
+      PtrState live_in;
+
+      if (&preds == &cfg_preds[blocks[0].get()] || preds.empty()) {
+        // Entry block or no predecessors: empty state.
+      } else if (preds.size() == 1) {
+        live_in = block_live_on_exit_state[preds[0]];
+      } else {
+        // Several predecessors: merge their exit states with fresh MemoryPhis.
+        for (auto* pred : preds) {
+          for (auto& [ptr, acc] : block_live_on_exit_state[pred]) {
+            if (!live_in.count(ptr)) {
+              auto phi = std::make_unique<MemoryPhi>(bb);
+              for (auto* p2 : preds) {
+                auto& ps = block_live_on_exit_state[p2];
+                auto it = ps.find(ptr);
+                phi->addIncoming(it != ps.end() ? it->second : live_on_entry_.get(), p2);
+              }
+              live_in[ptr] = phi.get();
+              accesses_.push_back(std::move(phi));
+            }
+          }
+        }
+      }
+
+      // Record the recomputed entry state.
+      block_live_on_entry_state[bb] = live_in;
+
+      // Re-point every MemoryUse/MemoryDef of this block.
+      PtrState current_state = live_in;
+      for (auto& inst_ptr : bb->GetInstructions()) {
+        auto* inst = inst_ptr.get();
+
+        if (auto* load = dynamic_cast<LoadInst*>(inst)) {
+          Value* ptr = load->GetOperand(0);
+          MemoryAccess* def_acc = current_state.count(ptr) ? current_state[ptr] : live_on_entry_.get();
+          auto* mu = getMemoryUse(load);
+          if (mu) mu->setDefiningAccess(def_acc);
+        } else if (auto* store = dynamic_cast<StoreInst*>(inst)) {
+          Value* ptr = store->GetOperand(1);
+          MemoryAccess* def_acc = current_state.count(ptr) ? current_state[ptr] : live_on_entry_.get();
+          auto* md = getMemoryDef(store);
+          if (md) {
+            md->setDefiningAccess(def_acc);
+            // Cascade to the may-aliases as in pass 1.
+            propagate_store_state(current_state, ptr, md);
+          }
+        } else if (auto* call = dynamic_cast<CallInst*>(inst)) {
+          // Call: re-apply the clobber with the MemoryDef made for this very call.
+          auto it = call_to_def.find(call);
+          if (it != call_to_def.end()) clobber_all(current_state, it->second);
+        }
+      }
+      block_live_on_exit_state[bb] = std::move(current_state);
+    }  // for each bb in rpo
+  }  // iteration loop
+}
+
+// ============================================================================
+// RenamePass —— 支配树前序遍历
+// （已由 per-pointer 第二遍 RPO 替代）
+// ============================================================================
+void MemorySSA::RenamePass(BasicBlock* bb, MemoryAccess* incoming_val) {
+  // Reaching definition on entry: the block's own MemoryPhi when it has one,
+  // otherwise whatever the dominator-tree parent passed down.
+  MemoryAccess* reaching = incoming_val;
+  if (auto* block_phi = getMemoryPhi(bb)) reaching = block_phi;
+  for (auto& owned : bb->GetInstructions()) {
+    auto* raw = owned.get();
+    if (auto* ld = dynamic_cast<LoadInst*>(raw)) {
+      // A load reads the current definition and leaves it unchanged.
+      if (auto* use = getMemoryUse(ld)) use->setDefiningAccess(reaching);
+      continue;
+    }
+    auto* st = dynamic_cast<StoreInst*>(raw);
+    if (!st) continue;
+    auto* def = getMemoryDef(st);
+    if (!def) continue;
+    // A store consumes the current definition and becomes the new one.
+    def->setDefiningAccess(reaching);
+    reaching = def;
+  }
+
+  // Recurse into every block whose immediate dominator is bb.
+  for (auto& [child, parent] : idom_)
+    if (parent == bb && child != bb) RenamePass(child, reaching);
+}
+
+// ============================================================================
+// Compute —— 主入口
+// ============================================================================
+
+// Without alias analysis: pure per-pointer mode.
+// Same as the AA overload with a null analysis: BuildAliasClasses then leaves
+// the alias maps empty and BuildMemorySSA performs no store cascading.
+void MemorySSA::Compute(Function& func) {
+  Compute(func, static_cast<AliasAnalysis*>(nullptr));
+}
+
+// With alias analysis: per-pointer states plus alias-aware store cascading.
+// Recomputes everything from scratch for `func`:
+//   1. drop all accesses, lookup tables, dominator data and alias classes
+//      left over from a previous run;
+//   2. ComputeDomTree    -- idom_ and the dominator-tree DFS numbers;
+//   3. BuildAliasClasses -- alias adjacency and classes (empty if aa is null);
+//   4. BuildMemorySSA    -- MemoryUse / MemoryDef / MemoryPhi construction.
+void MemorySSA::Compute(Function& func, AliasAnalysis* aa) {
+  // The lookup tables hold raw pointers into accesses_: reset them together.
+  load_to_use_.clear();
+  store_to_def_.clear();
+  block_to_phi_.clear();
+  accesses_.clear();
+  live_on_entry_.reset();
+
+  // Dominator-tree data.
+  idom_.clear();
+  dom_dfn_in_.clear();
+  dom_dfn_out_.clear();
+
+  // Alias-class data.
+  ptr_to_class_.clear();
+  class_to_ptrs_.clear();
+  alias_adjacency_.clear();
+
+  ComputeDomTree(func);
+  BuildAliasClasses(func, aa);
+  BuildMemorySSA(func);
+}
+
+// ============================================================================
+// 查询 API
+// ============================================================================
+MemoryUse* MemorySSA::getMemoryUse(LoadInst* load) const {
+  const auto entry = load_to_use_.find(load);  // nullptr when not recorded
+  return entry == load_to_use_.end() ? nullptr : entry->second;
+}
+
+MemoryDef* MemorySSA::getMemoryDef(StoreInst* store) const {
+  const auto entry = store_to_def_.find(store);  // nullptr when not recorded
+  return entry == store_to_def_.end() ? nullptr : entry->second;
+}
+
+MemoryPhi* MemorySSA::getMemoryPhi(BasicBlock* bb) const {
+  const auto entry = block_to_phi_.find(bb);  // nullptr when not recorded
+  return entry == block_to_phi_.end() ? nullptr : entry->second;
+}
+
+int MemorySSA::getAliasClass(Value* ptr) const {
+  const auto entry = ptr_to_class_.find(ptr);  // -1 when ptr has no class
+  return entry == ptr_to_class_.end() ? -1 : entry->second;
+}
+
+// Dominance query: does `def` dominate `use` in the CFG?
+bool MemorySSA::dominates(const MemoryAccess* def, const MemoryAccess* use) const {
+  if (!def || !use) return false;  // a missing access dominates nothing
+  if (def == use) return true;
+  if (def == getLiveOnEntry()) return true;  // precedes every access
+  auto* def_bb = def->getBlock();
+  auto* use_bb = use->getBlock();
+  if (!def_bb || !use_bb) return false;
+  // Same block: def must be met no later than use in instruction order.
+  if (def_bb == use_bb) {
+    bool found_def = false;
+    for (auto& ip : def_bb->GetInstructions()) {
+      if (ip.get() == def->getMemoryInst()) found_def = true;
+      if (ip.get() == use->getMemoryInst()) return found_def;
+    }
+    return false;
+  }
+  // Other blocks: use_bb's DFS-in time must lie in def_bb's [in, out] range.
+  auto dit_in = dom_dfn_in_.find(def_bb);
+  auto dit_out = dom_dfn_out_.find(def_bb);
+  auto uit_in = dom_dfn_in_.find(use_bb);
+  if (dit_in == dom_dfn_in_.end() || dit_out == dom_dfn_out_.end() ||
+      uit_in == dom_dfn_in_.end()) return false;
+  return dit_in->second <= uit_in->second && uit_in->second <= dit_out->second;
+}
+
+// ============================================================================
+// getClobberingMemoryAccess -- modelled on LLVM's MemorySSAWalker
+// Walks the definingAccess chain upwards from `use` and returns the first
+// access that may write the loaded pointer; nullptr if use or aa is null.
+// ============================================================================
+MemoryAccess* MemorySSA::getClobberingMemoryAccess(MemoryUse* use, AliasAnalysis* aa) const {
+  if (!use || !aa) return nullptr;
+  auto* load = use->getMemoryInst();
+  if (!load || load->GetNumOperands() < 1) return nullptr;
+  Value* ptr = load->GetOperand(0);
+  constexpr int kMaxWalkSteps = 50;  // upper bound on the chain walk
+  MemoryAccess* current = use->getDefiningAccess();
+  for (int steps = 0; steps < kMaxWalkSteps && current; ++steps) {
+    if (current->getKind() == MemoryAccess::LiveOnEntry) return current;
+    if (current->getKind() == MemoryAccess::Def) {
+      auto* md = static_cast<MemoryDef*>(current);
+      auto* mem_inst = md->getMemoryInst();
+      if (!mem_inst) return current;  // Call clobbers everything
+      if (mem_inst->GetNumOperands() >= 2) {
+        Value* store_ptr = mem_inst->GetOperand(1);
+        if (aa->Alias(ptr, store_ptr) != AliasResult::NoAlias) return current;
+      }
+      current = md->getDefiningAccess();
+    } else if (current->getKind() == MemoryAccess::Phi) {
+      return current;  // Conservative: MemoryPhi is a clobber
+    } else {
+      current = current->getDefiningAccess();
+    }
+  }
+  // Budget exhausted mid-chain: the load is not proven unclobbered, so return
+  // the access we stopped at. Only a chain that really ends is LiveOnEntry.
+  return current ? current : getLiveOnEntry();
+}
+
+} // namespace ir
diff --git a/src/ir/analysis/PostDominatorTree.cpp b/src/ir/analysis/PostDominatorTree.cpp
new file mode 100644
index 00000000..caa6305c
--- /dev/null
+++ b/src/ir/analysis/PostDominatorTree.cpp
@@ -0,0 +1,120 @@
+// 后支配树实现
+// 通过构建反向 CFG（加虚拟 exit 节点），在反向 CFG 上计算支配树。
+// 反向 CFG 的支配者 == 原始 CFG 的后支配者。
+//
+// 参考：LLVM PostDominatorTree.cpp
+
+#include "ir/analysis/PostDominatorTree.h"
+
+#include "ir/IR.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace ir {
+
+void PostDominatorTree::Compute(Function* func) {
+  // Drop the result of any previous run.
+  virtual_exit_ = nullptr;
+  virtual_exit_holder_.reset();
+
+  // Nothing to compute for a missing or empty function.
+  if (!func || func->GetBlocks().empty()) return;
+
+  // Collect every block of the function.
+  std::vector<BasicBlock*> all_blocks;
+  std::unordered_set<BasicBlock*> block_set;
+  for (auto& bb : func->GetBlocks()) {
+    all_blocks.push_back(bb.get());
+    block_set.insert(bb.get());
+  }
+
+  // Create the virtual exit node. It is never inserted into the function; it
+  // only serves as the entry (root) of the reverse CFG.
+  virtual_exit_holder_ = std::make_unique<BasicBlock>("__virtual_exit__");
+  virtual_exit_ = virtual_exit_holder_.get();
+
+  // Real exits: blocks ending in a return, or lacking a terminator. They are
+  // kept in function order so the virtual exit's successor order is the same
+  // on every run (iterating an unordered_set gave no such guarantee).
+  // NOTE(review): when no block qualifies (e.g. an endless loop) the virtual
+  // exit has no successors and nothing is reachable in the reverse CFG.
+  std::vector<BasicBlock*> real_exits;
+  for (auto* bb : all_blocks) {
+    if (!bb->HasTerminator() ||
+        dynamic_cast<ReturnInst*>(bb->GetInstructions().back().get())) {
+      real_exits.push_back(bb);
+    }
+  }
+
+  // Predecessor map of the original CFG (every block gets an entry).
+  std::unordered_map<BasicBlock*, std::vector<BasicBlock*>> orig_preds;
+  for (auto* bb : all_blocks) {
+    orig_preds[bb] = {};
+  }
+  for (auto* bb : all_blocks) {
+    if (!bb->HasTerminator()) continue;
+
+    auto* term = bb->GetInstructions().back().get();
+    std::vector<BasicBlock*> succs;
+    if (auto* br = dynamic_cast<BranchInst*>(term)) {
+      succs.push_back(br->GetTarget());
+    } else if (auto* cbr = dynamic_cast<CondBranchInst*>(term)) {
+      succs.push_back(cbr->GetTrueTarget());
+      succs.push_back(cbr->GetFalseTarget());
+    }
+
+    for (auto* s : succs) {
+      // Ignore targets that are not blocks of this function.
+      if (block_set.count(s)) orig_preds[s].push_back(bb);
+    }
+  }
+
+  // From here on the node list also contains the virtual exit.
+  all_blocks.push_back(virtual_exit_);
+
+  // Successor function of the reverse CFG:
+  //   virtual exit -> every real exit
+  //   real block   -> its predecessors in the original CFG
+  // The original edge "real exit -> virtual exit" reverses into the first
+  // rule. A real exit must not also list the virtual exit as a successor (as
+  // it used to): that edge points back at the root, i.e. it is un-reversed.
+  auto reverse_succ = [&](BasicBlock* bb) -> std::vector<BasicBlock*> {
+    if (bb == virtual_exit_) return real_exits;
+    auto it = orig_preds.find(bb);
+    if (it == orig_preds.end()) return {};
+    return it->second;
+  };
+
+  // Dominators of the reverse CFG are the post-dominators of the function.
+  dom_tree_.Compute(virtual_exit_, all_blocks, reverse_succ);
+}
+
+bool PostDominatorTree::PostDominates(BasicBlock* a, BasicBlock* b) const {
+  return dom_tree_.Dominates(a, b);  // dominance in the reverse CFG
+}
+
+bool PostDominatorTree::ProperlyPostDominates(BasicBlock* a, BasicBlock* b) const {
+  if (a == b) return false;  // "properly" excludes the block itself
+  return dom_tree_.Dominates(a, b);
+}
+
+BasicBlock* PostDominatorTree::FindNearestCommonPostDominator(
+    BasicBlock* a, BasicBlock* b) const {
+  return dom_tree_.FindNearestCommonDominator(a, b);  // on the reverse-CFG tree
+}
+
+BasicBlock* PostDominatorTree::GetIPostDom(BasicBlock* bb) const {
+  // The virtual exit is an implementation detail of the reverse CFG, so a
+  // block whose parent in the tree is that node reports no post-dominator.
+  auto* parent = dom_tree_.GetIdom(bb);
+  return parent == virtual_exit_ ? nullptr : parent;
+}
+
+const std::unordered_set<BasicBlock*>&
+PostDominatorTree::GetPostDominanceFrontier(BasicBlock* bb) const {
+  return dom_tree_.GetDominanceFrontier(bb);  // frontier on the reverse-CFG tree
+}
+
+}  // namespace ir
diff --git a/src/ir/analysis/ScalarEvolution.cpp b/src/ir/analysis/ScalarEvolution.cpp
new file mode 100644
index 00000000..1040cca3
--- /dev/null
+++ b/src/ir/analysis/ScalarEvolution.cpp
@@ -0,0 +1,561 @@
+// 标量演化实现
+// 参考：LLVM ScalarEvolution.cpp
+
+#include "ir/analysis/ScalarEvolution.h"
+#include "ir/analysis/DominatorTree.h"
+
+#include <algorithm>
+#include <unordered_set>
+
+namespace ir {
+
+// ================================================================
+// 辅助函数
+// ================================================================
+
+static std::vector<BasicBlock*> GetSuccessors(BasicBlock* bb) {
+  // Successors are read off the terminator: the single target of an
+  // unconditional branch, or the true target followed by the false target
+  // of a conditional one. Anything else (or no terminator) yields none.
+  if (!bb || !bb->HasTerminator()) return {};
+  auto* last = bb->GetInstructions().back().get();
+  if (auto* jump = dynamic_cast<BranchInst*>(last))
+    return {jump->GetTarget()};
+  if (auto* fork = dynamic_cast<CondBranchInst*>(last))
+    return {fork->GetTrueTarget(), fork->GetFalseTarget()};
+  return {};
+}
+
+// ================================================================
+// SCEV 创建（去重）
+// ================================================================
+
+SCEVHandle ScalarEvolution::CreateConstant(int64_t c) {
+  const auto cached = constants_.find(c);
+  if (cached != constants_.end()) return cached->second.get();  // uniqued by value
+  auto fresh = std::make_unique<SCEVConstant>(c);
+  SCEVHandle handle = fresh.get();
+  constants_[c] = std::move(fresh);
+  return handle;
+}
+
+SCEVHandle ScalarEvolution::CreateUnknown(Value* val) {
+  const auto cached = unknowns_.find(val);
+  if (cached != unknowns_.end()) return cached->second.get();  // one node per Value
+  auto fresh = std::make_unique<SCEVUnknown>(val);
+  SCEVHandle handle = fresh.get();
+  unknowns_[val] = std::move(fresh);
+  return handle;
+}
+
+SCEVHandle ScalarEvolution::CreateAddExpr(std::vector<SCEVHandle> ops) {
+  auto simplified = SimplifyAddExpr(std::move(ops));
+  if (simplified->GetSCEVType() != SCEVType::Add) return simplified;
+  // SimplifyAddExpr returns a freshly `new`-ed, unowned Add node. Adopt it so
+  // it is either stored in the cache or freed; previously it always leaked.
+  std::unique_ptr<SCEVAddExpr> fresh(
+      const_cast<SCEVAddExpr*>(static_cast<const SCEVAddExpr*>(simplified)));
+  const auto& flat_ops = fresh->GetOperands();
+  if (flat_ops.size() == 1) return flat_ops[0];
+
+  AddExprKey key{std::vector<SCEVHandle>(flat_ops.begin(), flat_ops.end())};
+  auto it = add_exprs_.find(key);
+  if (it != add_exprs_.end()) return it->second.get();  // ours is a duplicate
+  auto* raw = fresh.get();
+  add_exprs_[std::move(key)] = std::move(fresh);
+  return raw;
+}
+
+SCEVHandle ScalarEvolution::CreateMulExpr(std::vector<SCEVHandle> ops) {
+  // Multiplications are simplified but not uniqued yet, so whatever the
+  // simplifier produced -- a Mul node or a cheaper form -- is returned as is.
+  SCEVHandle product = SimplifyMulExpr(std::move(ops));
+  return product;
+}
+
+SCEVHandle ScalarEvolution::CreateAddRecExpr(SCEVHandle base, SCEVHandle step,
+                                             BasicBlock* loop_header) {
+  // Not uniqued yet. NOTE(review): no owner is visible here -- confirm who frees it.
+  return new SCEVAddRecExpr(base, step, loop_header);
+}
+
+int64_t ScalarEvolution::GetConstantValue(SCEVHandle s) {
+  const auto* constant = dynamic_cast<const SCEVConstant*>(s);
+  return constant ? constant->GetValue() : 0;  // non-constants map to 0
+}
+
+// ================================================================
+// SCEV 简化
+// ================================================================
+
+SCEVHandle ScalarEvolution::SimplifyAddExpr(std::vector<SCEVHandle> ops) {
+  // Flatten nested additions with an explicit work stack, folding constant
+  // terms into one running sum and skipping null operands. The visiting
+  // order is irrelevant: the sum commutes and the terms get sorted below.
+  std::vector<SCEVHandle> terms;
+  int64_t folded = 0;
+  std::vector<SCEVHandle> pending = std::move(ops);
+  while (!pending.empty()) {
+    SCEVHandle cur = pending.back();
+    pending.pop_back();
+    if (!cur) continue;
+    if (auto* k = dynamic_cast<const SCEVConstant*>(cur)) {
+      folded += k->GetValue();
+    } else if (auto* nested = dynamic_cast<const SCEVAddExpr*>(cur)) {
+      for (auto* inner : nested->GetOperands()) pending.push_back(inner);
+    } else {
+      terms.push_back(cur);
+    }
+  }
+
+  // A non-zero constant part becomes one more term.
+  if (folded != 0) terms.push_back(CreateConstant(folded));
+  if (terms.empty()) return CreateConstant(0);
+  if (terms.size() == 1) return terms[0];
+
+  // Order operands by address so equal operand sets yield equal vectors.
+  std::sort(terms.begin(), terms.end());
+  return new SCEVAddExpr(std::move(terms));
+}
+
+SCEVHandle ScalarEvolution::SimplifyMulExpr(std::vector<SCEVHandle> ops) {
+  // Fold all constant factors into one coefficient; keep the rest in order.
+  std::vector<SCEVHandle> factors;
+  int64_t coeff = 1;
+  for (SCEVHandle factor : ops) {
+    const auto* k = dynamic_cast<const SCEVConstant*>(factor);
+    if (!k) {
+      factors.push_back(factor);
+      continue;
+    }
+    coeff *= k->GetValue();
+    if (coeff == 0) return CreateConstant(0);  // a zero factor wins outright
+  }
+  // The coefficient goes last, and only when it is not the identity.
+  if (coeff != 1) factors.push_back(CreateConstant(coeff));
+  if (factors.empty()) return CreateConstant(1);
+  if (factors.size() == 1) return factors[0];
+  return new SCEVMulExpr(factors);
+}
+
+// ================================================================
+// 循环检测
+// ================================================================
+
+void ScalarEvolution::DetectLoops(Function* func) {
+  detected_loops_.clear();
+  latch_to_header_.clear();
+  header_to_latch_.clear();
+  block_to_loop_headers_.clear();
+
+  DominatorTree dom_tree;
+  dom_tree.Compute(func);
+
+  // Find back edges: an edge bb -> succ whose target dominates its source.
+  for (auto& bb : func->GetBlocks()) {
+    for (auto* succ : GetSuccessors(bb.get())) {
+      if (dom_tree.Dominates(succ, bb.get())) {
+        // succ is the loop header and bb one of its latches.
+        latch_to_header_[bb.get()] = succ;
+        header_to_latch_[succ] = bb.get();
+      }
+    }
+  }
+
+  // Build one SCEVLoopInfo per detected loop header.
+  std::unordered_set<BasicBlock*> processed_headers;
+  for (auto& [latch, header] : latch_to_header_) {
+    if (processed_headers.count(header)) continue;
+    processed_headers.insert(header);
+
+    SCEVLoopInfo loop;
+    loop.header = header;
+    loop.latch = header_to_latch_[header];
+
+    // Body = header plus every block that reaches a latch without passing
+    // through the header. Seed the backward walk with all latches of this
+    // header. The header is inserted first, so a self-loop (header == latch)
+    // seeds nothing; walking back from the header would escape the loop.
+    std::unordered_set<BasicBlock*> body;
+    std::vector<BasicBlock*> worklist;
+    body.insert(header);
+    for (auto& [other_latch, other_header] : latch_to_header_) {
+      if (other_header == header && body.insert(other_latch).second)
+        worklist.push_back(other_latch);
+    }
+
+    while (!worklist.empty()) {
+      auto* cur = worklist.back();
+      worklist.pop_back();
+
+      // Predecessors of cur, found by scanning every block's successors.
+      for (auto& bb : func->GetBlocks()) {
+        for (auto* s : GetSuccessors(bb.get())) {
+          if (s == cur && !body.count(bb.get())) {
+            body.insert(bb.get());
+            worklist.push_back(bb.get());
+          }
+        }
+      }
+    }
+
+    for (auto* b : body) {
+      loop.blocks.push_back(b);
+      block_to_loop_headers_[b].push_back(header);
+
+      // Exiting blocks: those with at least one successor outside the body.
+      for (auto* s : GetSuccessors(b)) {
+        if (!body.count(s)) {
+          loop.exiting_blocks.push_back(b);
+          break;
+        }
+      }
+    }
+
+    // Preheader: the first predecessor of the header found outside the body.
+    for (auto& bb : func->GetBlocks()) {
+      for (auto* s : GetSuccessors(bb.get())) {
+        if (s == header && !body.count(bb.get()) &&
+            loop.preheader == nullptr) {
+          loop.preheader = bb.get();
+        }
+      }
+    }
+
+    if (loop.Valid()) detected_loops_.push_back(std::move(loop));
+  }
+}
+
+// ================================================================
+// 循环不变量判断
+// ================================================================
+
+bool ScalarEvolution::IsLoopInvariant(SCEVHandle s,
+                                      BasicBlock* loop_header) const {
+  // Absent expressions and constants never change inside a loop.
+  if (!s || dynamic_cast<const SCEVConstant*>(s)) return true;
+
+  // An n-ary node is invariant only when every operand is.
+  auto all_invariant = [&](const auto& operands) {
+    for (auto* operand : operands)
+      if (!IsLoopInvariant(operand, loop_header)) return false;
+    return true;
+  };
+
+  if (auto* leaf = dynamic_cast<const SCEVUnknown*>(s)) {
+    auto* v = leaf->GetValue();
+    // Function arguments and global variables count as invariant.
+    if (dynamic_cast<Argument*>(v) || dynamic_cast<GlobalVariable*>(v))
+      return true;
+
+    // Any other non-instruction value is treated as variant.
+    auto* def = dynamic_cast<Instruction*>(v);
+    if (!def) return false;
+
+    // An instruction is invariant when its block is not registered under
+    // loop_header; blocks that belong to no loop at all qualify trivially.
+    auto entry = block_to_loop_headers_.find(def->GetParent());
+    if (entry == block_to_loop_headers_.end()) return true;
+    const auto& headers = entry->second;
+    return std::find(headers.begin(), headers.end(), loop_header) ==
+           headers.end();
+  }
+
+  if (auto* sum = dynamic_cast<const SCEVAddExpr*>(s))
+    return all_invariant(sum->GetOperands());
+  if (auto* product = dynamic_cast<const SCEVMulExpr*>(s))
+    return all_invariant(product->GetOperands());
+
+  // Recurrences (and any other node kind) are treated as loop-variant.
+  return false;
+}
+
+// ================================================================
+// 循环迭代次数计算
+// ================================================================
+
+bool ScalarEvolution::GetLoopTripCount(BasicBlock* loop_header,
+                                       int64_t* result) const {
+  // Computes a constant trip count (number of times the header's exit test
+  // keeps control in the loop) for the loop headed by `loop_header`. Only a
+  // header ending in `condbr (cmp indvar, bound)` is recognised, where indvar
+  // is an add-recurrence PHI of this loop with constant start/step and bound
+  // is constant. Returns true and sets `*result` only for a positive count.
+  if (!loop_header || !result || !loop_header->HasTerminator()) return false;
+
+  // The exit test is the same for every PHI, so decode the terminator once.
+  auto* cbr = dynamic_cast<CondBranchInst*>(
+      loop_header->GetInstructions().back().get());
+  auto* cond = cbr ? dynamic_cast<BinaryInst*>(cbr->GetCond()) : nullptr;
+  if (!cond) return false;
+  const Opcode cmp_op = cond->GetOpcode();
+  if (cmp_op != Opcode::Eq && cmp_op != Opcode::Ne &&
+      cmp_op != Opcode::Lt && cmp_op != Opcode::Le &&
+      cmp_op != Opcode::Gt && cmp_op != Opcode::Ge)
+    return false;
+
+  // Exit direction: exactly one branch target must remain inside the loop.
+  // Membership is read from block_to_loop_headers_; the latch's successors
+  // do not tell which of the header's targets leaves the loop.
+  auto in_loop = [&](BasicBlock* bb) {
+    if (bb == loop_header) return true;
+    auto it = block_to_loop_headers_.find(bb);
+    return it != block_to_loop_headers_.end() &&
+           std::find(it->second.begin(), it->second.end(), loop_header) !=
+               it->second.end();
+  };
+  const bool true_stays = in_loop(cbr->GetTrueTarget());
+  if (true_stays == in_loop(cbr->GetFalseTarget())) return false;
+
+  // Predicate obtained by exchanging the two comparison operands.
+  auto swapped = [](Opcode op) {
+    switch (op) {
+      case Opcode::Lt: return Opcode::Gt;
+      case Opcode::Le: return Opcode::Ge;
+      case Opcode::Gt: return Opcode::Lt;
+      case Opcode::Ge: return Opcode::Le;
+      default: return op;  // Eq and Ne are symmetric
+    }
+  };
+  // Logical negation of a comparison predicate.
+  auto negated = [](Opcode op) {
+    switch (op) {
+      case Opcode::Eq: return Opcode::Ne;
+      case Opcode::Ne: return Opcode::Eq;
+      case Opcode::Lt: return Opcode::Ge;
+      case Opcode::Le: return Opcode::Gt;
+      case Opcode::Gt: return Opcode::Le;
+      case Opcode::Ge: return Opcode::Lt;
+      default: return op;
+    }
+  };
+  auto find_scev = [&](Value* v) -> SCEVHandle {
+    auto it = scev_map_.find(v);
+    return (it != scev_map_.end()) ? it->second : nullptr;
+  };
+
+  for (auto& inst : loop_header->GetInstructions()) {
+    auto* phi = dynamic_cast<PhiInst*>(inst.get());
+    if (!phi) continue;
+    auto it = scev_map_.find(phi);
+    if (it == scev_map_.end()) continue;
+    auto* add_rec = dynamic_cast<const SCEVAddRecExpr*>(it->second);
+    if (!add_rec || add_rec->GetLoop() != loop_header) continue;
+
+    // Normalise the test to "the loop continues while indvar <pred> bound".
+    Value* bound_val = nullptr;
+    Opcode pred = cmp_op;
+    if (find_scev(cond->GetOperand(0)) == add_rec) {
+      bound_val = cond->GetOperand(1);
+    } else if (find_scev(cond->GetOperand(1)) == add_rec) {
+      bound_val = cond->GetOperand(0);
+      pred = swapped(pred);  // "bound < indvar" means "indvar > bound"
+    }
+    if (!bound_val) continue;
+    if (!true_stays) pred = negated(pred);  // a true condition exits
+
+    // The bound is either a literal operand (ComputeSCEVs registers only
+    // instructions and parameters in scev_map_) or a value with constant SCEV.
+    int64_t bound = 0;
+    if (auto* literal = dynamic_cast<ConstantInt*>(bound_val)) {
+      bound = literal->GetValue();
+    } else if (auto* folded = dynamic_cast<const SCEVConstant*>(
+                   find_scev(bound_val))) {
+      bound = folded->GetValue();
+    } else {
+      continue;
+    }
+    auto* start_c = dynamic_cast<const SCEVConstant*>(add_rec->GetStart());
+    auto* step_c =
+        dynamic_cast<const SCEVConstant*>(add_rec->GetStepRecurrence());
+    if (!start_c || !step_c) continue;
+    const int64_t start = start_c->GetValue();
+    const int64_t step = step_c->GetValue();
+    if (step == 0) return false;
+
+    // `dist`: distance from start to bound along the direction of travel.
+    const int64_t stride = step > 0 ? step : -step;
+    int64_t dist = step > 0 ? bound - start : start - bound;
+    const Opcode strict = step > 0 ? Opcode::Lt : Opcode::Gt;
+    const Opcode inclusive = step > 0 ? Opcode::Le : Opcode::Ge;
+    if (pred == inclusive) ++dist;  // the bound itself is still iterated
+    int64_t trip_count = 0;
+    if (pred == strict || pred == inclusive ||
+        (pred == Opcode::Ne && dist % stride == 0)) {
+      // ceil(dist / stride); `!=` only terminates when it hits the bound.
+      trip_count = dist > 0 ? (dist + stride - 1) / stride : 0;
+    }
+    if (trip_count > 0) {
+      *result = trip_count;
+      return true;
+    }
+  }
+  return false;
+}
+
+// ================================================================
+// SCEV computation for a single instruction
+// ================================================================
+
+SCEVHandle ScalarEvolution::ComputeSCEVForInst(Instruction* inst) {
+  // Builds the expression for one instruction: AddRec for a loop-header PHI
+  // of the form phi(start, phi + C), Add/Mul nodes for arithmetic, Unknown
+  // for everything that is not modelled.
+  if (!inst) return CreateUnknown(inst);
+
+  if (auto* ci = dynamic_cast<ConstantInt*>(inst))
+    return CreateConstant(ci->GetValue());
+
+  // Operand lookup. GetSCEV() is const and only finds literals that were
+  // interned before, so a literal seen for the first time would come back as
+  // nullptr and defeat every pattern below; intern literals here instead.
+  auto operand_scev = [this](Value* v) -> SCEVHandle {
+    if (auto* literal = dynamic_cast<ConstantInt*>(v))
+      return CreateConstant(literal->GetValue());
+    return GetSCEV(v);
+  };
+
+  Opcode op = inst->GetOpcode();
+
+  // PHI in a loop header: candidate induction variable.
+  if (auto* phi = dynamic_cast<PhiInst*>(inst)) {
+    auto* parent = phi->GetParent();
+    auto h2l_it = header_to_latch_.find(parent);
+
+    if (h2l_it != header_to_latch_.end()) {
+      auto* latch = h2l_it->second;
+      // Incoming value from outside the loop (start) and from the latch
+      // (the value carried around the back edge).
+      Value* start_val = nullptr;
+      Value* step_val = nullptr;
+      for (size_t i = 0; i + 1 < phi->GetNumOperands(); i += 2) {
+        auto* incoming_bb =
+            dynamic_cast<BasicBlock*>(phi->GetOperand(i + 1));
+        if (!incoming_bb) continue;
+        auto* incoming_val = phi->GetOperand(i);
+        if (!incoming_val) continue;
+        if (incoming_bb == latch) {
+          step_val = incoming_val;
+        } else {
+          // Accept the edge as the loop entry only if its source block is
+          // not a member of this loop.
+          auto lp_it = block_to_loop_headers_.find(incoming_bb);
+          if (lp_it == block_to_loop_headers_.end() ||
+              std::find(lp_it->second.begin(), lp_it->second.end(),
+                        parent) == lp_it->second.end()) {
+            start_val = incoming_val;
+          }
+        }
+      }
+
+      // Recognise latch value == phi + stride with a constant stride.
+      auto* step_inst =
+          (start_val && step_val) ? dynamic_cast<Instruction*>(step_val)
+                                  : nullptr;
+      if (step_inst && step_inst->GetOpcode() == Opcode::Add) {
+        auto* lhs = step_inst->GetOperand(0);
+        auto* rhs = step_inst->GetOperand(1);
+        Value* stride_val = nullptr;
+        if (lhs == phi) {
+          stride_val = rhs;
+        } else if (rhs == phi) {
+          stride_val = lhs;
+        }
+        if (stride_val) {
+          auto* stride_c =
+              dynamic_cast<const SCEVConstant*>(operand_scev(stride_val));
+          if (stride_c) {
+            return CreateAddRecExpr(operand_scev(start_val), stride_c,
+                                    parent);
+          }
+        }
+      }
+    }
+
+    // Any other PHI is opaque to the analysis.
+    return CreateUnknown(inst);
+  }
+
+  // Binary arithmetic.
+  if (auto* bin = dynamic_cast<BinaryInst*>(inst)) {
+    auto* lhs = operand_scev(bin->GetOperand(0));
+    auto* rhs = operand_scev(bin->GetOperand(1));
+    if (!lhs || !rhs) return CreateUnknown(inst);
+
+    switch (op) {
+      case Opcode::Add:
+        return CreateAddExpr({lhs, rhs});
+      case Opcode::Sub:
+        // a - b is modelled as a + (-1 * b).
+        return CreateAddExpr(
+            {lhs, CreateMulExpr({CreateConstant(-1), rhs})});
+      case Opcode::Mul:
+        return CreateMulExpr({lhs, rhs});
+      default:
+        return CreateUnknown(inst);
+    }
+  }
+
+  // ZExt (i1 -> i32) forwards the operand's expression when it is known.
+  if (op == Opcode::ZExt && inst->GetNumOperands() > 0) {
+    auto* operand = operand_scev(inst->GetOperand(0));
+    if (operand && !dynamic_cast<const SCEVUnknown*>(operand))
+      return operand;
+  }
+
+  return CreateUnknown(inst);
+}
+
+// ================================================================
+// Compute — main entry point
+// ================================================================
+
+void ScalarEvolution::Compute(Function* func) {
+  // Discard the results of any previous run first; this also happens for a
+  // null function, which then leaves these containers empty.
+  unknowns_.clear();
+  constants_.clear();
+  add_exprs_.clear();
+  scev_map_.clear();
+  detected_loops_.clear();
+  if (func == nullptr) return;
+  DetectLoops(func);
+  ComputeSCEVs(func);
+}
+
+void ScalarEvolution::ComputeSCEVs(Function* func) {
+  // Parameters are registered first: GetSCEV() resolves operands through
+  // scev_map_, so an instruction that uses a parameter can only see it if
+  // the parameter's Unknown already exists when the instruction is visited.
+  for (auto& param : func->GetParams())
+    scev_map_[param.get()] = CreateUnknown(param.get());
+  for (auto& bb : func->GetBlocks())
+    for (auto& inst : bb->GetInstructions()) {
+      auto* scev = ComputeSCEVForInst(inst.get());
+      scev_map_[inst.get()] = scev;
+    }
+}
+
+SCEVHandle ScalarEvolution::GetSCEV(Value* val) const {
+  // Read-only lookup of the expression recorded for `val`. Returns nullptr
+  // when nothing has been recorded for the value; this function never
+  // creates a new expression.
+  if (val == nullptr) return nullptr;
+
+  // Integer literals are keyed by numeric value in constants_.
+  if (auto* literal = dynamic_cast<ConstantInt*>(val)) {
+    auto hit = constants_.find(literal->GetValue());
+    if (hit != constants_.end()) return hit->second.get();
+  }
+
+  // Everything registered by ComputeSCEVs is keyed by identity in scev_map_.
+  auto mapped = scev_map_.find(val);
+  if (mapped != scev_map_.end()) return mapped->second;
+
+  // Fallback for arguments that only have an entry in unknowns_.
+  if (dynamic_cast<Argument*>(val) == nullptr) return nullptr;
+  auto known = unknowns_.find(val);
+  return known != unknowns_.end() ? known->second.get() : nullptr;
+}
+
+}  // namespace ir
diff --git a/src/ir/passes/CMakeLists.txt b/src/ir/passes/CMakeLists.txt
index 4521fe78..d770f751 100644
--- a/src/ir/passes/CMakeLists.txt
+++ b/src/ir/passes/CMakeLists.txt
@@ -8,6 +8,8 @@ add_library(ir_passes STATIC
   CSE.cpp
   DCE.cpp
   CFGSimplify.cpp
+  LoopInterchange.cpp
+  LoopVectorize.cpp
   TailCallOpt.cpp
 )
 
diff --git a/src/ir/passes/DSE.cpp b/src/ir/passes/DSE.cpp
new file mode 100644
index 00000000..4678e0d4
--- /dev/null
+++ b/src/ir/passes/DSE.cpp
@@ -0,0 +1,145 @@
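+// Dead Store Elimination (DSE): MemorySSA-driven removal of dead stores.
+//
+// Modelled on LLVM DSE (lib/Transforms/Scalar/DeadStoreElimination.cpp).
+//
+// Algorithm:
+//   1. For every MemoryUse, mark the MemoryDefs on its definingAccess chain as live.
+//   2. For every MemoryPhi, mark all of its incoming MemoryDefs as live
+//      (a value merged by a phi may be read at run time, so its source stores must stay).
+//   3. Mark stores to globals / escaping allocas as live (externally observable).
+//   4. Delete every unmarked MemoryDef (a dead store to a non-escaping alloca).
+//   5. Tier 2: consecutive stores to the same pointer in one block with no load/call in between: the earlier one is overwritten.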
+
+#include "ir/IR.h"
+#include "ir/analysis/AliasAnalysis.h"
+#include "ir/analysis/MemorySSA.h"
+
+#include <functional>
+#include <unordered_set>
+#include <vector>
+
+namespace ir {
+namespace {
+
+static bool RunDSEOnFunction(Function& func) {
+  // Deletes stores to non-escaping allocas whose MemoryDef no load can
+  // reach. Returns true when at least one store was removed.
+  AliasAnalysis aa;
+  aa.Compute(&func);
+  MemorySSA mssa;
+  mssa.Compute(func, &aa);
+
+  // Step 1: collect live MemoryDefs.
+  std::unordered_set<MemoryDef*> live_defs;
+
+  // 1a/1b: every def reachable backwards from a MemoryUse or a MemoryPhi is
+  // live. The walk is a full transitive closure over defining accesses and
+  // phi incomings: stopping at a phi, or after a fixed number of steps,
+  // leaves older stores on the chain unmarked although a later load may
+  // still read them, and those stores would then be deleted.
+  std::vector<MemoryAccess*> worklist;
+  for (auto& bb : func.GetBlocks()) {
+    for (auto& inst_ptr : bb->GetInstructions()) {
+      auto* load = dynamic_cast<LoadInst*>(inst_ptr.get());
+      if (!load) continue;
+      if (auto* mu = mssa.getMemoryUse(load))
+        worklist.push_back(mu->getDefiningAccess());
+    }
+  }
+  mssa.forEachMemoryPhi([&](MemoryPhi* phi) { worklist.push_back(phi); });
+
+  std::unordered_set<MemoryAccess*> visited;
+  while (!worklist.empty()) {
+    MemoryAccess* acc = worklist.back();
+    worklist.pop_back();
+    if (!acc || !visited.insert(acc).second) continue;
+    if (acc->getKind() == MemoryAccess::Phi) {
+      auto* phi = static_cast<MemoryPhi*>(acc);
+      for (size_t i = 0; i < phi->getNumIncoming(); ++i)
+        worklist.push_back(phi->getIncomingValue(i));
+      continue;
+    }
+    if (acc->getKind() == MemoryAccess::Def)
+      live_defs.insert(static_cast<MemoryDef*>(acc));
+    worklist.push_back(acc->getDefiningAccess());
+  }
+
+  // 1c + Step 2 (Tier 1): stores to globals or escaping allocas are
+  // observable and stay live; a store to a non-escaping alloca whose def
+  // was not reached above is dead.
+  std::unordered_set<StoreInst*> dead_stores;
+  for (auto& bb : func.GetBlocks()) {
+    for (auto& inst_ptr : bb->GetInstructions()) {
+      auto* store = dynamic_cast<StoreInst*>(inst_ptr.get());
+      if (!store) continue;
+      auto* md = mssa.getMemoryDef(store);
+      if (!md) continue;
+      auto* alloca = dynamic_cast<AllocaInst*>(store->GetOperand(1));
+      if (!alloca || !aa.IsNonEscaping(alloca)) {
+        live_defs.insert(md);
+      } else if (!live_defs.count(md)) {
+        dead_stores.insert(store);
+      }
+    }
+  }
+
+  // Step 3 (Tier 2): a store overwritten by a later store to the same
+  // pointer in the same block, with no load of that pointer and no call in
+  // between. Only defs that are not live qualify, so a store the walk above
+  // found reachable is never removed here.
+  for (auto& bb : func.GetBlocks()) {
+    std::unordered_map<Value*, StoreInst*> uncovered;
+    for (auto& inst_ptr : bb->GetInstructions()) {
+      auto* inst = inst_ptr.get();
+      if (auto* store = dynamic_cast<StoreInst*>(inst)) {
+        Value* ptr = store->GetOperand(1);
+        auto it = uncovered.find(ptr);
+        if (it != uncovered.end()) {
+          auto* prev_md = mssa.getMemoryDef(it->second);
+          if (prev_md && !live_defs.count(prev_md))
+            dead_stores.insert(it->second);
+        }
+        uncovered[ptr] = store;
+      } else if (auto* load = dynamic_cast<LoadInst*>(inst)) {
+        if (load->GetNumOperands() >= 1)
+          uncovered.erase(load->GetOperand(0));
+      } else if (dynamic_cast<CallInst*>(inst)) {
+        uncovered.clear();
+      }
+    }
+  }
+
+  // Step 4: unlink the dead stores from their operands' use lists and erase
+  // them from their blocks.
+  if (dead_stores.empty()) return false;
+
+  for (auto& bb : func.GetBlocks()) {
+    auto& insts = const_cast<std::vector<std::unique_ptr<Instruction>>&>(bb->GetInstructions());
+    for (auto& inst_ptr : insts) {
+      auto* store = dynamic_cast<StoreInst*>(inst_ptr.get());
+      if (!store || !dead_stores.count(store)) continue;
+      for (size_t i = 0; i < store->GetNumOperands(); ++i)
+        if (auto* op = dynamic_cast<Instruction*>(store->GetOperand(i)))
+          op->RemoveUse(store, i);
+    }
+    insts.erase(std::remove_if(insts.begin(), insts.end(),
+      [&dead_stores](const std::unique_ptr<Instruction>& p) {
+        auto* s = dynamic_cast<StoreInst*>(p.get());
+        return s && dead_stores.count(s) > 0;
+      }), insts.end());
+  }
+  return true;
+}
+
+} // namespace
+
+bool RunDSE(Module& module) {
+  // Runs dead-store elimination on every function that has a body and
+  // reports whether any of them was modified.
+  bool any_changed = false;
+  for (auto& fn : module.GetFunctions())
+    if (!fn->IsExternal() && RunDSEOnFunction(*fn)) any_changed = true;
+  return any_changed;
+}
+
+} // namespace ir
diff --git a/src/ir/passes/IRVerifier.cpp b/src/ir/passes/IRVerifier.cpp
new file mode 100644
index 00000000..2b3dd742
--- /dev/null
+++ b/src/ir/passes/IRVerifier.cpp
@@ -0,0 +1,208 @@
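+// IR verifier: checks that an IR module is well-formed.
+// Checks:
+// 1. SSA dominance: the block defining each instruction operand must dominate the block that uses it
+// 2. Block terminators: every non-empty basic block must end with a terminator
+// 3. PHI consistency: the operands of PHI nodes are well-structured
+//
+// On failure an error message is printed and abort() is called. Callers are expected to enable it via NDEBUG.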
+
+#include "ir/IR.h"
+#include "ir/analysis/DominatorTree.h"
+#include "utils/Log.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace ir {
+
+namespace {
+
+// Maps each basic block of `func` to the blocks whose terminator jumps to it.
+std::unordered_map<BasicBlock*, std::vector<BasicBlock*>> CollectPredecessors(
+    Function* func) {
+  std::unordered_map<BasicBlock*, std::vector<BasicBlock*>> result;
+  // Seed an empty list per block so that blocks without predecessors are
+  // present in the result as well.
+  for (const auto& bb : func->GetBlocks()) result[bb.get()] = {};
+
+  for (const auto& bb : func->GetBlocks()) {
+    if (!bb->HasTerminator()) continue;
+    auto* term = bb->GetInstructions().back().get();
+    if (auto* br = dynamic_cast<BranchInst*>(term)) {
+      result[br->GetTarget()].push_back(bb.get());
+    } else if (auto* condbr = dynamic_cast<CondBranchInst*>(term)) {
+      // Both edges are recorded, even when they lead to the same block.
+      result[condbr->GetTrueTarget()].push_back(bb.get());
+      result[condbr->GetFalseTarget()].push_back(bb.get());
+    }
+  }
+  return result;
+}
+
+// Verifies the IR of a single function; logs and aborts on the first violation.
+void VerifyFunction(Function* func) {
+  const auto& blocks = func->GetBlocks();
+  if (blocks.empty()) {
+    return;  // no blocks (external declaration): nothing to verify
+  }
+
+  // Build the predecessor map (used by the PHI checks).
+  auto pred_map = CollectPredecessors(func);
+
+  // Build the dominator tree.
+  DominatorTree dom_tree;
+  dom_tree.Compute(func);
+
+  for (const auto& bb : blocks) {
+    BasicBlock* current_bb = bb.get();
+
+    // Check 1: a non-empty basic block must end with a terminator.
+    const auto& instructions = current_bb->GetInstructions();
+    if (!instructions.empty()) {
+      auto* last_inst = instructions.back().get();
+      if (!last_inst->IsTerminator()) {
+        std::ostringstream oss;
+        oss << "IR 验证失败: 函数 '" << func->GetName() << "' 的基本块 '"
+            << current_bb->GetName()
+            << "' 的最后一条指令不是终结指令 (opcode="
+            << static_cast<int>(last_inst->GetOpcode()) << ")";
+        LogError(oss.str(), std::cerr);
+        std::abort();
+      }
+    }
+
+    // Check 2: SSA dominance + PHI consistency.
+    for (const auto& inst_ptr : instructions) {
+      auto* inst = inst_ptr.get();
+
+      if (inst->GetOpcode() == Opcode::Phi) {
+        // PHI consistency:
+        // - the operand count must be even
+        // - every odd-indexed operand (block reference) must be a predecessor
+        size_t num_operands = inst->GetNumOperands();
+        if (num_operands % 2 != 0) {
+          std::ostringstream oss;
+          oss << "IR 验证失败: 函数 '" << func->GetName()
+              << "' 的基本块 '" << current_bb->GetName()
+              << "' 中的 PHI 节点操作数个数为奇数 (" << num_operands << ")";
+          LogError(oss.str(), std::cerr);
+          std::abort();
+        }
+
+        // Track the predecessors seen so far to detect duplicates.
+        std::unordered_set<BasicBlock*> seen_preds;
+        const auto& bb_preds = pred_map[current_bb];
+
+        for (size_t i = 1; i < num_operands; i += 2) {
+          Value* block_op = inst->GetOperand(i);
+          auto* pred_bb = dynamic_cast<BasicBlock*>(block_op);
+          if (!pred_bb) {
+            std::ostringstream oss;
+            oss << "IR 验证失败: 函数 '" << func->GetName()
+                << "' 的基本块 '" << current_bb->GetName()
+                << "' 中的 PHI 节点操作数 " << i << " 不是基本块";
+            LogError(oss.str(), std::cerr);
+            std::abort();
+          }
+
+          // The referenced block must be an actual predecessor.
+          bool is_pred = false;
+          for (auto* p : bb_preds) {
+            if (p == pred_bb) {
+              is_pred = true;
+              break;
+            }
+          }
+          if (!is_pred) {
+            std::ostringstream oss;
+            oss << "IR 验证失败: 函数 '" << func->GetName()
+                << "' 的基本块 '" << current_bb->GetName()
+                << "' 中的 PHI 节点引用了非前驱基本块 '"
+                << pred_bb->GetName() << "'";
+            LogError(oss.str(), std::cerr);
+            std::abort();
+          }
+
+          // Reject duplicate predecessors.
+          if (seen_preds.find(pred_bb) != seen_preds.end()) {
+            std::ostringstream oss;
+            oss << "IR 验证失败: 函数 '" << func->GetName()
+                << "' 的基本块 '" << current_bb->GetName()
+                << "' 中的 PHI 节点包含重复前驱 '"
+                << pred_bb->GetName() << "'";
+            LogError(oss.str(), std::cerr);
+            std::abort();
+          }
+          seen_preds.insert(pred_bb);
+
+          // SSA dominance for PHI operands: the block defining each value
+          // operand (even index) must dominate the matching predecessor block.
+          Value* val_op = inst->GetOperand(i - 1);
+          auto* val_inst = dynamic_cast<Instruction*>(val_op);
+          if (val_inst && val_inst->GetParent()) {
+            if (!dom_tree.Dominates(val_inst->GetParent(), pred_bb)) {
+              std::ostringstream oss;
+              oss << "IR 验证失败: 函数 '" << func->GetName()
+                  << "' 的基本块 '" << current_bb->GetName()
+                  << "' 中的 PHI 值操作数 '" << val_inst->GetName()
+                  << "' (定义于 '" << val_inst->GetParent()->GetName()
+                  << "') 不支配前驱 '" << pred_bb->GetName() << "'";
+              LogError(oss.str(), std::cerr);
+              std::abort();
+            }
+          }
+        }
+      } else {
+        // Non-PHI instruction: check SSA dominance of every operand.
+        for (size_t i = 0; i < inst->GetNumOperands(); ++i) {
+          Value* op = inst->GetOperand(i);
+          // Skip constants, arguments, functions and basic-block references.
+          if (op->IsConstant()) continue;
+          auto* op_bb_ref = dynamic_cast<BasicBlock*>(op);
+          if (op_bb_ref) continue;
+          auto* op_arg = dynamic_cast<Argument*>(op);
+          if (op_arg) continue;
+          if (op->IsFunction()) continue;
+
+          auto* op_inst = dynamic_cast<Instruction*>(op);
+          if (!op_inst) continue;
+
+          BasicBlock* def_bb = op_inst->GetParent();
+          if (!def_bb) continue;
+
+          // Dominance check at block granularity: the defining block must dominate the current block.
+          if (!dom_tree.Dominates(def_bb, current_bb)) {
+            std::ostringstream oss;
+            oss << "IR 验证失败: 函数 '" << func->GetName()
+                << "' 中的指令 '" << inst->GetName()
+                << "' (opcode=" << static_cast<int>(inst->GetOpcode())
+                << ", 基本块 '" << current_bb->GetName()
+                << "') 使用了未支配的操作数 '" << op_inst->GetName()
+                << "' (定义于 '" << def_bb->GetName() << "')";
+            LogError(oss.str(), std::cerr);
+            std::abort();
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+
+void VerifyIR(Module& module) {
+  // Verifies every function that actually has a body. External declarations
+  // and functions without blocks are ignored. VerifyFunction aborts the
+  // process on the first violation it finds.
+  for (const auto& func_ptr : module.GetFunctions()) {
+    auto* fn = func_ptr.get();
+    const bool has_body = !fn->IsExternal() && !fn->GetBlocks().empty();
+    if (!has_body) continue;
+    VerifyFunction(fn);
+  }
+}
+
+}  // namespace ir
diff --git a/src/ir/passes/IfConversion.cpp b/src/ir/passes/IfConversion.cpp
new file mode 100644
index 00000000..c5e8bf95
--- /dev/null
+++ b/src/ir/passes/IfConversion.cpp
@@ -0,0 +1,290 @@
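+// IfConversion: turns simple if-else diamonds into an arithmetic select
+// - looks for the pattern CondBr→T→Br→M where F==M
+// - safety: T must have a single predecessor (B) and contain only pure arithmetic (no Div/Mod/floating point)
+// - rewrites the phi as fv + (tv-fv)*zext(cond)
+// - together with CFGSimplify (which removes the empty blocks) loop bodies become a single BB that LoopUnroll can unroll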
+
+#include "ir/IR.h"
+
+#include <unordered_set>
+#include <vector>
+
+namespace ir {
+
+namespace {
+
+static Value* UnwrapCondition(Value* cond) {
+  for (int left = 2; left > 0; --left) {  // peel at most two wrappers
+    auto* cmp = dynamic_cast<BinaryInst*>(cond);  // expect `ne (zext x), 0`
+    if (!cmp || cmp->GetOpcode() != Opcode::Ne) return cond;
+    auto* zero = dynamic_cast<ConstantInt*>(cmp->GetRhs());
+    if (!zero || zero->GetValue() != 0) return cond;
+    auto* widen = dynamic_cast<CastInst*>(cmp->GetLhs());
+    if (!widen || widen->GetOpcode() != Opcode::ZExt) return cond;
+    cond = widen->GetOperandValue();  // continue with x
+  }
+  return cond;
+}
+
+static BasicBlock* GetOnlyBrTarget(BasicBlock* bb) {
+  // Target of bb's final BranchInst, or nullptr when there is none.
+  if (bb->GetInstructions().empty()) return nullptr;
+  auto* jump = dynamic_cast<BranchInst*>(bb->GetInstructions().back().get());
+  return jump ? jump->GetTarget() : nullptr;
+}
+
+static std::vector<BasicBlock*> ComputePredecessors(
+    BasicBlock* bb, const std::vector<std::unique_ptr<BasicBlock>>& all_blocks) {
+  // Blocks other than `bb` whose terminator can jump to `bb`, each listed once.
+  std::vector<BasicBlock*> result;
+  for (const auto& candidate : all_blocks) {
+    if (candidate.get() == bb || candidate->GetInstructions().empty()) continue;
+    auto* term = candidate->GetInstructions().back().get();
+    bool jumps_here = false;
+    if (auto* br = dynamic_cast<BranchInst*>(term)) {
+      jumps_here = br->GetTarget() == bb;
+    } else if (auto* cbr = dynamic_cast<CondBranchInst*>(term)) {
+      jumps_here = cbr->GetTrueTarget() == bb || cbr->GetFalseTarget() == bb;
+    }
+    if (jumps_here) result.push_back(candidate.get());
+  }
+  return result;
+}
+
+static bool IsSimpleBlock(BasicBlock* bb) {
+  // True when every instruction of bb has one of the whitelisted opcodes:
+  // Add/Sub/Mul, And/Or, the six comparisons, ZExt, or Br. Any other opcode
+  // (e.g. the Div and Mod that the file header rules out) makes the block
+  // ineligible. An empty block is reported as simple.
+  auto allowed = [](Opcode op) {
+    return op == Opcode::Add || op == Opcode::Sub || op == Opcode::Mul ||
+           op == Opcode::And || op == Opcode::Or || op == Opcode::Eq ||
+           op == Opcode::Ne || op == Opcode::Lt || op == Opcode::Le ||
+           op == Opcode::Gt || op == Opcode::Ge || op == Opcode::ZExt ||
+           op == Opcode::Br;
+  };
+  for (const auto& inst : bb->GetInstructions())
+    if (!allowed(inst->GetOpcode())) return false;
+  return true;
+}
+
+static Value* GetPhiValueFrom(PhiInst* phi, BasicBlock* bb) {
+  // Value that phi's (value, block) operand list pairs with bb, else nullptr.
+  for (size_t slot = 1; slot <= phi->GetNumOperands(); slot += 2)
+    if (dynamic_cast<BasicBlock*>(phi->GetOperand(slot)) == bb)
+      return phi->GetOperand(slot - 1);
+  return nullptr;
+}
+
+static void RemovePhiEntriesFrom(PhiInst* phi, BasicBlock* bb) {
+  // Drops every (value, block) pair of phi whose block is bb.
+  std::vector<std::pair<Value*, Value*>> survivors;
+  for (size_t i = 0; i < phi->GetNumOperands(); i += 2) {
+    Value* block_op = phi->GetOperand(i + 1);
+    if (dynamic_cast<BasicBlock*>(block_op) != bb)
+      survivors.emplace_back(phi->GetOperand(i), block_op);
+  }
+  if (survivors.size() * 2 == phi->GetNumOperands()) return;  // nothing dropped
+  phi->ClearOperands();
+  for (const auto& entry : survivors) {
+    phi->AddOperand(entry.first);
+    phi->AddOperand(entry.second);
+  }
+}
+
+static void SetPhiEntry(PhiInst* phi, BasicBlock* bb, Value* val) {
+  // Sets the value phi pairs with bb, appending (val, bb) if bb is absent.
+  size_t i = 0;
+  while (i < phi->GetNumOperands() &&
+         dynamic_cast<BasicBlock*>(phi->GetOperand(i + 1)) != bb)
+    i += 2;
+  if (i < phi->GetNumOperands()) { phi->SetOperand(i, val); return; }
+  phi->AddOperand(val);
+  phi->AddOperand(bb);
+}
+
+static bool TryConvertOneDiamond(BasicBlock* B, BasicBlock* T, BasicBlock* M,
+                                  Value* cond_i1, Context& ctx,
+                                  const std::vector<std::unique_ptr<BasicBlock>>& all_blocks) {
+  // Flattens the triangle B -(cond)-> T -> M, B -(!cond)-> M: T's
+  // instructions are hoisted into B and each phi in M that merges T and B
+  // becomes val_f + (val_t - val_f) * zext(cond). Returns true if the CFG
+  // was rewritten; nothing is modified when false is returned.
+  if (!IsSimpleBlock(T)) return false;
+  if (GetOnlyBrTarget(T) != M) return false;
+  auto t_preds = ComputePredecessors(T, all_blocks);
+  if (t_preds.size() != 1 || t_preds[0] != B) return false;
+
+  struct PhiEntry { PhiInst* phi; Value* val_t; Value* val_f; };
+  std::vector<PhiEntry> to_convert;
+  for (const auto& inst : M->GetInstructions()) {
+    auto* phi = dynamic_cast<PhiInst*>(inst.get());
+    if (!phi) break;
+    Value* val_t = GetPhiValueFrom(phi, T);
+    if (!val_t) continue;  // this phi does not merge anything from T
+    // The edge T -> M disappears below, so every phi fed by T has to be
+    // rewritten. A phi that cannot be (non-i32, or no incoming value from B)
+    // would keep an entry for a block that is no longer a predecessor and
+    // lose the choice between its values, so give up instead of skipping it.
+    Value* val_f = GetPhiValueFrom(phi, B);
+    if (!val_f || !phi->GetType()->IsInt32()) return false;
+    to_convert.push_back({phi, val_t, val_f});
+  }
+  if (to_convert.empty()) return false;
+
+  // Hoisting makes T's instructions unconditional; only proceed when all of
+  // them produce i32 / i1 / void results.
+  for (const auto& inst : T->GetInstructions()) {
+    if (inst->GetOpcode() == Opcode::Br) continue;
+    auto ty = inst->GetType();
+    // An instruction without type information is let through.
+    if (!ty) continue;
+    if (!ty->IsInt32() && !ty->IsInt1() && !ty->IsVoid()) {
+      return false;
+    }
+  }
+
+  auto* cbr = B->GetInstructions().back().get();
+  cbr->ClearOperands();  // drop operand references before destruction
+  B->TakeInstruction(cbr);
+
+  auto& t_insts = const_cast<std::vector<std::unique_ptr<Instruction>>&>(T->GetInstructions());
+  std::vector<Instruction*> t_to_move;
+  for (const auto& inst : t_insts)
+    if (inst->GetOpcode() != Opcode::Br)
+      t_to_move.push_back(inst.get());
+  for (auto* inst : t_to_move) {
+    auto taken = T->TakeInstruction(inst);
+    B->InsertInstructionBeforeTerminator(std::move(taken));
+  }
+  if (!T->GetInstructions().empty()) {
+    auto* last_inst = T->GetInstructions().back().get();
+    last_inst->ClearOperands();
+    T->TakeInstruction(last_inst);
+  }
+
+  for (auto& [phi, val_t, val_f] : to_convert) {
+    if (val_t == val_f) {
+      RemovePhiEntriesFrom(phi, T);
+      SetPhiEntry(phi, B, val_f);
+      continue;
+    }
+    auto* zext = B->Append<CastInst>(Opcode::ZExt, Type::GetInt32Type(), cond_i1, ctx.NextTemp());
+    auto* diff = B->Append<BinaryInst>(Opcode::Sub, Type::GetInt32Type(), val_t, val_f, ctx.NextTemp());
+    auto* masked = B->Append<BinaryInst>(Opcode::Mul, Type::GetInt32Type(), diff, zext, ctx.NextTemp());
+    auto* select_val = B->Append<BinaryInst>(Opcode::Add, Type::GetInt32Type(), val_f, masked, ctx.NextTemp());
+    RemovePhiEntriesFrom(phi, T);
+    SetPhiEntry(phi, B, select_val);
+  }
+
+  B->Append<BranchInst>(Type::GetVoidType(), M);
+  return true;
+}
+
+static void IfConvertFunction(Function* func, Context& ctx) {
+  // Converts one convertible conditional branch at a time and rescans from
+  // the first block after every success, until a full scan finds nothing
+  // left to convert.
+  auto& blocks = const_cast<std::vector<std::unique_ptr<BasicBlock>>&>(func->GetBlocks());
+  auto convert_first = [&]() {
+    for (const auto& bb : blocks) {
+      if (bb->GetInstructions().empty()) continue;
+      auto* cbr = dynamic_cast<CondBranchInst*>(bb->GetInstructions().back().get());
+      if (!cbr) continue;
+      Value* cond = UnwrapCondition(cbr->GetCond());
+      if (TryConvertOneDiamond(bb.get(), cbr->GetTrueTarget(),
+                               cbr->GetFalseTarget(), cond, ctx, blocks))
+        return true;
+    }
+    return false;
+  };
+  while (convert_first()) {
+  }
+}
+
+static void CleanupRedundantPhis(Function* func) {
+  // Removes leading phis whose incoming values are all the same Value,
+  // redirecting their users to that value.
+  for (const auto& bb : func->GetBlocks()) {
+    auto& insts = const_cast<std::vector<std::unique_ptr<Instruction>>&>(bb->GetInstructions());
+    size_t pos = 0;
+    while (pos < insts.size()) {
+      auto* phi = dynamic_cast<PhiInst*>(insts[pos].get());
+      if (!phi) break;  // scanning stops at the first non-phi instruction
+      Value* common = nullptr;
+      bool uniform = true;
+      for (size_t j = 0; uniform && j < phi->GetNumOperands(); j += 2) {
+        Value* incoming = phi->GetOperand(j);
+        if (common == nullptr) common = incoming;
+        else uniform = (common == incoming);
+      }
+      if (!uniform || common == nullptr) { ++pos; continue; }
+      phi->ReplaceAllUsesWith(common);
+      phi->ClearOperands();
+      insts.erase(insts.begin() + pos);  // next instruction slides into pos
+    }
+  }
+}
+
+static void MergeSinglePredBlocks(Function* func) {
+  // Folds a phi-free block into its only predecessor when that predecessor
+  // ends in an unconditional branch to it. One merge per scan; the scan is
+  // restarted until no candidate is left. The emptied block stays in the
+  // function's block list.
+  auto& blocks = const_cast<std::vector<std::unique_ptr<BasicBlock>>&>(func->GetBlocks());
+  auto merge_one = [&]() {
+    for (auto& bb_ptr : blocks) {
+      BasicBlock* bb = bb_ptr.get();
+      if (bb == func->GetEntry()) continue;
+      bool has_phi = false;
+      for (const auto& inst : bb->GetInstructions())
+        if (dynamic_cast<PhiInst*>(inst.get())) { has_phi = true; break; }
+      if (has_phi) continue;
+
+      auto preds = ComputePredecessors(bb, blocks);
+      if (preds.size() != 1 || preds[0] == bb) continue;
+      BasicBlock* pred = preds[0];
+      if (pred->GetInstructions().empty()) continue;
+      auto* jump = dynamic_cast<BranchInst*>(pred->GetInstructions().back().get());
+      if (!jump || jump->GetTarget() != bb) continue;
+
+      // Remove pred's branch, then append bb's instructions in order.
+      jump->ClearOperands();
+      pred->TakeInstruction(jump);
+      std::vector<Instruction*> moving;
+      for (auto& inst : bb->GetInstructions()) moving.push_back(inst.get());
+      for (auto* inst : moving)
+        pred->InsertInstructionBeforeTerminator(bb->TakeInstruction(inst));
+
+      // Phis elsewhere that named bb as incoming block now name pred.
+      for (auto& other : blocks) {
+        if (other.get() == bb) continue;
+        for (auto& inst : other->GetInstructions()) {
+          auto* phi = dynamic_cast<PhiInst*>(inst.get());
+          if (!phi) break;
+          for (size_t i = 0; i < phi->GetNumOperands(); i += 2)
+            if (dynamic_cast<BasicBlock*>(phi->GetOperand(i + 1)) == bb)
+              phi->SetOperand(i + 1, pred);
+        }
+      }
+
+      return true;
+    }
+    return false;
+  };
+  while (merge_one()) {
+  }
+}
+
+}  // namespace
+
+void RunIfConversion(Module& module) {
+  for (auto& func : module.GetFunctions())
+    if (auto* fn = func.get(); !fn->IsExternal()) {  // skip declarations
+      IfConvertFunction(fn, module.GetContext());    // diamonds -> arithmetic
+      CleanupRedundantPhis(fn);                      // single-valued phis
+      MergeSinglePredBlocks(fn);                     // straight-line chains
+    }
+}
+
+}  // namespace ir
diff --git a/src/ir/passes/LoopInterchange.cpp b/src/ir/passes/LoopInterchange.cpp
new file mode 100644
index 00000000..128f482a
--- /dev/null
+++ b/src/ir/passes/LoopInterchange.cpp
@@ -0,0 +1,1128 @@
+#include "ir/IR.h"
+
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <optional>
+#include <queue>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace ir {
+
+namespace {
+
+constexpr bool kDebugLoopInterchange = false;  // true: the pass logs its decisions to std::cerr
+constexpr bool kEnableTransform = true;  // checkpoint 3: enable the IR transformation
+
+// ===========================================================================
+// CFG 辅助函数（从 LICM.cpp 复用模式）
+// ===========================================================================
+
+std::vector<BasicBlock*> GetSuccessors(BasicBlock* bb) {
+  // Lists the CFG successors of `bb`, decided solely by its last instruction:
+  //   Br     -> { target }
+  //   CondBr -> { true target, false target }   (in this order)
+  // A null block, an empty block, or any other last instruction produces an
+  // empty list.
+  if (bb == nullptr) return {};
+
+  const auto& body = bb->GetInstructions();
+  if (body.empty()) return {};
+
+  auto* last = body.back().get();
+  if (last == nullptr) return {};
+
+  const auto op = last->GetOpcode();
+
+  if (op == Opcode::Br) {
+    auto* jump = static_cast<BranchInst*>(last);
+    return {jump->GetTarget()};
+  }
+
+  if (op == Opcode::CondBr) {
+    auto* fork = static_cast<CondBranchInst*>(last);
+    return {fork->GetTrueTarget(), fork->GetFalseTarget()};
+  }
+
+  return {};
+}
+
+std::vector<BasicBlock*> GetPredecessors(BasicBlock* bb, Function* func) {
+  // Collects, in function block order, every block that lists `bb` among
+  // its successors. A predecessor is reported once even when both arms of
+  // its conditional branch lead to `bb`.
+  std::vector<BasicBlock*> result;
+  if (bb == nullptr || func == nullptr) return result;
+
+  for (auto& candidate : func->GetBlocks()) {
+    const auto outs = GetSuccessors(candidate.get());
+    if (std::find(outs.begin(), outs.end(), bb) != outs.end()) {
+      result.push_back(candidate.get());
+    }
+  }
+
+  return result;
+}
+
+// ===========================================================================
+// 循环检测（从 LICM.cpp 复用）
+// ===========================================================================
+
+struct Loop {  // One loop record as produced by FindLoops.
+  BasicBlock* header = nullptr;     // target of the back edge
+  BasicBlock* preheader = nullptr;  // result of FindPreheader for this loop
+  std::set<BasicBlock*> blocks;     // members of the loop, header included
+};
+
+BasicBlock* FindPreheader(Loop* loop, Function* func) {
+  // Returns the loop's preheader, or nullptr when it has none.
+  //
+  // A block qualifies when both of these hold:
+  //   * it is the only predecessor of the header that lies outside
+  //     loop->blocks, and
+  //   * its single successor is the header.
+  BasicBlock* outside = nullptr;
+
+  for (auto* pred : GetPredecessors(loop->header, func)) {
+    if (loop->blocks.count(pred) != 0) continue;  // edge from inside the loop
+    if (outside != nullptr) return nullptr;        // second entry from outside
+    outside = pred;
+  }
+
+  if (outside == nullptr) return nullptr;
+
+  const auto outs = GetSuccessors(outside);
+  if (outs.size() != 1) return nullptr;
+  if (outs[0] != loop->header) return nullptr;
+
+  return outside;
+}
+
+std::vector<std::unique_ptr<Loop>> FindLoops(Function* func) {
+  // Finds the natural loops reachable from the entry block and returns those
+  // for which FindPreheader succeeds. One Loop is emitted per back edge, so a
+  // header with several latches appears several times.
+  std::vector<std::unique_ptr<Loop>> loops;
+
+  // DFS entry order (dfn) and exit order (finish) of every reachable block.
+  std::unordered_map<BasicBlock*, int> dfn;
+  std::unordered_map<BasicBlock*, int> finish;
+  std::vector<BasicBlock*> postorder;
+
+  std::set<BasicBlock*> visited;
+  std::vector<std::pair<BasicBlock*, size_t>> stack;  // (block, next successor)
+
+  if (func->GetBlocks().empty()) return loops;
+  auto* entry = func->GetEntry();
+  if (!entry) return loops;
+
+  stack.push_back({entry, 0});
+
+  // Iterative depth-first search.
+  while (!stack.empty()) {
+    auto& top = stack.back();
+    auto* bb = top.first;
+    auto& child_idx = top.second;
+
+    if (child_idx == 0) {
+      if (visited.count(bb)) {
+        stack.pop_back();
+        continue;
+      }
+      visited.insert(bb);
+      dfn.emplace(bb, static_cast<int>(dfn.size()));
+    }
+
+    auto succs = GetSuccessors(bb);
+    bool found_new = false;
+    while (child_idx < succs.size()) {
+      auto* succ = succs[child_idx];
+      child_idx++;
+      if (visited.count(succ) == 0) {
+        stack.push_back({succ, 0});  // invalidates `top` / `child_idx`
+        found_new = true;
+        break;
+      }
+    }
+
+    if (!found_new) {
+      finish.emplace(bb, static_cast<int>(postorder.size()));
+      postorder.push_back(bb);
+      stack.pop_back();
+    }
+  }
+
+  std::reverse(postorder.begin(), postorder.end());
+
+  // bb -> succ is a back edge only when succ is a DFS ancestor of bb (or bb
+  // itself): succ was entered no later than bb AND left no earlier. Comparing
+  // dfn alone also matches cross edges, e.g. the second arm of an if/else
+  // reaching the join block, which would report a loop that does not exist.
+  for (auto* bb : postorder) {
+    auto succs = GetSuccessors(bb);
+    for (auto* succ : succs) {
+      if (dfn.count(succ) == 0) continue;
+      if (dfn[succ] > dfn[bb] || finish[succ] < finish[bb]) continue;
+
+      auto loop = std::make_unique<Loop>();
+      loop->header = succ;
+
+      // Loop body: the header plus everything that reaches the latch `bb`
+      // backwards without passing through the header.
+      std::set<BasicBlock*> in_loop;
+      std::queue<BasicBlock*> worklist;
+      in_loop.insert(succ);
+      if (bb != succ) {
+        in_loop.insert(bb);
+        worklist.push(bb);
+      }
+
+      while (!worklist.empty()) {
+        auto* current = worklist.front();
+        worklist.pop();
+        for (auto* pred : GetPredecessors(current, func)) {
+          if (in_loop.insert(pred).second) worklist.push(pred);
+        }
+      }
+
+      loop->blocks = std::move(in_loop);
+      loop->preheader = FindPreheader(loop.get(), func);
+      if (loop->preheader) loops.push_back(std::move(loop));
+    }
+  }
+
+  return loops;
+}
+
+// ===========================================================================
+// 二维循环嵌套结构
+// ===========================================================================
+
+struct LoopNest {  // Blocks and key instructions of one two-level loop nest.
+  BasicBlock* outer_header = nullptr;
+  BasicBlock* inner_preheader = nullptr;   // outer body / inner preheader
+  BasicBlock* inner_header = nullptr;
+  BasicBlock* inner_body = nullptr;        // inner body + inner latch
+  BasicBlock* inner_exit = nullptr;        // inner exit / outer latch
+  BasicBlock* outer_exit = nullptr;
+
+  PhiInst* outer_iv_phi = nullptr;           // outer IV phi, taken from outer_header
+  PhiInst* inner_iv_phi = nullptr;           // inner IV phi, taken from inner_header
+  PhiInst* outer_passthrough_phi = nullptr;  // inner_header phi whose init is outer_iv_phi
+
+  Value* outer_bound = nullptr;              // RHS of outer_cmp
+  Value* inner_bound = nullptr;              // RHS of inner_cmp
+  BinaryInst* outer_cmp = nullptr;           // Lt/Le compare behind outer_condbr
+  BinaryInst* inner_cmp = nullptr;           // Lt/Le compare behind inner_condbr
+  CondBranchInst* outer_condbr = nullptr;    // terminator of outer_header
+  CondBranchInst* inner_condbr = nullptr;    // terminator of inner_header
+
+  BinaryInst* inner_inc = nullptr;           // Add feeding inner_iv_phi from inner_body
+  BinaryInst* outer_inc = nullptr;           // Add feeding outer_iv_phi from inner_exit
+};
+
+// ===========================================================================
+// 二维循环嵌套检测
+// ===========================================================================
+
+// Result of looking through `zext` + `icmp ne` to the real loop compare.
+// Pattern: condbr(icmp ne(zext(icmp slt(iv, bound)), 0))
+// Holds the real icmp slt/sle together with its LHS (IV) and RHS (bound).
+struct LoopCmpInfo {
+  BinaryInst* real_cmp = nullptr;  // icmp slt/sle; stays nullptr when nothing matched
+  Value* iv = nullptr;             // LHS of real_cmp
+  Value* bound = nullptr;          // RHS of real_cmp
+};
+
+LoopCmpInfo ExtractLoopCmp(CondBranchInst* condbr) {
+  // Looks through the wrapper around a loop condition:
+  //   condbr( ne( zext( lt|le(iv, bound) ), 0 ) )
+  // On a match the lt/le compare and its lhs (iv) / rhs (bound) are returned;
+  // on any mismatch every field of the result is nullptr.
+  const LoopCmpInfo no_match{};
+
+  // Outer layer: `ne x, 0` with a constant zero on the right.
+  auto* ne_cmp = dynamic_cast<BinaryInst*>(condbr->GetCond());
+  if (ne_cmp == nullptr || ne_cmp->GetOpcode() != Opcode::Ne) return no_match;
+  auto* rhs_zero = dynamic_cast<ConstantInt*>(ne_cmp->GetRhs());
+  if (rhs_zero == nullptr || rhs_zero->GetValue() != 0) return no_match;
+  // Middle layer: x is a zero-extension.
+  auto* widen = dynamic_cast<CastInst*>(ne_cmp->GetLhs());
+  if (widen == nullptr || widen->GetOpcode() != Opcode::ZExt) return no_match;
+  // Inner layer: the extended value is the lt/le compare itself.
+  auto* cmp = dynamic_cast<BinaryInst*>(widen->GetOperandValue());
+  if (cmp == nullptr) return no_match;
+  const auto cmp_op = cmp->GetOpcode();
+  if (cmp_op != Opcode::Lt && cmp_op != Opcode::Le) return no_match;
+
+  LoopCmpInfo found;
+  found.real_cmp = cmp;
+  found.iv = cmp->GetLhs();
+  found.bound = cmp->GetRhs();
+  return found;
+}
+
+std::optional<LoopNest> DetectTwoDLoopNest(Loop* outer_loop, Function* /*func*/) {
+  // Matches `outer_loop` against a five-block, two-level nest
+  //   outer_header -> inner_preheader -> inner_header <-> inner_body
+  //   inner_header -> inner_exit (outer latch) -> outer_header
+  // and records both IVs, their compares and increments. Returns
+  // std::nullopt as soon as any requirement fails.
+  LoopNest nest;
+
+  // Incoming value of `phi` for predecessor `from`, or nullptr if absent
+  auto IncomingFrom = [](PhiInst* phi, BasicBlock* from) -> Value* {
+    for (size_t i = 0; i < phi->GetNumOperands(); i += 2) {
+      if (dynamic_cast<BasicBlock*>(phi->GetOperand(i + 1)) == from) return phi->GetOperand(i);
+    }
+    return nullptr;
+  };
+  // True when `target` is one of the instructions stored in `block`
+  auto InBlock = [](BasicBlock* block, Instruction* target) {
+    for (auto& inst : block->GetInstructions()) {
+      if (inst.get() == target) return true;
+    }
+    return false;
+  };
+
+  // ---- Step 1: validate the outer header ----
+  nest.outer_header = outer_loop->header;
+
+  // 1a: locate the condbr (needed first; the compare is extracted from it)
+  if (!nest.outer_header->HasTerminator()) return std::nullopt;
+  auto* outer_term = nest.outer_header->GetInstructions().back().get();
+  nest.outer_condbr = dynamic_cast<CondBranchInst*>(outer_term);
+  if (!nest.outer_condbr) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 外层 header 非 condbr\n";
+    return std::nullopt;
+  }
+
+  // 1b: look through zext + icmp ne to the real loop compare
+  auto outer_cmp_info = ExtractLoopCmp(nest.outer_condbr);
+  if (!outer_cmp_info.real_cmp) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 外层 condbr 无法提取循环比较\n";
+    return std::nullopt;
+  }
+  nest.outer_cmp = outer_cmp_info.real_cmp;
+  nest.outer_bound = outer_cmp_info.bound;
+
+  // 1c: collect two-input phis; the outer IV is the one used as compare LHS
+  std::vector<PhiInst*> outer_phis;
+  for (auto& inst : nest.outer_header->GetInstructions()) {
+    auto* phi = dynamic_cast<PhiInst*>(inst.get());
+    if (!phi) break;
+    if (phi->GetNumOperands() == 4) outer_phis.push_back(phi);
+  }
+  for (auto* phi : outer_phis) {
+    if (phi == outer_cmp_info.iv) {
+      nest.outer_iv_phi = phi;
+      break;
+    }
+  }
+  if (!nest.outer_iv_phi) {
+    if (outer_phis.empty()) {
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] 外层 header 无有效 phi\n";
+      return std::nullopt;
+    }
+    nest.outer_iv_phi = outer_phis[0];  // fallback: first two-input phi
+  }
+
+  // Tell inner_preheader from outer_exit: exactly one target is in the loop
+  auto* outer_true = nest.outer_condbr->GetTrueTarget();
+  auto* outer_false = nest.outer_condbr->GetFalseTarget();
+  const bool true_in_loop = outer_loop->blocks.count(outer_true) != 0;
+  const bool false_in_loop = outer_loop->blocks.count(outer_false) != 0;
+  if (true_in_loop == false_in_loop) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 外层 condbr 无法区分 preheader/exit\n";
+    return std::nullopt;
+  }
+  nest.inner_preheader = true_in_loop ? outer_true : outer_false;
+  nest.outer_exit = true_in_loop ? outer_false : outer_true;
+
+  // ---- Step 2: validate inner_preheader (the outer body) ----
+  // It must end in an unconditional branch; the target is inner_header
+  if (!nest.inner_preheader->HasTerminator()) return std::nullopt;
+  auto* ph_term = nest.inner_preheader->GetInstructions().back().get();
+  auto* ph_br = dynamic_cast<BranchInst*>(ph_term);
+  if (!ph_br) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] inner_preheader terminator 非 br\n";
+    return std::nullopt;
+  }
+  nest.inner_header = ph_br->GetTarget();
+
+  // Other instructions are tolerated here (relaxed perfect-nest rule) except
+  // phis (not a natural loop) and calls / stores (side effects outside the
+  // inner body, which swapping the two loops would not preserve)
+  for (auto& inst : nest.inner_preheader->GetInstructions()) {
+    if (inst->IsTerminator()) break;
+    if (dynamic_cast<PhiInst*>(inst.get())) {
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] inner_preheader 含 phi → 非 perfect nest\n";
+      return std::nullopt;
+    }
+    if (dynamic_cast<CallInst*>(inst.get())) {
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] inner_preheader 含 call → 非 perfect nest\n";
+      return std::nullopt;
+    }
+    if (dynamic_cast<StoreInst*>(inst.get())) {
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] inner_preheader has store -> not a perfect nest\n";
+      return std::nullopt;
+    }
+  }
+
+  // ---- Step 3: validate the inner header ----
+  // 3a: locate the condbr and extract the compare
+  if (!nest.inner_header->HasTerminator()) return std::nullopt;
+  auto* inner_term = nest.inner_header->GetInstructions().back().get();
+  nest.inner_condbr = dynamic_cast<CondBranchInst*>(inner_term);
+  if (!nest.inner_condbr) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 内层 header 非 condbr\n";
+    return std::nullopt;
+  }
+
+  auto inner_cmp_info = ExtractLoopCmp(nest.inner_condbr);
+  if (!inner_cmp_info.real_cmp) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 内层 condbr 无法提取循环比较\n";
+    return std::nullopt;
+  }
+  nest.inner_cmp = inner_cmp_info.real_cmp;
+  nest.inner_bound = inner_cmp_info.bound;
+
+  // 3b: collect phis; separate the inner IV from the outer-IV passthrough
+  std::vector<PhiInst*> phis;
+  for (auto& inst : nest.inner_header->GetInstructions()) {
+    auto* phi = dynamic_cast<PhiInst*>(inst.get());
+    if (!phi) break;
+    if (phi->GetNumOperands() == 4) phis.push_back(phi);
+  }
+  if (phis.size() < 2) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 内层 header phi 数量 < 2 (实际 "
+                << phis.size() << ")\n";
+    return std::nullopt;
+  }
+
+  // A self-referencing phi is a passthrough; deeper nests may have several
+  // (e.g. rep -> i -> j). The outer-IV passthrough is the one whose initial
+  // value is outer_iv_phi.
+  for (auto* phi : phis) {
+    Value* init_val = nullptr;
+    Value* latch_val = nullptr;
+    BasicBlock* latch_bb = nullptr;
+    for (size_t i = 0; i < phi->GetNumOperands(); i += 2) {
+      auto* bb = dynamic_cast<BasicBlock*>(phi->GetOperand(i + 1));
+      if (bb == nest.inner_preheader) {
+        init_val = phi->GetOperand(i);
+      } else {
+        latch_val = phi->GetOperand(i);
+        latch_bb = bb;
+      }
+    }
+    if (latch_val == phi) {  // self-reference = some passthrough
+      // inner_body is the latch block of the first self-referencing phi
+      if (!nest.inner_body && latch_bb) {
+        nest.inner_body = latch_bb;
+      }
+      // initial value is outer_iv_phi -> this is the outer-IV passthrough
+      if (init_val == nest.outer_iv_phi) {
+        nest.outer_passthrough_phi = phi;
+      }
+    }
+  }
+  if (!nest.outer_passthrough_phi) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 未找到 outer IV passthrough phi\n";
+    return std::nullopt;
+  }
+
+  // Inner IV: the non-passthrough phi that is the compare LHS
+  for (auto* phi : phis) {
+    if (phi == nest.outer_passthrough_phi) continue;
+    if (phi == inner_cmp_info.iv) {
+      nest.inner_iv_phi = phi;
+      break;
+    }
+  }
+  // Fallback: first phi other than the outer-IV passthrough
+  if (!nest.inner_iv_phi) {
+    for (auto* phi : phis) {
+      if (phi != nest.outer_passthrough_phi) {
+        nest.inner_iv_phi = phi;
+        break;
+      }
+    }
+  }
+  if (!nest.inner_iv_phi) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 未找到 inner IV phi\n";
+    return std::nullopt;
+  }
+
+  // Determine inner_exit: the inner condbr target that is not inner_body
+  auto* inner_true = nest.inner_condbr->GetTrueTarget();
+  auto* inner_false = nest.inner_condbr->GetFalseTarget();
+  if (inner_true == nest.inner_body) {
+    nest.inner_exit = inner_false;
+  } else if (inner_false == nest.inner_body) {
+    nest.inner_exit = inner_true;
+  } else {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 内层 condbr 目标异常\n";
+    return std::nullopt;
+  }
+
+  // ---- Step 4: validate inner_body ----
+  // Its terminator must be an unconditional br back to inner_header
+  if (!nest.inner_body->HasTerminator()) return std::nullopt;
+  auto* body_term = nest.inner_body->GetInstructions().back().get();
+  auto* body_br = dynamic_cast<BranchInst*>(body_term);
+  if (!body_br || body_br->GetTarget() != nest.inner_header) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] inner_body terminator 非 br → inner_header\n";
+    return std::nullopt;
+  }
+
+  // Find inner_inc: the Add that feeds inner_iv_phi from inner_body
+  nest.inner_inc = dynamic_cast<BinaryInst*>(IncomingFrom(nest.inner_iv_phi, nest.inner_body));
+  if (!nest.inner_inc || nest.inner_inc->GetOpcode() != Opcode::Add) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] inner_inc 非 Add 指令\n";
+    return std::nullopt;
+  }
+  // Require inner_iv + constant, located in inner_body (the transform takes
+  // the instruction out of that block)
+  if (!(InBlock(nest.inner_body, nest.inner_inc) &&
+        nest.inner_inc->GetLhs() == nest.inner_iv_phi &&
+        dynamic_cast<ConstantInt*>(nest.inner_inc->GetRhs()))) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] inner_inc 模式非 iv + const\n";
+    return std::nullopt;
+  }
+
+  // ---- Step 5: validate inner_exit (the outer latch) ----
+  if (!nest.inner_exit->HasTerminator()) return std::nullopt;
+  auto* exit_term = nest.inner_exit->GetInstructions().back().get();
+  auto* exit_br = dynamic_cast<BranchInst*>(exit_term);
+  if (!exit_br || exit_br->GetTarget() != nest.outer_header) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] inner_exit terminator 非 br → outer_header\n";
+    return std::nullopt;
+  }
+  // Stores / calls in the outer latch make the nest imperfect as well
+  for (auto& inst : nest.inner_exit->GetInstructions()) {
+    if (inst->IsTerminator()) break;
+    if (dynamic_cast<StoreInst*>(inst.get()) || dynamic_cast<CallInst*>(inst.get())) {
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] inner_exit has store/call -> not a perfect nest\n";
+      return std::nullopt;
+    }
+  }
+
+  // Find outer_inc: the Add that feeds outer_iv_phi from inner_exit
+  nest.outer_inc = dynamic_cast<BinaryInst*>(IncomingFrom(nest.outer_iv_phi, nest.inner_exit));
+  if (!nest.outer_inc || nest.outer_inc->GetOpcode() != Opcode::Add) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] outer_inc 非 Add 指令\n";
+    return std::nullopt;
+  }
+  // Require (outer_iv_phi | outer_passthrough_phi) + constant, located in
+  // inner_exit (the transform takes the instruction out of that block)
+  Value* outer_inc_lhs = nest.outer_inc->GetLhs();
+  if (!(InBlock(nest.inner_exit, nest.outer_inc) &&
+        (outer_inc_lhs == nest.outer_iv_phi ||
+         outer_inc_lhs == nest.outer_passthrough_phi) &&
+        dynamic_cast<ConstantInt*>(nest.outer_inc->GetRhs()))) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] outer_inc 模式非 iv + const\n";
+    return std::nullopt;
+  }
+
+  // ---- Step 6: both loops must cover the same, loop-invariant range ----
+  // The transform seeds the new outer IV with the old outer initial value
+  // and relies on one common bound, so predicate, initial value and bound
+  // all have to agree between the two loops.
+  if (nest.outer_cmp->GetOpcode() != nest.inner_cmp->GetOpcode()) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] outer/inner compare opcodes differ\n";
+    return std::nullopt;
+  }
+  Value* outer_init = nullptr;
+  for (size_t i = 0; i < nest.outer_iv_phi->GetNumOperands(); i += 2) {
+    auto* bb = dynamic_cast<BasicBlock*>(nest.outer_iv_phi->GetOperand(i + 1));
+    if (bb != nest.inner_exit) outer_init = nest.outer_iv_phi->GetOperand(i);
+  }
+  Value* inner_init = IncomingFrom(nest.inner_iv_phi, nest.inner_preheader);
+  auto* outer_init_const = dynamic_cast<ConstantInt*>(outer_init);
+  auto* inner_init_const = dynamic_cast<ConstantInt*>(inner_init);
+  const bool same_init =
+      outer_init && inner_init &&
+      (outer_init == inner_init ||
+       (outer_init_const && inner_init_const &&
+        outer_init_const->GetValue() == inner_init_const->GetValue()));
+  if (!same_init) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] outer/inner initial values differ\n";
+    return std::nullopt;
+  }
+
+  // Blocks of the nest, used by the loop-invariance test below
+  std::set<BasicBlock*> nest_blocks{nest.outer_header, nest.inner_preheader,
+                                    nest.inner_header, nest.inner_body,
+                                    nest.inner_exit};
+  auto IsLoopInvariant = [&](Value* val) -> bool {
+    auto* inst = dynamic_cast<Instruction*>(val);
+    if (!inst) return true;  // Constant/Argument/GlobalVariable
+    return nest_blocks.count(inst->GetParent()) == 0;
+  };
+
+  auto* outer_const = dynamic_cast<ConstantInt*>(nest.outer_bound);
+  auto* inner_const = dynamic_cast<ConstantInt*>(nest.inner_bound);
+  if (outer_const && inner_const) {
+    // Two constants: their values must match
+    if (outer_const->GetValue() != inner_const->GetValue()) {
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] 外层边界(" << outer_const->GetValue()
+                  << ") != 内层边界(" << inner_const->GetValue() << ")\n";
+      return std::nullopt;
+    }
+  } else if (nest.outer_bound == nest.inner_bound) {
+    // Same SSA value: it must be defined outside the nest
+    if (!IsLoopInvariant(nest.outer_bound)) {
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] 边界不是 loop-invariant\n";
+      return std::nullopt;
+    }
+  } else {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 边界非 ConstantInt 且不相等\n";
+    return std::nullopt;
+  }
+
+  // ---- Step 7: inner_body must be call-free ----
+  // Step 4 already pinned its terminator to `br inner_header`, and the scan
+  // stops at the terminator, so no conditional branch can be met here.
+  for (auto& inst : nest.inner_body->GetInstructions()) {
+    if (inst->IsTerminator()) break;
+    if (dynamic_cast<CallInst*>(inst.get())) {
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] inner_body 含 call → 非 perfect nest\n";
+      return std::nullopt;
+    }
+  }
+
+  if (kDebugLoopInterchange) {
+    std::cerr << "[LoopInterchange] ✅ 检测到二维 perfect nest:\n";
+    std::cerr << "  outer_header: " << nest.outer_header->GetName() << "\n";
+    std::cerr << "  inner_preheader: " << nest.inner_preheader->GetName() << "\n";
+    std::cerr << "  inner_header: " << nest.inner_header->GetName() << "\n";
+    std::cerr << "  inner_body: " << nest.inner_body->GetName() << "\n";
+    std::cerr << "  inner_exit: " << nest.inner_exit->GetName() << "\n";
+    std::cerr << "  outer_exit: " << nest.outer_exit->GetName() << "\n";
+    std::cerr << "  outer_iv: " << nest.outer_iv_phi->GetName() << "\n";
+    std::cerr << "  inner_iv: " << nest.inner_iv_phi->GetName() << "\n";
+    std::cerr << "  passthrough: " << nest.outer_passthrough_phi->GetName() << "\n";
+    if (auto* oc = dynamic_cast<ConstantInt*>(nest.outer_bound))
+      std::cerr << "  bound: " << oc->GetValue() << "\n";
+    else
+      std::cerr << "  bound: " << nest.outer_bound->GetName() << " (loop-invariant)\n";
+  }
+
+  return nest;
+}
+
+// ===========================================================================
+// 合法性判定
+// ===========================================================================
+
+// Declared early: ComputeIVCoefficients is defined further down the file
+struct IVCoefficients {  // result type of ComputeIVCoefficients
+  int outer_coeff = 0;   // multiplier found for the outer IV
+  int inner_coeff = 0;   // multiplier found for the inner IV
+};
+IVCoefficients ComputeIVCoefficients(Value* expr, PhiInst* outer_iv, PhiInst* inner_iv);
+
+enum class RejectReason {  // reason codes; RejectReasonStr gives each a text label
+  kNotRejected,        // label "not-rejected"
+  kNotPerfectNest,     // label "not-perfect-nest"
+  kDependenceIllegal,  // label "dependence-illegal"
+  kComplexCFG,         // label "complex-cfg"
+  kDifferentBounds,    // label "different-bounds"
+  kHasCall,            // label "has-call"
+};
+
+const char* RejectReasonStr(RejectReason r) {
+  // Maps each RejectReason to a fixed lowercase, dash-separated label.
+  if (r == RejectReason::kNotRejected) return "not-rejected";
+  if (r == RejectReason::kNotPerfectNest) return "not-perfect-nest";
+  if (r == RejectReason::kDependenceIllegal) return "dependence-illegal";
+  if (r == RejectReason::kComplexCFG) return "complex-cfg";
+  if (r == RejectReason::kDifferentBounds) return "different-bounds";
+  if (r == RejectReason::kHasCall) return "has-call";
+  // Any value outside the enumerators above.
+  return "unknown";
+}
+
+// Returns true when `a` and `b` are the same value, or structurally identical
+// trees of BinaryInst over identical leaves / equal ConstantInts, so that both
+// always evaluate to the same number. `depth` bounds the recursion; running
+// out of depth counts as "not proven equal".
+bool SameIndexExpression(Value* a, Value* b, int depth = 8) {
+  if (a == b) return true;
+  if (!a || !b || depth == 0) return false;
+
+  auto* const_a = dynamic_cast<ConstantInt*>(a);
+  auto* const_b = dynamic_cast<ConstantInt*>(b);
+  if (const_a || const_b) {
+    return const_a && const_b && const_a->GetValue() == const_b->GetValue();
+  }
+
+  auto* bin_a = dynamic_cast<BinaryInst*>(a);
+  auto* bin_b = dynamic_cast<BinaryInst*>(b);
+  if (!bin_a || !bin_b || bin_a->GetOpcode() != bin_b->GetOpcode()) return false;
+  return SameIndexExpression(bin_a->GetLhs(), bin_b->GetLhs(), depth - 1) &&
+         SameIndexExpression(bin_a->GetRhs(), bin_b->GetRhs(), depth - 1);
+}
+
+bool IsLegalToInterchange(const LoopNest& nest) {
+  // Conservative legality test over inner_body. Returns false when
+  //   * the body contains a call,
+  //   * inner_header carries an extra two-input phi that is not a plain
+  //     passthrough (reduction or unknown pattern),
+  //   * a load or store address is not a GEP, or
+  //   * a store and another access to the same GEP base use indices that
+  //     cannot be shown to be the same expression.
+  // Returns true otherwise, including when the body has no stores at all.
+  std::vector<LoadInst*> loads;
+  std::vector<StoreInst*> stores;
+
+  for (auto& inst : nest.inner_body->GetInstructions()) {
+    if (inst->IsTerminator()) break;
+    if (auto* l = dynamic_cast<LoadInst*>(inst.get())) loads.push_back(l);
+    if (auto* s = dynamic_cast<StoreInst*>(inst.get())) stores.push_back(s);
+    if (dynamic_cast<CallInst*>(inst.get())) {
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] ❌ inner_body 含 CallInst\n";
+      return false;
+    }
+  }
+
+  // 1. Extra two-input phis of inner_header (beyond the inner IV and the
+  //    outer-IV passthrough) are accepted only when the value arriving from
+  //    inner_body is the phi itself, i.e. another passthrough. Any other
+  //    latch value is a reduction or something this pass cannot analyse.
+  for (auto& inst : nest.inner_header->GetInstructions()) {
+    auto* phi = dynamic_cast<PhiInst*>(inst.get());
+    if (!phi) break;
+    if (phi->GetNumOperands() != 4) continue;
+    if (phi == nest.inner_iv_phi || phi == nest.outer_passthrough_phi) continue;
+
+    for (size_t i = 0; i < phi->GetNumOperands(); i += 2) {
+      if (dynamic_cast<BasicBlock*>(phi->GetOperand(i + 1)) != nest.inner_body) continue;
+      auto* latch_val = phi->GetOperand(i);
+      if (latch_val == phi) break;  // passthrough: fine
+
+      auto* bin = dynamic_cast<BinaryInst*>(latch_val);
+      if (bin && (bin->GetLhs() == phi || bin->GetRhs() == phi)) {
+        if (kDebugLoopInterchange)
+          std::cerr << "[LoopInterchange] ❌ 检测到归约 phi: "
+                    << phi->GetName() << "\n";
+        return false;
+      }
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] ❌ 额外 phi " << phi->GetName()
+                  << " 的 latch 非简单模式\n";
+      return false;
+    }
+  }
+
+  // 2. A load or store whose address is not a GEP is treated as a scalar
+  //    (accumulator) access and blocks the interchange. All offenders are
+  //    logged before returning.
+  bool has_scalar_access = false;
+  for (auto* store : stores) {
+    if (!dynamic_cast<GetElementPtrInst*>(store->GetPtr())) {
+      has_scalar_access = true;
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] ❌ inner_body 含标量 store → 归约\n";
+    }
+  }
+  for (auto* load : loads) {
+    if (!dynamic_cast<GetElementPtrInst*>(load->GetPtr())) {
+      has_scalar_access = true;
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] ❌ inner_body 含标量 load → 归约\n";
+    }
+  }
+  if (has_scalar_access) return false;
+
+  // 3. Memory dependences. Every address is a GEP from here on. Two accesses
+  //    to the same base are accepted only when their indices are provably the
+  //    same expression, i.e. they touch the same element in the same
+  //    iteration. Indices that merely share IV coefficients differ by a
+  //    constant: that is a dependence carried across iterations whose
+  //    direction is not analysed here, and ComputeIVCoefficients also
+  //    reports {0, 0} for expressions it does not understand, so equal
+  //    coefficients are not accepted as proof of safety.
+  // NOTE(review): distinct bases are assumed not to alias, and an index that
+  // revisits an element in a later iteration (e.g. i + j) is not detected.
+  auto MayConflict = [](GetElementPtrInst* a, GetElementPtrInst* b) {
+    if (a->GetBasePtr() != b->GetBasePtr()) return false;  // different arrays
+    return !SameIndexExpression(a->GetIndex(), b->GetIndex());
+  };
+
+  for (auto* store : stores) {
+    auto* store_gep = dynamic_cast<GetElementPtrInst*>(store->GetPtr());
+
+    // Flow / anti dependences: this store against every load.
+    for (auto* load : loads) {
+      auto* load_gep = dynamic_cast<GetElementPtrInst*>(load->GetPtr());
+      if (!MayConflict(store_gep, load_gep)) continue;
+      if (kDebugLoopInterchange) {
+        auto store_coeff = ComputeIVCoefficients(store_gep->GetIndex(),
+                                                 nest.outer_passthrough_phi,
+                                                 nest.inner_iv_phi);
+        auto load_coeff = ComputeIVCoefficients(load_gep->GetIndex(),
+                                                nest.outer_passthrough_phi,
+                                                nest.inner_iv_phi);
+        std::cerr << "[LoopInterchange] ❌ load/store 同数组不同 offset 模式 → 保守拒绝"
+                  << " (s_inner=" << store_coeff.inner_coeff
+                  << " l_inner=" << load_coeff.inner_coeff << ")\n";
+      }
+      return false;
+    }
+
+    // Output dependences: this store against every other store.
+    for (auto* other : stores) {
+      if (other == store) continue;
+      auto* other_gep = dynamic_cast<GetElementPtrInst*>(other->GetPtr());
+      if (!MayConflict(store_gep, other_gep)) continue;
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] ❌ store/store same array, different index\n";
+      return false;
+    }
+  }
+
+  // No call, no reduction, no scalar access and no unproven same-array pair.
+  return true;
+}
+
+// ===========================================================================
+// 收益分析（停点二实现）
+// ===========================================================================
+
+IVCoefficients ComputeIVCoefficients(Value* expr,
+                                     PhiInst* outer_iv,
+                                     PhiInst* inner_iv) {
+  // Derives the multipliers of the two induction variables in `expr`.
+  // Understood shapes: the IVs themselves, ConstantInt, Add of two
+  // sub-expressions, and Mul where one operand is a ConstantInt. Everything
+  // else yields {0, 0}, so "unknown" and "independent of both IVs" are
+  // indistinguishable to the caller.
+  const IVCoefficients none{0, 0};
+
+  if (expr == outer_iv) return {1, 0};
+  if (expr == inner_iv) return {0, 1};
+  if (dynamic_cast<ConstantInt*>(expr) != nullptr) return none;
+
+  auto* inst = dynamic_cast<Instruction*>(expr);
+  if (inst == nullptr) return none;  // not an instruction: nothing to walk
+
+  switch (inst->GetOpcode()) {
+    case Opcode::Add: {
+      // Coefficients of a sum add up.
+      auto* sum = static_cast<BinaryInst*>(inst);
+      const auto left = ComputeIVCoefficients(sum->GetLhs(), outer_iv, inner_iv);
+      const auto right = ComputeIVCoefficients(sum->GetRhs(), outer_iv, inner_iv);
+      return {left.outer_coeff + right.outer_coeff,
+              left.inner_coeff + right.inner_coeff};
+    }
+    case Opcode::Mul: {
+      // A constant factor scales the other operand's coefficients. The lhs
+      // is tried as the constant first, then the rhs.
+      auto* product = static_cast<BinaryInst*>(inst);
+      auto* factor = dynamic_cast<ConstantInt*>(product->GetLhs());
+      Value* scaled = product->GetRhs();
+      if (factor == nullptr) {
+        factor = dynamic_cast<ConstantInt*>(product->GetRhs());
+        scaled = product->GetLhs();
+      }
+      if (factor == nullptr) break;  // neither operand is constant
+      const int scale = factor->GetValue();
+      const auto sub = ComputeIVCoefficients(scaled, outer_iv, inner_iv);
+      return {scale * sub.outer_coeff, scale * sub.inner_coeff};
+    }
+    default:
+      break;
+  }
+
+  return none;  // unrecognised pattern
+}
+
+bool IsProfitableToInterchange(const LoopNest& nest) {
+  // Strict profitability rule over the GEP-addressed loads and stores of
+  // inner_body. For each access the IV coefficients of the GEP index are
+  // compared:
+  //   outer == 0 or inner == outer -> unaffected by the swap
+  //   inner  > outer               -> counted as improving
+  //   otherwise                    -> counted as degrading
+  // The swap is reported profitable only if nothing degrades, something
+  // improves and the summed (inner - outer) score is positive. Mixed cases
+  // such as a transpose B[i][j] = A[j][i] are therefore left alone.
+  int improved = 0;
+  int degraded = 0;
+  int score_sum = 0;
+
+  for (auto& inst : nest.inner_body->GetInstructions()) {
+    if (inst->IsTerminator()) break;
+
+    auto* as_load = dynamic_cast<LoadInst*>(inst.get());
+    auto* as_store = as_load ? nullptr : dynamic_cast<StoreInst*>(inst.get());
+    if (!as_load && !as_store) continue;
+
+    GetElementPtrInst* gep = nullptr;
+    if (as_load) {
+      gep = dynamic_cast<GetElementPtrInst*>(as_load->GetPtr());
+    } else {
+      gep = dynamic_cast<GetElementPtrInst*>(as_store->GetPtr());
+    }
+    if (!gep) continue;  // non-GEP addresses do not take part in the score
+
+    const auto coeff = ComputeIVCoefficients(gep->GetIndex(),
+                                             nest.outer_passthrough_phi,
+                                             nest.inner_iv_phi);
+    const int contrib = coeff.inner_coeff - coeff.outer_coeff;
+    score_sum += contrib;
+
+    const bool unaffected =
+        coeff.outer_coeff == 0 || coeff.inner_coeff == coeff.outer_coeff;
+    if (!unaffected) {
+      if (coeff.inner_coeff > coeff.outer_coeff) ++improved;
+      else ++degraded;
+    }
+
+    if (kDebugLoopInterchange) {
+      std::cerr << "[LoopInterchange] GEP base="
+                << (gep->GetBasePtr() ? gep->GetBasePtr()->GetName() : "null")
+                << " " << (as_load ? "load" : "store")
+                << " inner_coeff=" << coeff.inner_coeff
+                << " outer_coeff=" << coeff.outer_coeff
+                << " contrib=" << contrib
+                << "\n";
+    }
+  }
+
+  return degraded == 0 && improved > 0 && score_sum > 0;
+}
+
+// ===========================================================================
+// IR 变换（停点三实现）
+// ===========================================================================
+
+void PerformInterchange(LoopNest& nest, Function* func, Context& ctx) {
+  // Swaps the two loops of a detected 2-D nest in place: afterwards the old
+  // inner IV (j) is carried by outer_header and the old outer IV (i) by
+  // inner_header. Nests whose start values cannot be relocated are left alone.
+  (void)func;  // part of the pass interface; the rewrite itself does not use it
+  if constexpr (!kEnableTransform) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] kEnableTransform=false, 跳过变换\n";
+    return;
+  }
+
+  // ---- Step 0: read both start values BEFORE touching the IR, so that an
+  // unsupported nest is rejected while the function is still intact.
+  Value* outer_init_val = nullptr;      // start value of i (old outer IV)
+  BasicBlock* outer_init_bb = nullptr;  // block entering outer_header
+  for (size_t i = 0; i + 1 < nest.outer_iv_phi->GetNumOperands(); i += 2) {
+    auto* bb = dynamic_cast<BasicBlock*>(nest.outer_iv_phi->GetOperand(i + 1));
+    if (bb != nest.inner_exit) {
+      outer_init_val = nest.outer_iv_phi->GetOperand(i);
+      outer_init_bb = bb;
+    }
+  }
+
+  Value* inner_init_val = nullptr;      // start value of j (old inner IV)
+  for (size_t i = 0; i + 1 < nest.inner_iv_phi->GetNumOperands(); i += 2) {
+    auto* bb = dynamic_cast<BasicBlock*>(nest.inner_iv_phi->GetOperand(i + 1));
+    if (bb == nest.inner_preheader) {
+      inner_init_val = nest.inner_iv_phi->GetOperand(i);
+      break;
+    }
+  }
+  // j's start value becomes an incoming value of outer_header, so it must not
+  // be produced by an instruction that lives inside the nest.
+  bool inner_init_in_nest = false;
+  if (auto* def = dynamic_cast<Instruction*>(inner_init_val)) {
+    BasicBlock* home = def->GetParent();
+    inner_init_in_nest = home == nest.outer_header ||
+                         home == nest.inner_preheader ||
+                         home == nest.inner_header ||
+                         home == nest.inner_body || home == nest.inner_exit;
+  }
+  if (!outer_init_val || !outer_init_bb || !inner_init_val ||
+      inner_init_in_nest) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] skipped: unsupported IV start value\n";
+    return;
+  }
+
+  if (kDebugLoopInterchange) {
+    std::cerr << "[LoopInterchange] 🔄 执行循环交换变换...\n";
+  }
+
+  // ---- Step 1: swap the increments.
+  //   inner_inc (j++): inner_body -> inner_exit
+  //   outer_inc (i++): inner_exit -> inner_body
+  // Bounds and compares are cached first, before any SSA rewriting.
+  Value* saved_outer_bound = nest.outer_bound;
+  Value* saved_inner_bound = nest.inner_bound;
+  BinaryInst* saved_inner_cmp = nest.inner_cmp;
+  BinaryInst* saved_outer_cmp = nest.outer_cmp;
+
+  auto inner_inc_owned = nest.inner_body->TakeInstruction(nest.inner_inc);
+  nest.inner_exit->InsertInstructionBeforeTerminator(std::move(inner_inc_owned));
+
+  auto outer_inc_owned = nest.inner_exit->TakeInstruction(nest.outer_inc);
+  nest.inner_body->InsertInstructionBeforeTerminator(std::move(outer_inc_owned));
+
+  // ---- Step 2: new j-phi in outer_header (the new outer IV). It starts at
+  // j's own start value and is advanced by j++, which now sits in inner_exit.
+  auto* new_outer_phi = nest.outer_header->Prepend<PhiInst>(
+      Type::GetInt32Type(), ctx.NextTemp());
+  new_outer_phi->AddOperand(inner_init_val);
+  new_outer_phi->AddOperand(outer_init_bb);
+  new_outer_phi->AddOperand(nest.inner_inc);
+  new_outer_phi->AddOperand(nest.inner_exit);
+
+  // ---- Step 3: new i-phi in inner_header (the new inner IV). It restarts
+  // from i's own start value each time the inner loop is entered.
+  auto* new_inner_phi = nest.inner_header->Prepend<PhiInst>(
+      Type::GetInt32Type(), ctx.NextTemp());
+  new_inner_phi->AddOperand(outer_init_val);
+  new_inner_phi->AddOperand(nest.inner_preheader);
+  new_inner_phi->AddOperand(nest.outer_inc);    // i++ (now in inner_body)
+  new_inner_phi->AddOperand(nest.inner_body);
+
+  // Self-referencing phi that hands the current outer j to the inner body.
+  auto* new_passthrough = nest.inner_header->Prepend<PhiInst>(
+      Type::GetInt32Type(), ctx.NextTemp());
+  new_passthrough->AddOperand(new_outer_phi);   // entry: current outer j
+  new_passthrough->AddOperand(nest.inner_preheader);
+  new_passthrough->AddOperand(new_passthrough); // latch: itself (unchanged)
+  new_passthrough->AddOperand(nest.inner_body);
+
+  // ---- Step 4: SSA use rewriting.
+  // 4a: outer passthrough -> new_inner_phi (uses of i in the body)
+  nest.outer_passthrough_phi->ReplaceAllUsesWith(new_inner_phi);
+  // 4b: old inner IV phi -> new_passthrough (uses of j in the body)
+  nest.inner_iv_phi->ReplaceAllUsesWith(new_passthrough);
+  // 4c: old outer IV phi -> new_outer_phi
+  nest.outer_iv_phi->ReplaceAllUsesWith(new_outer_phi);
+
+  // ---- Step 5: retarget the compares and the j increment.
+  // inner_cmp: slt j, inner_bound -> slt i, outer_bound
+  saved_inner_cmp->SetOperand(0, new_inner_phi);
+  saved_inner_cmp->SetOperand(1, saved_outer_bound);
+  // outer_cmp: slt i, outer_bound -> slt j, inner_bound
+  saved_outer_cmp->SetOperand(0, new_outer_phi);
+  saved_outer_cmp->SetOperand(1, saved_inner_bound);
+  // 4b redirected j++ to new_passthrough; it has to advance the outer j.
+  for (size_t i = 0; i < nest.inner_inc->GetNumOperands(); ++i) {
+    if (nest.inner_inc->GetOperand(i) == new_passthrough) {
+      nest.inner_inc->SetOperand(i, new_outer_phi);
+      break;
+    }
+  }
+
+  // ---- Step 6: drop the three phis that were replaced above.
+  nest.outer_header->RemoveInstruction(nest.outer_iv_phi);
+  nest.inner_header->RemoveInstruction(nest.inner_iv_phi);
+  nest.inner_header->RemoveInstruction(nest.outer_passthrough_phi);
+
+  if (kDebugLoopInterchange) {
+    std::cerr << "[LoopInterchange] ✅ 循环交换完成\n";
+  }
+}
+
+// ===========================================================================
+// 主入口
+// ===========================================================================
+
+void RunLoopInterchangeOnFunction(Function* func, Context& ctx) {
+  // For every loop FindLoops reports in `func`: detect a 2-D nest, check
+  // legality, check profitability and, if all three pass, rewrite the IR.
+  // Functions with more than 500 blocks are skipped altogether.
+  const int num_blocks = static_cast<int>(func->GetBlocks().size());
+  if (num_blocks > 500) {
+    if (kDebugLoopInterchange)
+      std::cerr << "[LoopInterchange] 跳过函数 " << func->GetName()
+                << "（" << num_blocks << " blocks，过大）\n";
+    return;
+  }
+
+  auto found_loops = FindLoops(func);
+  if (found_loops.empty()) return;
+
+  if (kDebugLoopInterchange)
+    std::cerr << "[LoopInterchange] 函数 " << func->GetName()
+              << " 检测到 " << found_loops.size() << " 个循环\n";
+
+  for (auto& candidate : found_loops) {
+    auto detected = DetectTwoDLoopNest(candidate.get(), func);
+    if (!detected) {
+      // Detector rejected the loop; reported only when the debug flag is set.
+      if (kDebugLoopInterchange)
+        std::cerr << "[LoopInterchange] interchangeable: no\n"
+                  << "[LoopInterchange] reason: not-perfect-nest\n";
+      continue;
+    }
+    auto& nest = *detected;
+
+    // Legality gate.
+    if (!IsLegalToInterchange(nest)) {
+      std::cerr << "[LoopInterchange] interchangeable: no\n"
+                << "[LoopInterchange] reason: dependence-illegal\n";
+      continue;
+    }
+    std::cerr << "[LoopInterchange] interchangeable: yes\n";
+
+    // Profitability gate.
+    if (!IsProfitableToInterchange(nest)) {
+      std::cerr << "[LoopInterchange] profitable: no\n"
+                << "[LoopInterchange] reason: not-worth-it\n";
+      continue;
+    }
+    std::cerr << "[LoopInterchange] profitable: yes\n"
+              << "[LoopInterchange] reason: better-unit-stride\n";
+
+    // Both gates passed: rewrite the IR.
+    PerformInterchange(nest, func, ctx);
+  }
+}
+
+}  // anonymous namespace
+
+void RunLoopInterchange(Module& module) {
+  // Pass entry point: runs on every function for which IsExternal() is false.
+  auto& context = module.GetContext();
+  for (auto& owned : module.GetFunctions()) {
+    auto* fn = owned.get();
+    if (!fn->IsExternal()) RunLoopInterchangeOnFunction(fn, context);
+  }
+}
+
+}  // namespace ir
diff --git a/src/ir/passes/LoopUnroll.cpp b/src/ir/passes/LoopUnroll.cpp
new file mode 100644
index 00000000..132974e7
--- /dev/null
+++ b/src/ir/passes/LoopUnroll.cpp
@@ -0,0 +1,345 @@
+// 简单 countdown 循环全展开：
+// - 处理形如 while(len) { body; len = len - 1; } 的递减循环
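+// - Requires a single-BB body and a positive compile-time constant initial len; size is capped by the kUnrollThreshold cost check (no longer a fixed <= 64)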
+// - 全展开后函数变为单 BB，可被 Inline 内联
+// - 配合 ConstFold 将 len/power 等常量传播到每次迭代
+
+#include "ir/IR.h"
+
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+namespace ir {
+
+namespace {
+
+// Looks for a header phi that counts down from a positive constant to zero in
+// steps of one. Returns that phi and sets trip_count, or returns nullptr.
+static PhiInst* DetectSimpleCountdown(BasicBlock* header, BasicBlock* body,
+                                       BasicBlock* exit_bb, int& trip_count) {
+  bool has_backedge = false;
+  for (const auto& inst : body->GetInstructions()) {
+    if (auto* br = dynamic_cast<BranchInst*>(inst.get()))
+      if (br->GetTarget() == header) has_backedge = true;
+  }
+  if (!has_backedge) return nullptr;
+
+  for (const auto& inst : header->GetInstructions()) {
+    auto* phi = dynamic_cast<PhiInst*>(inst.get());
+    if (!phi) continue;
+    if (phi->GetNumOperands() != 4) continue;  // exactly two (value, block) pairs
+
+    Value* val0 = phi->GetOperand(0);
+    BasicBlock* bb0 = dynamic_cast<BasicBlock*>(phi->GetOperand(1));
+    Value* val1 = phi->GetOperand(2);
+    BasicBlock* bb1 = dynamic_cast<BasicBlock*>(phi->GetOperand(3));
+    Value* init_val = nullptr;
+    Value* update_val = nullptr;
+    if (bb0 != body && bb1 == body)      { init_val = val0; update_val = val1; }
+    else if (bb1 != body && bb0 == body) { init_val = val1; update_val = val0; }
+    else continue;
+
+    auto* init_c = dynamic_cast<ConstantInt*>(init_val);
+    if (!init_c) continue;
+    int count = init_c->GetValue();
+    if (count <= 0) continue;  // the size limit is enforced later by the unroller's cost check
+
+    auto* sub = dynamic_cast<BinaryInst*>(update_val);
+    if (!sub || sub->GetOpcode() != Opcode::Sub) continue;
+    if (sub->GetLhs() != phi) continue;
+    auto* dec = dynamic_cast<ConstantInt*>(sub->GetRhs());
+    if (!dec || dec->GetValue() != 1) continue;
+
+    // The header branch must keep looping exactly while phi is non-zero.
+    bool exits = false;
+    for (const auto& hinst : header->GetInstructions()) {
+      auto* cbr = dynamic_cast<CondBranchInst*>(hinst.get());
+      if (!cbr) continue;
+      Value* cond = cbr->GetCond();
+      if (auto* outer = dynamic_cast<BinaryInst*>(cond)) {
+        if (outer->GetOpcode() == Opcode::Ne) {
+          auto* rc = dynamic_cast<ConstantInt*>(outer->GetRhs());
+          if (rc && rc->GetValue() == 0)
+            if (auto* zext = dynamic_cast<CastInst*>(outer->GetLhs()))
+              if (zext->GetOpcode() == Opcode::ZExt) cond = zext->GetOperandValue();
+        }
+      }
+      auto* cmp = dynamic_cast<BinaryInst*>(cond);
+      if (!cmp) continue;
+      const bool phi_lhs = cmp->GetLhs() == phi;
+      if (!phi_lhs && cmp->GetRhs() != phi) continue;
+      auto* zero = dynamic_cast<ConstantInt*>(phi_lhs ? cmp->GetRhs() : cmp->GetLhs());
+      if (!zero || zero->GetValue() != 0) continue;
+      const Opcode rel = cmp->GetOpcode();
+      // phi != 0, phi > 0 and 0 < phi loop on the true edge; phi == 0 on the false edge
+      const bool stay_on_true = rel == Opcode::Ne || rel == (phi_lhs ? Opcode::Gt : Opcode::Lt);
+      BasicBlock* stay = stay_on_true ? cbr->GetTrueTarget() : cbr->GetFalseTarget();
+      BasicBlock* leave = stay_on_true ? cbr->GetFalseTarget() : cbr->GetTrueTarget();
+      if ((stay_on_true || rel == Opcode::Eq) && stay == body && leave == exit_bb)
+        exits = true;
+    }
+    if (!exits) continue;
+    trip_count = count;
+    return phi;
+  }
+  return nullptr;
+}
+
+// Builds a fresh copy of `inst` whose operands are translated through
+// `value_map` (operands without an entry are reused as they are). Result
+// names come from ctx.NextTemp(). Opcodes not listed below yield nullptr.
+static std::unique_ptr<Instruction> CloneInstruction(
+    Instruction* inst,
+    const std::unordered_map<Value*, Value*>& value_map,
+    Context& ctx) {
+  const auto remap = [&value_map](Value* original) -> Value* {
+    const auto hit = value_map.find(original);
+    return hit == value_map.end() ? original : hit->second;
+  };
+  const Opcode opcode = inst->GetOpcode();
+  switch (opcode) {
+    case Opcode::Add: case Opcode::Sub: case Opcode::Mul:
+    case Opcode::Div: case Opcode::Mod:
+    case Opcode::Eq:  case Opcode::Ne:  case Opcode::Lt:
+    case Opcode::Le:  case Opcode::Gt:  case Opcode::Ge:
+    case Opcode::And: case Opcode::Or: {
+      auto* binary = static_cast<BinaryInst*>(inst);
+      Value* lhs = remap(binary->GetLhs());
+      Value* rhs = remap(binary->GetRhs());
+      return std::make_unique<BinaryInst>(opcode, inst->GetType(), lhs, rhs,
+                                          ctx.NextTemp());
+    }
+    case Opcode::Load: {
+      auto* load = static_cast<LoadInst*>(inst);
+      Value* address = remap(load->GetPtr());
+      return std::make_unique<LoadInst>(inst->GetType(), address, ctx.NextTemp());
+    }
+    case Opcode::Store: {
+      auto* store = static_cast<StoreInst*>(inst);
+      Value* stored = remap(store->GetValue());
+      Value* address = remap(store->GetPtr());
+      return std::make_unique<StoreInst>(inst->GetType(), stored, address);
+    }
+    case Opcode::ZExt: case Opcode::SIToFP: case Opcode::FPToSI: {
+      auto* cast = static_cast<CastInst*>(inst);
+      Value* source = remap(cast->GetOperandValue());
+      return std::make_unique<CastInst>(opcode, inst->GetType(), source,
+                                        ctx.NextTemp());
+    }
+    case Opcode::Br: {
+      auto* br = static_cast<BranchInst*>(inst);
+      return std::make_unique<BranchInst>(inst->GetType(), br->GetTarget());
+    }
+    case Opcode::CondBr: {
+      auto* cbr = static_cast<CondBranchInst*>(inst);
+      return std::make_unique<CondBranchInst>(inst->GetType(), remap(cbr->GetCond()),
+                                              cbr->GetTrueTarget(), cbr->GetFalseTarget());
+    }
+    default: return nullptr;
+  }
+}
+
+// Fully unrolls the countdown loop {header, body} into straight-line code and
+// folds the returning exit block into it. Returns false, leaving the function
+// untouched, whenever the loop does not have the shape handled here.
+static bool UnrollSimple(Function* func, BasicBlock* header, BasicBlock* body,
+                          BasicBlock* exit_bb, PhiInst* phi, int trip_count,
+                          Context& ctx) {
+  auto& fb = const_cast<std::vector<std::unique_ptr<BasicBlock>>&>(func->GetBlocks());
+
+  // Body without its back edge. Only opcodes CloneInstruction can copy are
+  // accepted; anything else would silently vanish from the unrolled code.
+  std::vector<Instruction*> body_insts;
+  for (const auto& inst : body->GetInstructions()) {
+    if (auto* br = dynamic_cast<BranchInst*>(inst.get()))
+      if (br->GetTarget() == header) continue;
+    switch (inst->GetOpcode()) {
+      case Opcode::Add: case Opcode::Sub: case Opcode::Mul:
+      case Opcode::Div: case Opcode::Mod:
+      case Opcode::Eq:  case Opcode::Ne:  case Opcode::Lt:
+      case Opcode::Le:  case Opcode::Gt:  case Opcode::Ge:
+      case Opcode::And: case Opcode::Or:
+      case Opcode::Load: case Opcode::Store:
+      case Opcode::ZExt: case Opcode::SIToFP: case Opcode::FPToSI:
+        break;
+      default:
+        return false;  // also rejects extra branches: body must be one block
+    }
+    body_insts.push_back(inst.get());
+  }
+
+  // Cost model: UnrolledSize = (BodySize - 1) * TripCount + 1, evaluated in
+  // 64 bits so that a huge trip count cannot wrap around below the threshold.
+  constexpr long long kUnrollThreshold = 150;
+  const long long body_size = static_cast<long long>(body_insts.size());
+  const long long unrolled_cost =
+      (body_size > 0 ? body_size - 1 : 0) * trip_count + 1;
+  if (unrolled_cost > kUnrollThreshold) return false;
+
+  // header needs exactly one entry block besides body, and exit_bb no
+  // predecessor besides header, because exit_bb is erased at the end.
+  BasicBlock* preheader = nullptr;
+  for (const auto& bb : func->GetBlocks()) {
+    if (bb.get() == body) continue;
+    bool enters = false, leaves = false;
+    for (const auto& inst : bb->GetInstructions()) {
+      if (auto* br = dynamic_cast<BranchInst*>(inst.get())) {
+        enters |= br->GetTarget() == header;
+        leaves |= br->GetTarget() == exit_bb;
+      } else if (auto* cbr = dynamic_cast<CondBranchInst*>(inst.get())) {
+        enters |= cbr->GetTrueTarget() == header || cbr->GetFalseTarget() == header;
+        leaves |= cbr->GetTrueTarget() == exit_bb || cbr->GetFalseTarget() == exit_bb;
+      }
+    }
+    if (leaves && bb.get() != header) return false;
+    if (!enters) continue;
+    if (preheader) return false;
+    preheader = bb.get();
+  }
+  if (!preheader) return false;
+
+  // exit_bb is folded into the unrolled code: it must be phi-free and end in a return.
+  const auto& exit_list = exit_bb->GetInstructions();
+  if (exit_list.empty() || !dynamic_cast<ReturnInst*>(exit_list.back().get()))
+    return false;
+  for (const auto& inst : exit_list)
+    if (dynamic_cast<PhiInst*>(inst.get())) return false;
+
+  // Entry value and back-edge value of every header phi.
+  struct PhiInfo { Value* init_val; Value* latch_val; };
+  std::unordered_map<PhiInst*, PhiInfo> phi_info;
+  for (const auto& inst : header->GetInstructions()) {
+    auto* hphi = dynamic_cast<PhiInst*>(inst.get());
+    if (!hphi) break;
+    if (hphi->GetNumOperands() != 4) return false;
+    Value *v0 = hphi->GetOperand(0), *v1 = hphi->GetOperand(2);
+    BasicBlock *bb0 = dynamic_cast<BasicBlock*>(hphi->GetOperand(1));
+    BasicBlock *bb1 = dynamic_cast<BasicBlock*>(hphi->GetOperand(3));
+    if (bb0 != body && bb1 == body)
+      phi_info[hphi] = {v0, v1};
+    else if (bb1 != body && bb0 == body)
+      phi_info[hphi] = {v1, v0};
+    else
+      return false;  // unknown phi shape: its uses could not be rewritten
+  }
+  const auto counter = phi_info.find(phi);
+  if (counter == phi_info.end()) return false;
+  Value* const countdown_step = counter->second.latch_val;  // the `phi - 1`
+
+  // Value of each header phi at the start of the iteration being cloned.
+  std::unordered_map<PhiInst*, Value*> curr_vals;
+  for (auto& [hphi, info] : phi_info)
+    curr_vals[hphi] = info.init_val;
+  // Clone every iteration into one block (keeps the function inlinable).
+  auto unrolled_bb = std::make_unique<BasicBlock>(ctx.NextTemp() + "_unroll");
+  for (int iter = 0; iter < trip_count; ++iter) {
+    std::unordered_map<Value*, Value*> vm;
+    for (auto& [hphi, val] : curr_vals)
+      vm[hphi] = val;
+    // The counter is a known constant in every iteration, and so is `phi - 1`.
+    vm[phi] = ctx.GetConstInt(trip_count - iter);
+    vm[countdown_step] = ctx.GetConstInt(trip_count - iter - 1);
+    for (auto* inst : body_insts) {
+      if (inst == countdown_step) continue;  // replaced by the constant above
+      auto cloned = CloneInstruction(inst, vm, ctx);
+      if (!cloned) return false;  // func itself has not been modified yet
+      vm[inst] = cloned.get();
+      unrolled_bb->InsertInstructionBeforeTerminator(std::move(cloned));
+    }
+    // Next-iteration phi values; a latch value that was not cloned is carried over as is.
+    for (auto& [hphi, info] : phi_info) {
+      if (hphi == phi) continue;
+      auto it = vm.find(info.latch_val);
+      curr_vals[hphi] = (it != vm.end()) ? it->second : info.latch_val;
+    }
+  }
+  curr_vals[phi] = ctx.GetConstInt(0);  // the counter is zero once the loop ends
+
+  // Move the whole exit block (its computations and the ret) after the clones.
+  std::vector<Instruction*> exit_insts;
+  for (const auto& inst : exit_list) exit_insts.push_back(inst.get());
+  for (auto* inst : exit_insts)
+    unrolled_bb->InsertInstructionBeforeTerminator(exit_bb->TakeInstruction(inst));
+
+  // Remaining uses of header phis (e.g. in the exit code) get the final values.
+  for (auto& [hphi, val] : curr_vals)
+    hphi->ReplaceAllUsesWith(val);
+
+  const bool merge_into_preheader =
+      preheader->GetInstructions().size() == 1 &&
+      dynamic_cast<BranchInst*>(preheader->GetInstructions().back().get());
+  if (merge_into_preheader) {
+    // preheader is nothing but its jump: replace the jump by the unrolled code.
+    preheader->TakeInstruction(preheader->GetInstructions().back().get());
+    std::vector<Instruction*> to_move;
+    for (const auto& inst : unrolled_bb->GetInstructions()) to_move.push_back(inst.get());
+    for (auto* inst : to_move)
+      preheader->InsertInstructionBeforeTerminator(unrolled_bb->TakeInstruction(inst));
+  } else {
+    // Otherwise retarget preheader's terminator from header to the new block.
+    auto* term = preheader->GetInstructions().back().get();
+    if (auto* br = dynamic_cast<BranchInst*>(term))
+      br->SetOperand(0, unrolled_bb.get());
+    else if (auto* cbr = dynamic_cast<CondBranchInst*>(term)) {
+      if (cbr->GetTrueTarget() == header) cbr->SetOperand(1, unrolled_bb.get());
+      if (cbr->GetFalseTarget() == header) cbr->SetOperand(2, unrolled_bb.get());
+    }
+  }
+
+  // Place the new block right after preheader, then drop the old loop blocks.
+  if (!merge_into_preheader) {
+    auto ipos = fb.begin();
+    for (auto it = fb.begin(); it != fb.end(); ++it)
+      if (it->get() == preheader) { ipos = it + 1; break; }
+    fb.insert(ipos, std::move(unrolled_bb));
+  }
+  fb.erase(std::remove_if(fb.begin(), fb.end(),
+                           [&](const std::unique_ptr<BasicBlock>& bb) {
+                             return bb.get() == header || bb.get() == body ||
+                                    bb.get() == exit_bb;
+                           }), fb.end());
+  return true;
+}
+
+}  // namespace
+
+void RunLoopUnroll(Module& module) {  // Pass entry: unrolls countdown loops found in the module's functions
+  int unrolled = 0;  // number of successful unrolls; only incremented, never read in this function
+  for (auto& func : module.GetFunctions()) {
+    if (func->IsExternal()) continue;
+    // Only functions with an i32 return value are handled (original note: float loop bodies contain operations the cloner does not support)
+    if (!func->GetType()->IsInt32()) continue;
+    bool changed = true;
+    while (changed) {  // rescan the function from the start after every successful unroll
+      changed = false;
+      for (const auto& bb : func->GetBlocks()) {
+        for (const auto& inst : bb->GetInstructions()) {
+          auto* br = dynamic_cast<BranchInst*>(inst.get());
+          if (!br) continue;
+          BasicBlock* target = br->GetTarget();  // candidate loop header: the block bb jumps to
+          for (const auto& tgt_inst : target->GetInstructions()) {
+            auto* cbr = dynamic_cast<CondBranchInst*>(tgt_inst.get());
+            if (!cbr) continue;
+            BasicBlock *t = cbr->GetTrueTarget(), *f = cbr->GetFalseTarget();
+            BasicBlock *body = nullptr, *exit_bb = nullptr;
+            if (t == bb.get()) { body = t; exit_bb = f; }  // bb is the body when the header branches back to it
+            else if (f == bb.get()) { body = f; exit_bb = t; }
+            if (!body || !exit_bb || body == target || exit_bb == target) continue;
+
+            int tc = 0;  // trip count, filled in by DetectSimpleCountdown
+            auto* phi = DetectSimpleCountdown(target, body, exit_bb, tc);
+            if (!phi) continue;
+            if (UnrollSimple(func.get(), target, body, exit_bb, phi, tc,
+                            module.GetContext())) {
+              ++unrolled; changed = true; goto next_func;  // leave all range-for loops at once and rescan
+            }
+          }
+        }
+      }
+      next_func:;  // landing point of the goto above; the while loop then re-tests `changed`
+    }
+  }
+}
+
+}  // namespace ir
diff --git a/src/ir/passes/LoopVectorize.cpp b/src/ir/passes/LoopVectorize.cpp
new file mode 100644
index 00000000..e7a47b4e
--- /dev/null
+++ b/src/ir/passes/LoopVectorize.cpp
@@ -0,0 +1,795 @@
+// LoopVectorize：自动向量化 pass
+// - 检测单 BB 计数循环（while(i<n){... i=i+1;}）
+// - 分析 stride-1 数组访问 + 无交叉迭代依赖
+// - 生成 <4 x i32> 向量化循环 + 标量残余循环
+// - 向量化因子 VF=4，仅支持 i32 数组操作
+
+#include "ir/IR.h"
+
+#include <algorithm>
+#include <functional>
+#include <stdexcept>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace ir {
+
+namespace {
+
+// 向量化配置
+constexpr int kVF = 4;  // 向量化因子：4×i32 = 128-bit NEON
+
+// BFS: is `target` reachable from `start` without passing `exclude`? Gives up (false) after max_depth levels.
+static bool CanReach(BasicBlock* start, BasicBlock* target, BasicBlock* exclude,
+                     std::unordered_map<BasicBlock*, std::vector<BasicBlock*>>& succs,
+                     int max_depth = 20) {
+  if (start == target) return true;
+  std::unordered_set<BasicBlock*> visited{start, exclude};
+  std::vector<BasicBlock*> frontier{start};
+  for (int depth = 0; !frontier.empty() && depth < max_depth; ++depth) {
+    std::vector<BasicBlock*> next;
+    for (auto* bb : frontier) {
+      const auto entry = succs.find(bb);  // find(), not operator[]: a lookup must not add empty entries
+      if (entry == succs.end()) continue;
+      for (auto* succ : entry->second) {
+        if (succ == target) return true;
+        if (visited.insert(succ).second) next.push_back(succ);
+      }
+    }
+    frontier = std::move(next);
+  }
+  return false;
+}
+
+// Maps each block of `func` to the targets of the branch instructions it contains.
+static std::unordered_map<BasicBlock*, std::vector<BasicBlock*>>
+ComputeSuccessors(Function* func) {
+  std::unordered_map<BasicBlock*, std::vector<BasicBlock*>> result;
+  for (auto& block : func->GetBlocks()) {
+    for (auto& inst : block->GetInstructions()) {
+      auto* jump = dynamic_cast<BranchInst*>(inst.get());
+      auto* fork = dynamic_cast<CondBranchInst*>(inst.get());
+      if (!jump && !fork) continue;
+      auto& targets = result[block.get()];
+      if (jump) targets.push_back(jump->GetTarget());
+      else { targets.push_back(fork->GetTrueTarget()); targets.push_back(fork->GetFalseTarget()); }
+    }
+  }
+  return result;
+}
+
+// Detects a simple counted loop: header holds phi(init, latch_val), a compare
+// `phi < bound` (or `<=`) and the conditional branch that enters body.
+static PhiInst* DetectCountedLoop(BasicBlock* header, BasicBlock* body,
+                                  BasicBlock* exit_bb,
+                                  Value*& trip_count_val, Value*& step_val,
+                                  std::unordered_map<BasicBlock*, std::vector<BasicBlock*>>& succs) {
+  // Find i = phi(init, i.next): one incoming block outside the loop, one inside.
+  PhiInst* ind_phi = nullptr;
+  BasicBlock* latch_bb = nullptr;  // in-loop block that reaches header
+
+  for (const auto& inst : header->GetInstructions()) {
+    auto* phi = dynamic_cast<PhiInst*>(inst.get());
+    if (!phi) break;
+    // exactly two (value, block) pairs; the code below reads no further pairs
+    if (phi->GetNumOperands() != 4) continue;
+
+    Value* v0 = phi->GetOperand(0);
+    BasicBlock* bb0 = dynamic_cast<BasicBlock*>(phi->GetOperand(1));
+    Value* v1 = phi->GetOperand(2);
+    BasicBlock* bb1 = dynamic_cast<BasicBlock*>(phi->GetOperand(3));
+
+    // An incoming block is in the loop if it is body or reachable from body
+    // without passing through header.
+    bool bb0_in_loop = (bb0 == body) || CanReach(body, bb0, header, succs);
+    bool bb1_in_loop = (bb1 == body) || CanReach(body, bb1, header, succs);
+
+    if (!bb0_in_loop && bb1_in_loop) {
+      if (v1 == phi) continue;  // self-referencing passthrough, not an IV
+      ind_phi = phi; latch_bb = bb1;
+    } else if (bb0_in_loop && !bb1_in_loop) {
+      if (v0 == phi) continue;  // self-referencing passthrough, not an IV
+      ind_phi = phi; latch_bb = bb0;
+    }
+    if (ind_phi) break;
+  }
+  if (!ind_phi) return nullptr;
+
+  // Value the phi receives over the back edge.
+  Value* latch_val = nullptr;
+  Value* v0 = ind_phi->GetOperand(0);
+  BasicBlock* bb0 = dynamic_cast<BasicBlock*>(ind_phi->GetOperand(1));
+  if (bb0 == latch_bb) latch_val = v0;
+  else latch_val = ind_phi->GetOperand(2);
+
+  // It has to be `phi + step` (either operand order).
+  auto* increment = dynamic_cast<BinaryInst*>(latch_val);
+  if (!increment || increment->GetOpcode() != Opcode::Add) return nullptr;
+
+  Value* step = nullptr;
+  if (increment->GetLhs() == ind_phi) step = increment->GetRhs();
+  else if (increment->GetRhs() == ind_phi) step = increment->GetLhs();
+  else return nullptr;
+  step_val = step;
+
+  // Exit test: the branch must enter body while `phi < bound` holds (the
+  // condition may be wrapped as ne(zext(cmp), 0)).
+  for (const auto& inst : header->GetInstructions()) {
+    auto* cbr = dynamic_cast<CondBranchInst*>(inst.get());
+    if (!cbr) continue;
+    // The loop has to continue on the TRUE edge; with body on the false edge
+    // the compare would describe the exit condition instead.
+    if (cbr->GetTrueTarget() != body) continue;
+
+    Value* cond_val = cbr->GetCond();
+    if (auto* outer = dynamic_cast<BinaryInst*>(cond_val)) {
+      if (outer->GetOpcode() == Opcode::Ne) {
+        auto* rc = dynamic_cast<ConstantInt*>(outer->GetRhs());
+        if (rc && rc->GetValue() == 0)
+          if (auto* zext = dynamic_cast<CastInst*>(outer->GetLhs()))
+            if (zext->GetOpcode() == Opcode::ZExt)
+              cond_val = zext->GetOperandValue();
+      }
+    }
+    auto* cmp = dynamic_cast<BinaryInst*>(cond_val);
+    if (!cmp) continue;
+    if (cmp->GetOpcode() != Opcode::Lt && cmp->GetOpcode() != Opcode::Le) continue;
+
+    // Only `phi < bound` bounds the loop from above; `bound < phi` with the
+    // phi on the right is a different loop and must not be matched.
+    if (cmp->GetLhs() != ind_phi) continue;
+
+    trip_count_val = cmp->GetRhs();
+    return ind_phi;
+  }
+
+  return nullptr;
+}
+
+// Says whether a loop-body instruction is on the vectorizer's opcode whitelist:
+// add/sub/mul, load/store, GEP and the unconditional branch.
+// Every other opcode is rejected, Div and Mod included.
+static bool IsVectorizableInst(Instruction* inst) {
+  const Opcode opcode = inst->GetOpcode();
+  // arithmetic
+  const bool arithmetic = opcode == Opcode::Add || opcode == Opcode::Sub ||
+                          opcode == Opcode::Mul;
+  // memory access and address computation
+  const bool memory = opcode == Opcode::Load || opcode == Opcode::Store ||
+                      opcode == Opcode::GEP;
+  // the unconditional jump
+  const bool jump = opcode == Opcode::Br;
+  return arithmetic || memory || jump;
+}
+
+// Worker for IsLoopInvariant. `visiting` holds the values on the current recursion
+// path; meeting one again is a cycle through the loop and counts as variant.
+static bool IsLoopInvariantImpl(Value* val, BasicBlock* header, BasicBlock* body,
+                                PhiInst* ind_phi,
+                                std::unordered_set<Value*>& visiting) {
+  if (dynamic_cast<ConstantInt*>(val) || dynamic_cast<ConstantFloat*>(val) ||
+      dynamic_cast<GlobalVariable*>(val) || dynamic_cast<Argument*>(val))
+    return true;
+  if (val == ind_phi) return false;
+  auto* inst = dynamic_cast<Instruction*>(val);
+  if (!inst) return false;
+  BasicBlock* parent = inst->GetParent();
+  if (parent != header && parent != body) return true;  // defined outside the loop
+  if (!visiting.insert(val).second) return false;        // cycle: loop-carried
+  // Phi operands are (value, block) pairs; a phi naming itself is a passthrough and is skipped.
+  const bool is_phi = dynamic_cast<PhiInst*>(inst) != nullptr;
+  bool invariant = true;
+  for (size_t i = 0; invariant && i < inst->GetNumOperands(); i += is_phi ? 2 : 1) {
+    Value* op = inst->GetOperand(i);
+    if (is_phi && op == inst) continue;
+    invariant = IsLoopInvariantImpl(op, header, body, ind_phi, visiting);
+  }
+  visiting.erase(val);
+  return invariant;
+}
+
+// True if `val` does not depend on ind_phi: constants, globals, arguments, values defined outside the loop, or in-loop values whose operands all qualify.
+static bool IsLoopInvariant(Value* val, BasicBlock* header, BasicBlock* body,
+                            PhiInst* ind_phi) {
+  std::unordered_set<Value*> visiting;
+  return IsLoopInvariantImpl(val, header, body, ind_phi, visiting);
+}
+
+// Decides whether the loop {header, body} with induction phi ind_phi may be vectorized.
+static bool CanVectorizeLoop(BasicBlock* header, BasicBlock* body,
+                             BasicBlock* /*exit_bb*/, PhiInst* ind_phi) {
+  // Header phis other than the induction phi must not carry values across iterations.
+  // Self-referencing passthrough phis (as produced by Loop Interchange) are allowed; accumulators are not.
+  for (const auto& inst : header->GetInstructions()) {
+    auto* phi = dynamic_cast<PhiInst*>(inst.get());
+    if (!phi) break;
+    if (phi == ind_phi) continue;
+    // Passthrough = the value arriving from body is the phi itself.
+    bool is_passthrough = false;
+    for (size_t i = 0; i < phi->GetNumOperands(); i += 2) {
+      auto* bb = dynamic_cast<BasicBlock*>(phi->GetOperand(i + 1));
+      if (bb == body && phi->GetOperand(i) == phi) {
+        is_passthrough = true;
+        break;
+      }
+    }
+    if (!is_passthrough) {
+      // An extra phi that is no passthrough may be a reduction accumulator: reject.
+      return false;
+    }
+  }
+
+  // Check every instruction of body.
+  bool has_load = false;
+  bool has_store = false;
+  for (const auto& inst : body->GetInstructions()) {
+    auto* br = dynamic_cast<BranchInst*>(inst.get());
+    if (br) continue;  // terminator
+
+    if (!IsVectorizableInst(inst.get())) return false;
+
+    if (dynamic_cast<LoadInst*>(inst.get())) has_load = true;
+    if (dynamic_cast<StoreInst*>(inst.get())) has_store = true;
+
+    // A GEP index must be the induction variable (stride 1) or loop-invariant.
+    if (auto* gep = dynamic_cast<GetElementPtrInst*>(inst.get())) {
+      Value* idx = gep->GetIndex();
+      if (idx == ind_phi) continue;  // stride = 1, OK
+      if (IsLoopInvariant(idx, header, body, ind_phi)) continue;  // invariant base, OK
+      // Accept ind_phi + inv, ind_phi - inv and inv + ind_phi; `inv - ind_phi` walks backwards (stride -1) and is rejected.
+      if (auto* bin = dynamic_cast<BinaryInst*>(idx)) {
+        const bool is_add = bin->GetOpcode() == Opcode::Add;
+        if ((is_add || bin->GetOpcode() == Opcode::Sub) && bin->GetLhs() == ind_phi &&
+            IsLoopInvariant(bin->GetRhs(), header, body, ind_phi))
+          continue;
+        if (is_add && bin->GetRhs() == ind_phi && IsLoopInvariant(bin->GetLhs(), header, body, ind_phi))
+          continue;
+      }
+      return false;  // complex index
+    }
+  }
+
+  // Mixed load+store loops (original note): the MIR layer supports LdrQ/StrQ, the vector computation yields <4 x i32> and is stored directly.
+  // Store-only loops need one more check: a stored value that is neither the induction variable nor constant/invariant
+  //   is rejected, because CloneAsVector cannot turn a scalar expression into a vector (no VectorSplat).
+  if (!has_load && has_store) {
+    for (const auto& inst : body->GetInstructions()) {
+      auto* store = dynamic_cast<StoreInst*>(inst.get());
+      if (!store) continue;
+      Value* sv = store->GetValue();
+      // The induction variable itself is accepted.
+      if (sv == ind_phi) continue;
+      // A constant: the same value goes to all lanes.
+      if (dynamic_cast<ConstantInt*>(sv)) continue;
+      // A loop-invariant value: the same value goes to all lanes.
+      if (IsLoopInvariant(sv, header, body, ind_phi)) continue;
+      // Any other expression (e.g. constant + ind_phi) cannot be vectorized correctly: reject.
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Clones `inst` for the vector loop body, translating operands through
+// `value_map`. Loads always produce <kVF x i32>; add/sub/mul become vector
+// operations only when both mapped operands are vectors. Other opcodes: nullptr.
+static std::unique_ptr<Instruction> CloneAsVector(
+    Instruction* inst,
+    const std::unordered_map<Value*, Value*>& value_map,
+    Context& ctx) {
+  const auto remap = [&value_map](Value* original) -> Value* {
+    const auto hit = value_map.find(original);
+    return hit == value_map.end() ? original : hit->second;
+  };
+  const auto is_vector = [](Value* v) {
+    return v->GetType() && v->GetType()->IsVector();
+  };
+
+  const Opcode opcode = inst->GetOpcode();
+  if (opcode == Opcode::Add || opcode == Opcode::Sub || opcode == Opcode::Mul) {
+    auto* binary = static_cast<BinaryInst*>(inst);
+    Value* lhs = remap(binary->GetLhs());
+    Value* rhs = remap(binary->GetRhs());
+    // Scalar operands stay scalar (e.g. dst_pos + i feeding a GEP index).
+    if (is_vector(lhs) && is_vector(rhs)) {
+      auto vec_ty = Type::GetVector(Type::GetInt32Type(), kVF);
+      return std::make_unique<BinaryInst>(opcode, vec_ty, lhs, rhs, ctx.NextTemp());
+    }
+    return std::make_unique<BinaryInst>(opcode, binary->GetType(), lhs, rhs, ctx.NextTemp());
+  }
+  // Every load is widened to a vector load.
+  if (opcode == Opcode::Load) {
+    auto* load = static_cast<LoadInst*>(inst);
+    auto vec_ty = Type::GetVector(Type::GetInt32Type(), kVF);
+    return std::make_unique<LoadInst>(vec_ty, remap(load->GetPtr()), ctx.NextTemp());
+  }
+  if (opcode == Opcode::Store) {
+    auto* store = static_cast<StoreInst*>(inst);
+    return std::make_unique<StoreInst>(Type::GetVoidType(),
+                                        remap(store->GetValue()),
+                                        remap(store->GetPtr()));
+  }
+  if (opcode == Opcode::GEP) {
+    auto* gep = static_cast<GetElementPtrInst*>(inst);
+    return std::make_unique<GetElementPtrInst>(gep->GetType(),
+                                                remap(gep->GetBasePtr()),
+                                                remap(gep->GetIndex()),
+                                                ctx.NextTemp());
+  }
+  return nullptr;  // opcode not supported in the vector body
+}
+
+// Scalar clone used for the remainder loop: operands are translated through
+// `value_map`, result names come from ctx. Unlisted opcodes yield nullptr.
+static std::unique_ptr<Instruction> CloneScalar(
+    Instruction* inst,
+    const std::unordered_map<Value*, Value*>& value_map,
+    Context& ctx) {
+  const auto remap = [&value_map](Value* original) -> Value* {
+    const auto hit = value_map.find(original);
+    return hit == value_map.end() ? original : hit->second;
+  };
+  const Opcode opcode = inst->GetOpcode();
+  switch (opcode) {
+    case Opcode::Add: case Opcode::Sub: case Opcode::Mul:
+    case Opcode::Div: case Opcode::Mod:
+    case Opcode::Eq:  case Opcode::Ne:  case Opcode::Lt:
+    case Opcode::Le:  case Opcode::Gt:  case Opcode::Ge: {
+      auto* binary = static_cast<BinaryInst*>(inst);
+      Value* lhs = remap(binary->GetLhs());
+      Value* rhs = remap(binary->GetRhs());
+      return std::make_unique<BinaryInst>(opcode, inst->GetType(), lhs, rhs,
+                                          ctx.NextTemp());
+    }
+    case Opcode::Load: {
+      auto* load = static_cast<LoadInst*>(inst);
+      Value* address = remap(load->GetPtr());
+      return std::make_unique<LoadInst>(inst->GetType(), address, ctx.NextTemp());
+    }
+    case Opcode::Store: {
+      auto* store = static_cast<StoreInst*>(inst);
+      Value* stored = remap(store->GetValue());
+      Value* address = remap(store->GetPtr());
+      return std::make_unique<StoreInst>(Type::GetVoidType(), stored, address);
+    }
+    case Opcode::GEP: {
+      auto* gep = static_cast<GetElementPtrInst*>(inst);
+      Value* base = remap(gep->GetBasePtr());
+      Value* index = remap(gep->GetIndex());
+      return std::make_unique<GetElementPtrInst>(gep->GetType(), base, index,
+                                                  ctx.NextTemp());
+    }
+    case Opcode::ZExt: {
+      auto* cast = static_cast<CastInst*>(inst);
+      Value* source = remap(cast->GetOperandValue());
+      return std::make_unique<CastInst>(opcode, inst->GetType(), source, ctx.NextTemp());
+    }
+    default: return nullptr;
+  }
+}
+
+// Vectorizes one counted loop made of `header` and a single `body` block into
+//   preheader -> vec_header <-> vec_body, then scalar_header <-> scalar_body -> exit_bb
+// The vector loop advances the induction variable by kVF per trip; the scalar
+// loop then runs the remaining iterations one at a time.
+//   header, body    loop header and the block that branches back to it
+//   exit_bb         block entered once the loop condition fails
+//   ind_phi         induction phi; its two (value, block) pairs are read
+//   trip_count_val  exclusive bound: both new headers test `iv < bound`
+// Returns true when the loop was rewritten. On false no block or edge has
+// changed, though unused instructions may remain in the preheader.
+static bool VectorizeLoop(Function* func, BasicBlock* header, BasicBlock* body,
+                          BasicBlock* exit_bb, PhiInst* ind_phi,
+                          Value* trip_count_val, Context& ctx) {
+  auto& fb = const_cast<std::vector<std::unique_ptr<BasicBlock>>&>(func->GetBlocks());
+
+  // Collect the body instructions (the unconditional terminator is skipped).
+  std::vector<Instruction*> body_insts;
+  for (const auto& inst : body->GetInstructions()) {
+    if (dynamic_cast<BranchInst*>(inst.get())) continue;
+    // The store lowering below rebuilds addresses from a GEP's base and index,
+    // so a store through any other kind of pointer cannot be handled.
+    if (auto* st = dynamic_cast<StoreInst*>(inst.get()))
+      if (!dynamic_cast<GetElementPtrInst*>(st->GetPtr())) return false;
+    body_insts.push_back(inst.get());
+  }
+
+  // Find the preheader: the first block, other than the latch `body`, that
+  // branches to the header (the latch's back edge is not an entry edge).
+  BasicBlock* preheader = nullptr;
+  for (const auto& bb : func->GetBlocks()) {
+    if (bb.get() == body) continue;
+    for (const auto& inst : bb->GetInstructions()) {
+      if (auto* br = dynamic_cast<BranchInst*>(inst.get()))
+        if (br->GetTarget() == header) { preheader = bb.get(); break; }
+      if (auto* cbr = dynamic_cast<CondBranchInst*>(inst.get()))
+        if (cbr->GetTrueTarget() == header || cbr->GetFalseTarget() == header)
+        { preheader = bb.get(); break; }
+    }
+    if (preheader) break;
+  }
+  if (!preheader) return false;
+
+  // Split the induction phi into its entry value and its latch update. Only
+  // the update itself (e.g. i = i + 1) is skipped when cloning; other users of
+  // the induction variable (e.g. dst_pos + i) are cloned normally.
+  if (ind_phi->GetNumOperands() < 4) return false;
+  Value* init_val = nullptr;
+  Instruction* latch_update = nullptr;
+  {
+    Value* v0 = ind_phi->GetOperand(0);
+    BasicBlock* bb0 = dynamic_cast<BasicBlock*>(ind_phi->GetOperand(1));
+    Value* v1 = ind_phi->GetOperand(2);
+    BasicBlock* bb1 = dynamic_cast<BasicBlock*>(ind_phi->GetOperand(3));
+    if (bb0 != body && bb1 == body) { init_val = v0; latch_update = dynamic_cast<Instruction*>(v1); }
+    else if (bb1 != body && bb0 == body) { init_val = v1; latch_update = dynamic_cast<Instruction*>(v0); }
+  }
+  if (!init_val) return false;
+  if (!init_val->GetType() || !init_val->GetType()->IsInt32()) return false;
+
+  // The bound feeds instructions placed in the preheader, so it must not be
+  // computed inside the loop.
+  if (auto* tc = dynamic_cast<Instruction*>(trip_count_val))
+    if (tc->GetParent() == header || tc->GetParent() == body) return false;
+
+  // Vector-loop bound, computed in the preheader:
+  //   n_rounded = n - ((n - init) % VF)
+  // Taking the remainder of the distance from `init` (not of `n` itself) keeps
+  // the last vector trip inside [init, n) when init is not a multiple of VF.
+  // Example: init = 1, n = 8, VF = 4 gives 5, so only i = 1 runs vectorized.
+  Value* span = trip_count_val;
+  auto* init_c = dynamic_cast<ConstantInt*>(init_val);
+  if (!init_c || init_c->GetValue() != 0) {
+    auto span_inst = std::make_unique<BinaryInst>(Opcode::Sub, Type::GetInt32Type(),
+                                                  trip_count_val, init_val, ctx.NextTemp());
+    span = span_inst.get();
+    preheader->InsertInstructionBeforeTerminator(std::move(span_inst));
+  }
+  auto n_mod_inst = std::make_unique<BinaryInst>(Opcode::Mod, Type::GetInt32Type(),
+                                                 span, ctx.GetConstInt(kVF), ctx.NextTemp());
+  auto* n_mod = n_mod_inst.get();
+  preheader->InsertInstructionBeforeTerminator(std::move(n_mod_inst));
+  auto n_sub_inst = std::make_unique<BinaryInst>(Opcode::Sub, Type::GetInt32Type(),
+                                                 trip_count_val, n_mod, ctx.NextTemp());
+  auto* n_sub = n_sub_inst.get();
+  preheader->InsertInstructionBeforeTerminator(std::move(n_sub_inst));
+
+  // === Build the vector loop ===
+  auto vec_header = std::make_unique<BasicBlock>(ctx.NextTemp() + "_vech");
+  auto vec_body = std::make_unique<BasicBlock>(ctx.NextTemp() + "_vecb");
+
+  // Induction phi of the vector loop. The back-edge value is a placeholder
+  // until the step instruction exists (SetOperand(2, ...) below).
+  auto* vec_phi = vec_header->Append<PhiInst>(Type::GetInt32Type(), ctx.NextTemp());
+  vec_phi->AddOperand(init_val);
+  vec_phi->AddOperand(preheader);
+  vec_phi->AddOperand(init_val);
+  vec_phi->AddOperand(vec_body.get());
+
+  // If the rewrite is abandoned the blocks built here are destroyed; their operand
+  // uses are unregistered first so no use-list keeps pointing at a freed instruction.
+  auto drop_uses = [](BasicBlock* bb) {
+    for (auto& inst : bb->GetInstructions())
+      for (size_t i = 0; i < inst->GetNumOperands(); ++i)
+        if (auto* op = inst->GetOperand(i)) op->RemoveUse(inst.get(), i);
+  };
+
+  // Populate vec_body.
+  std::unordered_map<Value*, Value*> vec_vm;
+  vec_vm[ind_phi] = vec_phi;
+  for (auto* orig_inst : body_insts) {
+    if (orig_inst == latch_update) continue;
+
+    // GEPs are cloned like any other instruction, which records their mapping.
+    // Stores are lowered by hand on top of the mapped GEP.
+    if (auto* store = dynamic_cast<StoreInst*>(orig_inst)) {
+      Value* stored_val = store->GetValue();
+      auto vm_it = vec_vm.find(stored_val);
+      if (vm_it != vec_vm.end()) stored_val = vm_it->second;
+
+      Value* mapped_ptr = store->GetPtr();
+      auto ptr_it = vec_vm.find(mapped_ptr);
+      if (ptr_it != vec_vm.end()) mapped_ptr = ptr_it->second;
+
+      // A value that is already a vector is written with one vector store.
+      // A scalar value is expanded into kVF scalar stores: the induction
+      // variable gets the lane offset added, anything else is stored as is.
+      bool stored_is_vector = stored_val && stored_val->GetType() && stored_val->GetType()->IsVector();
+      if (!stored_is_vector) {
+        bool stored_is_indvar = (stored_val == vec_phi);
+        Value* base_ptr = nullptr;
+        Value* orig_idx = vec_phi;
+        if (auto* mapped_gep = dynamic_cast<GetElementPtrInst*>(mapped_ptr)) {
+          base_ptr = mapped_gep->GetBasePtr();
+          orig_idx = mapped_gep->GetIndex();
+        }
+        for (int off = 0; off < kVF; off++) {
+          Value* gep_idx = orig_idx;
+          if (off > 0) {
+            auto off_inst = std::make_unique<BinaryInst>(Opcode::Add, Type::GetInt32Type(),
+                                                         orig_idx, ctx.GetConstInt(off), ctx.NextTemp());
+            gep_idx = off_inst.get();
+            vec_body->InsertInstructionBeforeTerminator(std::move(off_inst));
+          }
+          auto gep = std::make_unique<GetElementPtrInst>(Type::GetPtrInt32Type(),
+                                                         base_ptr, gep_idx, ctx.NextTemp());
+          auto* gep_ptr = gep.get();
+          vec_body->InsertInstructionBeforeTerminator(std::move(gep));
+
+          Value* elem_val = stored_val;
+          if (stored_is_indvar && off > 0) {
+            auto off_val = std::make_unique<BinaryInst>(Opcode::Add, Type::GetInt32Type(),
+                                                        stored_val, ctx.GetConstInt(off), ctx.NextTemp());
+            elem_val = off_val.get();
+            vec_body->InsertInstructionBeforeTerminator(std::move(off_val));
+          }
+          auto s = std::make_unique<StoreInst>(Type::GetVoidType(), elem_val, gep_ptr);
+          vec_body->InsertInstructionBeforeTerminator(std::move(s));
+        }
+      } else {
+        // Vector store: the <4 x i32> value goes through the i32* pointer (MIR lowers it to str q).
+        auto vs = std::make_unique<StoreInst>(Type::GetVoidType(), stored_val, mapped_ptr);
+        vec_body->InsertInstructionBeforeTerminator(std::move(vs));
+      }
+      continue;
+    }
+
+    auto cloned = CloneAsVector(orig_inst, vec_vm, ctx);
+    if (!cloned) { drop_uses(vec_header.get()); drop_uses(vec_body.get()); return false; }
+    vec_vm[orig_inst] = cloned.get();
+    vec_body->InsertInstructionBeforeTerminator(std::move(cloned));
+  }
+
+  auto* vec_step_val = vec_body->Append<BinaryInst>(Opcode::Add, Type::GetInt32Type(),
+                                                     vec_phi, ctx.GetConstInt(kVF), ctx.NextTemp());
+  vec_body->Append<BranchInst>(Type::GetVoidType(), vec_header.get());
+  vec_phi->SetOperand(2, vec_step_val);
+
+  // vec_header condition and terminator; exit_bb is a placeholder false target
+  // that is replaced once scalar_header exists.
+  auto* vec_cond = vec_header->Append<BinaryInst>(Opcode::Lt, Type::GetInt1Type(),
+                                                    vec_phi, n_sub, ctx.NextTemp());
+  auto* vec_term = vec_header->Append<CondBranchInst>(Type::GetVoidType(), vec_cond,
+                                                       vec_body.get(), exit_bb);
+
+  // === Build the scalar remainder loop ===
+  auto scalar_header = std::make_unique<BasicBlock>(ctx.NextTemp() + "_sch");
+  auto scalar_body = std::make_unique<BasicBlock>(ctx.NextTemp() + "_scb");
+
+  auto* scalar_phi = scalar_header->Append<PhiInst>(Type::GetInt32Type(), ctx.NextTemp());
+  scalar_phi->AddOperand(vec_phi);              // value arriving from the vector loop
+  scalar_phi->AddOperand(vec_header.get());
+  scalar_phi->AddOperand(vec_phi);              // placeholder, replaced by scalar_step_val
+  scalar_phi->AddOperand(scalar_body.get());
+
+  std::unordered_map<Value*, Value*> scalar_vm;
+  scalar_vm[ind_phi] = scalar_phi;
+  for (auto* orig_inst : body_insts) {
+    if (orig_inst == latch_update) continue;
+    auto cloned = CloneScalar(orig_inst, scalar_vm, ctx);
+    if (!cloned) {  // unsupported instruction: abandon the rewrite
+      for (auto* bb : {vec_header.get(), vec_body.get(), scalar_header.get(), scalar_body.get()})
+        drop_uses(bb);
+      return false;
+    }
+    scalar_vm[orig_inst] = cloned.get();
+    scalar_body->InsertInstructionBeforeTerminator(std::move(cloned));
+  }
+
+  auto* scalar_step_val = scalar_body->Append<BinaryInst>(Opcode::Add, Type::GetInt32Type(),
+                                                           scalar_phi, ctx.GetConstInt(1), ctx.NextTemp());
+  scalar_body->Append<BranchInst>(Type::GetVoidType(), scalar_header.get());
+  scalar_phi->SetOperand(2, scalar_step_val);
+
+  auto* scalar_cond = scalar_header->Append<BinaryInst>(Opcode::Lt, Type::GetInt1Type(),
+                                                          scalar_phi, trip_count_val, ctx.NextTemp());
+  scalar_header->Append<CondBranchInst>(Type::GetVoidType(), scalar_cond,
+                                         scalar_body.get(), exit_bb);
+
+  // Point vec_header's false edge at scalar_header (replaces the exit_bb placeholder).
+  vec_term->SetOperand(2, scalar_header.get());
+
+  // Redirect the preheader to vec_header.
+  {
+    auto& pi = const_cast<std::vector<std::unique_ptr<Instruction>>&>(
+        preheader->GetInstructions());
+    if (!pi.empty()) {
+      auto* term = pi.back().get();
+      if (auto* br = dynamic_cast<BranchInst*>(term))
+        br->SetOperand(0, vec_header.get());
+      else if (auto* cbr = dynamic_cast<CondBranchInst*>(term)) {
+        if (cbr->GetTrueTarget() == header) cbr->SetOperand(1, vec_header.get());
+        if (cbr->GetFalseTarget() == header) cbr->SetOperand(2, vec_header.get());
+      }
+    }
+  }
+
+  // Retarget remaining references to the old header/body: phi incoming blocks, then branch targets.
+  for (auto& bb : func->GetBlocks()) {
+    for (auto& inst : bb->GetInstructions()) {
+      if (auto* phi = dynamic_cast<PhiInst*>(inst.get())) {
+        for (size_t i = 1; i < phi->GetNumOperands(); i += 2) {
+          auto* src = dynamic_cast<BasicBlock*>(phi->GetOperand(i));
+          if (src == header) phi->SetOperand(i, scalar_header.get());
+          else if (src == body) phi->SetOperand(i, scalar_body.get());
+        }
+      }
+      if (auto* br = dynamic_cast<BranchInst*>(inst.get())) {
+        if (br->GetTarget() == header) br->SetOperand(0, scalar_header.get());
+        else if (br->GetTarget() == body) br->SetOperand(0, scalar_body.get());
+      }
+      if (auto* cbr = dynamic_cast<CondBranchInst*>(inst.get())) {
+        if (cbr->GetTrueTarget() == header) cbr->SetOperand(1, scalar_header.get());
+        if (cbr->GetFalseTarget() == header) cbr->SetOperand(2, scalar_header.get());
+        if (cbr->GetTrueTarget() == body) cbr->SetOperand(1, scalar_body.get());
+        if (cbr->GetFalseTarget() == body) cbr->SetOperand(2, scalar_body.get());
+      }
+    }
+  }
+
+  // Every remaining use of the old induction phi now reads scalar_phi.
+  ind_phi->ReplaceAllUsesWith(scalar_phi);
+
+  // Insert the new blocks right after the preheader.
+  auto ipos = fb.begin();
+  for (auto it = fb.begin(); it != fb.end(); ++it)
+    if (it->get() == preheader) { ipos = it + 1; break; }
+  ipos = fb.insert(ipos, std::move(vec_header)) + 1;
+  ipos = fb.insert(ipos, std::move(vec_body)) + 1;
+  ipos = fb.insert(ipos, std::move(scalar_header)) + 1;
+  ipos = fb.insert(ipos, std::move(scalar_body)) + 1;
+
+  // Values defined in the old blocks but still used elsewhere are cloned
+  // recursively into the preheader so nothing dangles once those blocks are
+  // emptied. The cache prevents cloning the same value twice.
+  std::unordered_map<Value*, Value*> clone_cache;
+  std::function<Value*(Value*)> clone_to_preheader = [&](Value* val) -> Value* {
+    if (!val) return nullptr;
+    auto cache_it = clone_cache.find(val);
+    if (cache_it != clone_cache.end()) return cache_it->second;  // already handled
+    auto* inst = dynamic_cast<Instruction*>(val);
+    if (!inst) { clone_cache[val] = val; return val; }  // not an instruction: keep
+    auto* parent = inst->GetParent();
+    if (parent != header && parent != body) { clone_cache[val] = val; return val; }
+    std::unique_ptr<Instruction> cloned;  // operands are cloned recursively first
+    Opcode op = inst->GetOpcode();
+    switch (op) {
+      case Opcode::Add: case Opcode::Sub: case Opcode::Mul:
+      case Opcode::Div: case Opcode::Mod:
+      case Opcode::Eq:  case Opcode::Ne:  case Opcode::Lt:
+      case Opcode::Le:  case Opcode::Gt:  case Opcode::Ge: {
+        auto* bin = static_cast<BinaryInst*>(inst);
+        Value* new_lhs = clone_to_preheader(bin->GetLhs());
+        Value* new_rhs = clone_to_preheader(bin->GetRhs());
+        cloned = std::make_unique<BinaryInst>(op, bin->GetType(),
+                                              new_lhs, new_rhs, ctx.NextTemp());
+        break;
+      }
+      case Opcode::Load: {
+        auto* load = static_cast<LoadInst*>(inst);
+        Value* new_ptr = clone_to_preheader(load->GetPtr());
+        cloned = std::make_unique<LoadInst>(load->GetType(), new_ptr, ctx.NextTemp());
+        break;
+      }
+      case Opcode::GEP: {
+        auto* gep = static_cast<GetElementPtrInst*>(inst);
+        Value* new_base = clone_to_preheader(gep->GetBasePtr());
+        Value* new_idx = clone_to_preheader(gep->GetIndex());
+        cloned = std::make_unique<GetElementPtrInst>(gep->GetType(), new_base, new_idx,
+                                                      ctx.NextTemp());
+        break;
+      }
+      case Opcode::SIToFP: case Opcode::FPToSI: case Opcode::ZExt: {
+        auto* cast = static_cast<CastInst*>(inst);
+        Value* new_op = clone_to_preheader(cast->GetOperandValue());
+        cloned = std::make_unique<CastInst>(op, cast->GetType(), new_op, ctx.NextTemp());
+        break;
+      }
+      default:
+        clone_cache[val] = val;
+        return val;  // not clonable here (phi/alloc/call, ...): left where it is
+    }
+    auto* result = cloned.get();
+    preheader->InsertInstructionBeforeTerminator(std::move(cloned));
+    clone_cache[val] = result;
+    return result;
+  };
+
+  // Scan every other block and replace operands that live in the old blocks by their clone.
+  for (auto& bb : func->GetBlocks()) {
+    if (bb.get() == header || bb.get() == body) continue;
+    for (auto& inst : bb->GetInstructions()) {
+      for (size_t i = 0; i < inst->GetNumOperands(); i++) {
+        Value* new_op = clone_to_preheader(inst->GetOperand(i));
+        if (new_op != inst->GetOperand(i)) inst->SetOperand(i, new_op);
+      }
+    }
+  }
+
+  // Empty the old blocks (uses unregistered first, then the instructions are
+  // destroyed); the empty blocks are left for CFGSimplify to remove.
+  for (auto* old_bb : {header, body}) {
+    drop_uses(old_bb);
+    const_cast<std::vector<std::unique_ptr<Instruction>>&>(old_bb->GetInstructions()).clear();
+  }
+  return true;
+}
+
+}  // namespace
+
+// Driver of the loop vectorizer. In every defined function that passes the
+// type filter below, one counted loop (constant step 1, accepted by
+// CanVectorizeLoop) is rewritten per scan until a scan rewrites nothing.
+void RunLoopVectorize(Module& module) {
+  for (auto& func : module.GetFunctions()) {
+    if (func->IsExternal()) continue;
+    // NOTE(review): only functions whose GetType() is Int32 are processed;
+    // confirm this restriction is intended.
+    if (!func->GetType()->IsInt32()) continue;
+    auto succs = ComputeSuccessors(func.get());
+    bool changed = true;
+    while (changed) {
+      changed = false;
+      // Gather all candidates first so the block list is not modified while it
+      // is being iterated.
+      struct Candidate { BasicBlock* header; BasicBlock* body; BasicBlock* exit_bb;
+                         PhiInst* ind_phi; Value* trip_count; };
+      std::vector<Candidate> candidates;
+
+      for (const auto& bb : func->GetBlocks()) {
+        for (const auto& inst : bb->GetInstructions()) {
+          auto* cbr = dynamic_cast<CondBranchInst*>(inst.get());
+          if (!cbr) continue;
+          // Exactly one successor must satisfy CanReach: it becomes the body,
+          // the other one the exit.
+          BasicBlock *t = cbr->GetTrueTarget(), *f = cbr->GetFalseTarget();
+          bool t_reaches = CanReach(t, bb.get(), f, succs);
+          bool f_reaches = CanReach(f, bb.get(), t, succs);
+          BasicBlock *body = nullptr, *exit_bb = nullptr;
+          if (t_reaches && !f_reaches) { body = t; exit_bb = f; }
+          else if (f_reaches && !t_reaches) { body = f; exit_bb = t; }
+          else continue;
+          if (body == bb.get() || exit_bb == bb.get()) continue;
+
+          // Skip blocks created by this pass (recognised by their name
+          // suffixes) so generated loops are not vectorized again.
+          const auto& hdr_name = bb->GetName();
+          if (hdr_name.find("_sch") != std::string::npos ||
+              hdr_name.find("_scb") != std::string::npos ||
+              hdr_name.find("_vech") != std::string::npos ||
+              hdr_name.find("_vecb") != std::string::npos)
+            continue;
+
+          Value* trip_count = nullptr;
+          Value* step = nullptr;
+          auto* ind_phi = DetectCountedLoop(bb.get(), body, exit_bb, trip_count, step, succs);
+          if (!ind_phi || !trip_count || !step) continue;
+
+          auto* step_c = dynamic_cast<ConstantInt*>(step);
+          if (!step_c || step_c->GetValue() != 1) continue;
+
+          if (!CanVectorizeLoop(bb.get(), body, exit_bb, ind_phi)) continue;
+          if (!trip_count->GetType() || !trip_count->GetType()->IsInt32()) continue;
+
+          candidates.push_back({bb.get(), body, exit_bb, ind_phi, trip_count});
+        }
+      }
+
+      // Rewrite the first candidate that succeeds, then rescan the function.
+      for (const auto& c : candidates) {
+        try {
+          if (VectorizeLoop(func.get(), c.header, c.body, c.exit_bb,
+                           c.ind_phi, c.trip_count, module.GetContext())) {
+            changed = true;
+            succs = ComputeSuccessors(func.get());  // refresh the successor map
+            break;  // the block list changed
+          }
+        } catch (...) {}  // NOTE(review): any exception is swallowed silently
+      }
+    }
+  }
+}
+
+}  // namespace ir
diff --git a/src/ir/passes/SCCP.cpp b/src/ir/passes/SCCP.cpp
new file mode 100644
index 00000000..458e5b96
--- /dev/null
+++ b/src/ir/passes/SCCP.cpp
@@ -0,0 +1,261 @@
+// SCCP：稀疏条件常量传播
+// DEBUG：定位 SIGSEGV 根因
+
+#include "ir/IR.h"
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace ir {
+
+namespace {
+
+enum class LS : uint8_t { Undef, Const, Overdef };
+
+// Sparse conditional constant propagation (SCCP) over one function.
+// Each instruction has a lattice value that is only ever lowered:
+// Undef -> Const(v) -> Overdef. ConstantInt operands are Const, any other
+// non-instruction operand is Overdef, and only the opcodes in `supported` are
+// folded. Blocks become executable when a branch that may be taken targets
+// them. The function is swept until nothing changes; then i32 instructions
+// proven constant are replaced and erased, branches on a proven condition
+// become unconditional, and blocks unreachable from the entry are deleted.
+// `ctx` supplies the ConstantInt objects (null: no value is replaced).
+// Returns true if F was modified.
+static bool RunOnFunction(Function &F, Context *ctx) {
+  // Functions containing vector-typed instructions are left untouched.
+  for (auto &bb : F.GetBlocks())
+    for (auto &i : bb->GetInstructions())
+      if (i->GetType() && i->GetType()->IsVector()) return false;
+  auto *entry = F.GetEntry();
+  if (!entry) return false;
+
+  struct Lattice {
+    LS state = LS::Undef;
+    int value = 0;  // meaningful only while state == LS::Const
+  };
+  const Lattice kOverdef{LS::Overdef, 0};
+  std::unordered_map<Value*, Lattice> lattice;
+  std::unordered_set<BasicBlock*> exec{entry};
+  bool progress = true;  // a sweep lowered a value or marked a new block
+
+  // Lattice value of an arbitrary operand.
+  auto get_lat = [&](Value *v) -> Lattice {
+    if (auto *c = dynamic_cast<ConstantInt*>(v))
+      return Lattice{LS::Const, static_cast<int>(c->GetValue())};
+    if (!dynamic_cast<Instruction*>(v)) return kOverdef;
+    auto it = lattice.find(v);
+    return it != lattice.end() ? it->second : Lattice{};  // unvisited: Undef
+  };
+
+  // Lowers I to `nv`; a request to move back up the lattice is ignored.
+  auto lower = [&](Instruction *I, Lattice nv) {
+    Lattice &cur = lattice[I];
+    if (nv.state == LS::Undef || cur.state == LS::Overdef) return;
+    if (cur.state == LS::Const) {
+      if (nv.state == LS::Const && nv.value == cur.value) return;
+      nv = kOverdef;  // Const meeting anything different
+    }
+    cur = nv;
+    progress = true;
+  };
+
+  auto mark_exec = [&](BasicBlock *bb) {
+    if (bb && exec.insert(bb).second) progress = true;
+  };
+
+  // Opcodes this pass can evaluate on integer constants.
+  auto supported = [](Opcode op) {
+    switch (op) {
+    case Opcode::Add: case Opcode::Sub: case Opcode::Mul: case Opcode::Div:
+    case Opcode::Mod: case Opcode::Eq:  case Opcode::Ne:  case Opcode::Lt:
+    case Opcode::Le:  case Opcode::Gt:  case Opcode::Ge:  case Opcode::And:
+    case Opcode::Or:  case Opcode::ZExt: return true;
+    default: return false;
+    }
+  };
+
+  // Evaluates `op` on constants; false means "do not fold". Add/Sub/Mul are
+  // computed unsigned so they wrap instead of overflowing inside the compiler.
+  // Division by 0 has no result and division by -1 can overflow: neither folds.
+  auto fold = [](Opcode op, int x, int y, int &out) -> bool {
+    const unsigned ux = static_cast<unsigned>(x);
+    const unsigned uy = static_cast<unsigned>(y);
+    switch (op) {
+    case Opcode::Add: out = static_cast<int>(ux + uy); return true;
+    case Opcode::Sub: out = static_cast<int>(ux - uy); return true;
+    case Opcode::Mul: out = static_cast<int>(ux * uy); return true;
+    case Opcode::Div: if (y == 0 || y == -1) return false; out = x / y; return true;
+    case Opcode::Mod: if (y == 0 || y == -1) return false; out = x % y; return true;
+    case Opcode::Eq:  out = (x == y); return true;
+    case Opcode::Ne:  out = (x != y); return true;
+    case Opcode::Lt:  out = (x < y);  return true;
+    case Opcode::Le:  out = (x <= y); return true;
+    case Opcode::Gt:  out = (x > y);  return true;
+    case Opcode::Ge:  out = (x >= y); return true;
+    case Opcode::And: out = (x & y);  return true;
+    case Opcode::Or:  out = (x | y);  return true;
+    case Opcode::ZExt: out = x; return true;
+    default: return false;
+    }
+  };
+
+  // Transfer function for one instruction of an executable block.
+  auto visit = [&](Instruction *I) {
+    if (auto *phi = dynamic_cast<PhiInst*>(I)) {
+      Lattice acc;  // meet over the inputs arriving from executable blocks
+      for (size_t k = 0; k + 1 < phi->GetNumOperands(); k += 2) {
+        auto *pred = dynamic_cast<BasicBlock*>(phi->GetOperand(k + 1));
+        if (!pred || !exec.count(pred)) continue;
+        const Lattice in = get_lat(phi->GetOperand(k));
+        if (in.state == LS::Undef) continue;
+        const bool clash = acc.state == LS::Const && acc.value != in.value;
+        if (in.state == LS::Overdef || clash) { acc = kOverdef; break; }
+        acc = in;
+      }
+      lower(I, acc);
+      return;
+    }
+    if (auto *br = dynamic_cast<BranchInst*>(I)) { mark_exec(br->GetTarget()); return; }
+    if (auto *cb = dynamic_cast<CondBranchInst*>(I)) {
+      // Only a proven constant restricts the successors; Undef and Overdef
+      // conditions keep both targets executable.
+      const Lattice c = get_lat(cb->GetCond());
+      if (c.state != LS::Const || c.value != 0) mark_exec(cb->GetTrueTarget());
+      if (c.state != LS::Const || c.value == 0) mark_exec(cb->GetFalseTarget());
+      return;
+    }
+    if (I->IsTerminator()) return;
+    const Opcode op = I->GetOpcode();
+    const size_t arity = (op == Opcode::ZExt) ? 1 : 2;
+    if (!supported(op) || I->GetNumOperands() < arity) { lower(I, kOverdef); return; }
+    const Lattice a = get_lat(I->GetOperand(0));
+    const Lattice b = (arity == 2) ? get_lat(I->GetOperand(1)) : a;
+    if (a.state == LS::Overdef || b.state == LS::Overdef) {
+      lower(I, kOverdef);
+    } else if (a.state == LS::Const && b.state == LS::Const) {
+      int v = 0;
+      lower(I, fold(op, a.value, b.value, v) ? Lattice{LS::Const, v} : kOverdef);
+    }  // otherwise an operand is still Undef: decided in a later sweep
+  };
+
+  while (progress) {
+    progress = false;
+    for (auto &bb : F.GetBlocks()) {
+      if (!exec.count(bb.get())) continue;
+      for (auto &i : bb->GetInstructions()) visit(i.get());
+    }
+  }
+
+  bool changed = false;
+  // 1. Replace proven i32 values. Comparison results are not i32 and stay in
+  //    place; their constant is still used for the branch folding below.
+  std::unordered_set<Instruction*> folded;
+  if (ctx) {
+    for (auto &bb : F.GetBlocks()) {
+      if (!exec.count(bb.get())) continue;
+      for (auto &i : bb->GetInstructions()) {
+        auto it = lattice.find(i.get());
+        if (it == lattice.end() || it->second.state != LS::Const) continue;
+        if (!i->GetType() || !i->GetType()->IsInt32()) continue;
+        auto *ci = ctx->GetConstInt(it->second.value);
+        if (!ci) continue;
+        i->ReplaceAllUsesWith(ci);
+        folded.insert(i.get());
+        changed = true;
+      }
+    }
+  }
+
+  // Rebuilds each phi of `blk` without the inputs whose incoming block is `gone`.
+  auto prune_phis = [&](BasicBlock *blk, auto &&gone) {
+    for (auto &i : blk->GetInstructions()) {
+      auto *phi = dynamic_cast<PhiInst*>(i.get());
+      if (!phi) continue;
+      std::vector<Value*> keep;  // surviving (value, block) pairs, flattened
+      for (size_t k = 0; k + 1 < phi->GetNumOperands(); k += 2) {
+        if (gone(dynamic_cast<BasicBlock*>(phi->GetOperand(k + 1)))) continue;
+        keep.push_back(phi->GetOperand(k));
+        keep.push_back(phi->GetOperand(k + 1));
+      }
+      if (keep.size() == phi->GetNumOperands()) continue;
+      phi->ClearOperands();
+      for (Value *v : keep) phi->AddOperand(v);
+      changed = true;
+    }
+  };
+
+  // 2. Turn conditional branches on a proven condition into plain branches.
+  for (auto &bb : F.GetBlocks()) {
+    if (!exec.count(bb.get())) continue;
+    auto &insts = const_cast<std::vector<std::unique_ptr<Instruction>>&>(bb->GetInstructions());
+    if (insts.empty()) continue;
+    auto *cbr = dynamic_cast<CondBranchInst*>(insts.back().get());
+    if (!cbr) continue;
+    const Lattice c = get_lat(cbr->GetCond());
+    if (c.state != LS::Const) continue;
+    BasicBlock *taken = c.value ? cbr->GetTrueTarget() : cbr->GetFalseTarget();
+    BasicBlock *dropped = c.value ? cbr->GetFalseTarget() : cbr->GetTrueTarget();
+    cbr->ClearOperands();  // unregister its uses before it is destroyed
+    insts.pop_back();
+    bb->Append<BranchInst>(Type::GetVoidType(), taken);
+    // The edge bb -> dropped is gone, so the matching phi inputs go as well.
+    BasicBlock *self = bb.get();
+    if (dropped && dropped != taken)
+      prune_phis(dropped, [self](BasicBlock *from) { return from == self; });
+    changed = true;
+  }
+
+  // 3. Delete every block the final CFG can no longer reach from the entry.
+  std::unordered_set<BasicBlock*> live{entry};
+  std::vector<BasicBlock*> work{entry};
+  while (!work.empty()) {
+    BasicBlock *cur = work.back();
+    work.pop_back();
+    for (auto &i : cur->GetInstructions()) {
+      BasicBlock *succ[2] = {nullptr, nullptr};
+      if (auto *br = dynamic_cast<BranchInst*>(i.get())) {
+        succ[0] = br->GetTarget();
+      } else if (auto *cb = dynamic_cast<CondBranchInst*>(i.get())) {
+        succ[0] = cb->GetTrueTarget();
+        succ[1] = cb->GetFalseTarget();
+      }
+      for (BasicBlock *s : succ)
+        if (s && live.insert(s).second) work.push_back(s);
+    }
+  }
+  auto &blocks = const_cast<std::vector<std::unique_ptr<BasicBlock>>&>(F.GetBlocks());
+  std::vector<std::unique_ptr<BasicBlock>> kept;
+  for (auto &bb : blocks)
+    if (live.count(bb.get()))
+      prune_phis(bb.get(), [&live](BasicBlock *from) { return from && !live.count(from); });
+  for (auto &bb : blocks) {
+    if (live.count(bb.get())) { kept.push_back(std::move(bb)); continue; }
+    for (auto &i : bb->GetInstructions()) i->ClearOperands();
+    changed = true;
+  }
+  blocks = std::move(kept);
+
+  // Erase the replaced instructions so a later run no longer reports them.
+  for (auto &bb : blocks) {
+    auto &insts = const_cast<std::vector<std::unique_ptr<Instruction>>&>(bb->GetInstructions());
+    for (size_t k = 0; k < insts.size();) {
+      if (!folded.count(insts[k].get())) { ++k; continue; }
+      insts[k]->ClearOperands();
+      insts.erase(insts.begin() + k);
+    }
+  }
+  return changed;
+}
+
+} // namespace
+
+bool RunSCCP(Module &mod) {
+  auto *context = &mod.GetContext();
+  bool any_change = false;  // becomes true once some function was modified
+  for (auto &fn : mod.GetFunctions())
+    any_change = RunOnFunction(*fn, context) || any_change;  // the pass always runs
+  return any_change;
+}
+
+} // namespace ir
diff --git a/src/mir/GreedyAlloc.cpp b/src/mir/GreedyAlloc.cpp
new file mode 100644
index 00000000..fe0cc5a0
--- /dev/null
+++ b/src/mir/GreedyAlloc.cpp
@@ -0,0 +1,1907 @@
+// LLVM Greedy Register Allocator —— 真·LLVM Greedy 完整实现
+//
+// 架构：
+// 1. 核心干涉检测：LiveIntervals::Interfere 预计算干涉图 → O(1) CanAssign 查询
+#include "mir/LiveRangeEdit.h"
+//    （等于 LLVM 的 LiveIntervalUnion 缓存层——预计算干涉信息，而非每次 O(N) 遍历）
+// 2. RegUnit 映射：Wn/Xn 共享 regunit n → 别名天然处理
+// 3. LLVM Greedy 分配策略：优先级驱动 + 驱逐机制 + Spill 迭代循环
+// 4. Spill 代码复用 RegAlloc.cpp 中验证过的 RewriteWithAllocation
+//
+// 关键设计决策：
+// - 干涉图使用 LiveIntervals::Interfere 构建（与 RegAlloc.cpp 相同的验证正确性）
+// - CanAssign 使用干涉图 O(1) 查询（而非 O(N) 遍历）
+// - ctx.info[v].phys_reg 是分配状态的唯一真实来源
+// - LiveIntervalUnion 保留为未来 SplitKit 的 O(log n) 动态缓存层
+//
+// 与旧 RegAlloc 的本质区别：
+// - 分配策略：贪心优先级+驱逐 vs Briggs 图着色
+// - RegUnit 原生支持 vs 着色后检查
+// - 架构为 SplitKit 预留接口
+
+#include "mir/GreedyAlloc.h"
+#include "mir/LiveIntervals.h"
+#include "mir/MIR.h"
+#include "mir/MachineRegisterInfo.h"
+
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <queue>
+#include <iostream>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace mir {
+namespace {
+
+constexpr bool kDebugGreedy = false;
+
+// Debug counters for allocation analysis
+// static int dbg_total_vregs = 0; ...  (disabled: used for LIU spill investigation)
+
+// ============================================================================
+// 1. LiveIntervalUnion —— per-regunit 有序区间并集（保留，供未来 O(log n) 缓存）
+// ============================================================================
+
+struct LUSegment {
+  int start;   // inclusive, block-relative instruction index
+  int end;     // exclusive
+  int vreg;    // id of the owner recorded for this segment
+
+  bool Overlaps(int s, int e) const { return start < e && s < end; }
+};
+
+// Per-regunit union of live segments, grouped by block and kept sorted by
+// `start`. Segments stored for one block may overlap each other (RebuildLIU
+// adds one phantom segment per interfering vreg), so a query has to inspect
+// every segment starting before the query end, not only the neighbours of
+// the binary-search position.
+class LiveIntervalUnion {
+public:
+  // True if any segment stored for `block_idx` intersects [start, end).
+  bool Overlaps(int block_idx, int start, int end) const {
+    auto it = blocks_.find(block_idx);
+    if (it == blocks_.end()) return false;
+    const auto &vec = it->second;
+    // Segments from `limit` onwards start at or after `end`: no intersection.
+    auto limit = StartLowerBound(vec, end);
+    return std::any_of(vec.begin(), limit,
+        [start](const LUSegment &s) { return s.end > start; });
+  }
+
+  // Inserts [start, end) for `vreg`, keeping the block's segments sorted by start.
+  void Add(int block_idx, int start, int end, int vreg) {
+    auto &vec = blocks_[block_idx];
+    vec.insert(StartLowerBound(vec, start), {start, end, vreg});
+  }
+
+  // Drops every segment owned by `vreg`, in all blocks.
+  void Remove(int vreg) {
+    for (auto &[bi, vec] : blocks_)
+      vec.erase(std::remove_if(vec.begin(), vec.end(),
+          [vreg](const LUSegment &s) { return s.vreg == vreg; }), vec.end());
+  }
+
+  // Forgets all segments of all blocks.
+  void Clear() { blocks_.clear(); }
+
+  // Owners of all segments intersecting [start, end), in start order. An owner
+  // with several intersecting segments appears once per segment. Linear in the
+  // number of segments that start before `end`.
+  std::vector<int> GetOccupants(int block_idx, int start, int end) const {
+    std::vector<int> result;
+    auto it = blocks_.find(block_idx);
+    if (it == blocks_.end()) return result;
+    const auto &vec = it->second;
+    auto limit = StartLowerBound(vec, end);
+    for (auto seg_it = vec.begin(); seg_it != limit; ++seg_it)
+      if (seg_it->Overlaps(start, end)) result.push_back(seg_it->vreg);
+    return result;
+  }
+
+  // Read-only view of the block index -> sorted segment list map.
+  const auto &Blocks() const { return blocks_; }
+
+private:
+  // First segment of `vec` (sorted by start) whose start is >= `pos`.
+  static std::vector<LUSegment>::const_iterator StartLowerBound(
+      const std::vector<LUSegment> &vec, int pos) {
+    return std::lower_bound(vec.begin(), vec.end(), pos,
+        [](const LUSegment &s, int val) { return s.start < val; });
+  }
+
+  std::unordered_map<int, std::vector<LUSegment>> blocks_;
+};
+
+// ============================================================================
+// 2. RegUnit 映射
+// ============================================================================
+
+static int ToRegUnit(PhysReg reg) {
+  // Offset of `reg` from `base`, shifted by the id bias of its register file.
+  auto unit = [reg](PhysReg base, int bias) {
+    return bias + static_cast<int>(reg) - static_cast<int>(base);
+  };
+  if (PhysReg::W0 <= reg && reg <= PhysReg::W30) return unit(PhysReg::W0, 0);
+  if (PhysReg::X0 <= reg && reg <= PhysReg::X30) return unit(PhysReg::X0, 0);
+  if (PhysReg::S0 <= reg && reg <= PhysReg::S31) return unit(PhysReg::S0, 100);
+  if (PhysReg::Q0 <= reg && reg <= PhysReg::Q31) return unit(PhysReg::Q0, 200);
+  return -1;  // outside all four W/X/S/Q ranges
+}
+
+static bool Compat(PhysReg r, VRegClass vc) {  // is r inside the range used for class vc?
+  auto within = [r](PhysReg lo, PhysReg hi) { return lo <= r && r <= hi; };
+  if (vc == VRegClass::Int)   return within(PhysReg::W0, PhysReg::W30);
+  if (vc == VRegClass::Ptr)   return within(PhysReg::X0, PhysReg::X30);
+  if (vc == VRegClass::Float) return within(PhysReg::S0, PhysReg::S31);
+  return vc == VRegClass::Vec && within(PhysReg::Q0, PhysReg::Q31);
+}
+
+static bool IsGP(PhysReg r)  { return !(r<PhysReg::W0||r>PhysReg::W30) || !(r<PhysReg::X0||r>PhysReg::X30); }
+static bool IsFP(PhysReg r)  { return !(r < PhysReg::S0 || r > PhysReg::S31); }
+static bool IsVec(PhysReg r) { return !(r < PhysReg::Q0 || r > PhysReg::Q31); }
+static int  RegNum(PhysReg r) {  // index of r inside its register file, else -1
+  auto from = [r](PhysReg base) { return static_cast<int>(r) - static_cast<int>(base); };
+  if (IsGP(r))  return from(r >= PhysReg::X0 ? PhysReg::X0 : PhysReg::W0);
+  if (IsFP(r))  return from(PhysReg::S0);
+  return IsVec(r) ? from(PhysReg::Q0) : -1;
+}
+
+// ============================================================================
+// 3. 寄存器集
+// ============================================================================
+
+// x16/x17 (IP0/IP1) stay reserved as dedicated scratch registers and are never allocated.
+static const int GP_NUMS[]      = {8,9,10,11,12,13,14,15,19,20,21,22,23,24,25,26,27,28};
+static const int GP_COUNT       = static_cast<int>(sizeof(GP_NUMS) / sizeof(GP_NUMS[0]));
+static const int LEAF_GP_NUMS[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,19,20,21,22,23,24,25,26,27,28};
+static const int LEAF_GP_COUNT  = static_cast<int>(sizeof(LEAF_GP_NUMS) / sizeof(LEAF_GP_NUMS[0]));
+// Extended set with x0-x7: non-leaf functions may also use caller-saved registers; the per-vreg RegHint governs safety across calls.
+static const int EXT_GP_NUMS[]  = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,19,20,21,22,23,24,25,26,27,28};
+static const int EXT_GP_COUNT   = static_cast<int>(sizeof(EXT_GP_NUMS) / sizeof(EXT_GP_NUMS[0]));
+static const int FP_NUMS[] = {8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+static const int FP_COUNT  = static_cast<int>(sizeof(FP_NUMS) / sizeof(FP_NUMS[0]));
+static const int VEC_NUMS[] = {0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+static const int VEC_COUNT  = static_cast<int>(sizeof(VEC_NUMS) / sizeof(VEC_NUMS[0]));
+
+// ============================================================================
+// 4. 分配状态
+// ============================================================================
+
+// Per-vreg register-class hint.
+enum class RegHint : uint8_t {
+  kPreferCaller,  // prefer caller-saved, fall back to callee-saved (default)
+  kCalleeOnly,    // must be callee-saved (vreg is live across a call)
+  kAnyGP,         // any GP register (leaf functions)
+};
+
+// BlockRange: a vreg's live range inside one block (first = first reference, last = last reference).
+// Filled by BuildInterfGraph; shared by the interference graph and the LIU.
+struct BlockRange { int first; int last; bool has_ref; };
+
+struct AllocInfo {  // per-vreg allocation record; FuncCtx::info holds one per vreg
+  PhysReg phys_reg = PhysReg::W0;
+  bool spilled = false;
+  int slot = -1;  // presumably the spill slot, -1 while unassigned — TODO confirm
+  bool remat = false;  // set by ComputeCosts for single-def rematerializable vregs
+  int remat_imm = 0;   // GetRematImm() of that defining instruction (see ComputeCosts)
+  RegHint reg_hint = RegHint::kPreferCaller;  // overwritten by ComputeRegHints
+};
+
+struct FuncCtx {
+  MachineFunction *mf = nullptr;
+  bool leaf = false;
+  LiveIntervals li;
+  std::vector<int> depths;  // per-block value, indexed by block position (read by ComputeCosts)
+  std::unordered_map<int, int> costs;  // vreg → accumulated weight (see ComputeCosts)
+  std::vector<AllocInfo> info;           // single source of truth: vreg → PhysReg
+
+  // Candidate registers (PhysReg)
+  std::vector<PhysReg> gp_cands, fp_cands, vec_cands;
+
+  // Precomputed: vregs that are live across a Call (kept for compatibility)
+  std::unordered_set<int> cross_call_vregs;
+
+  // Interference graph: node → set of interfering nodes. Filled by BuildInterfGraph;
+  // negative ids -(ru + 1) are phantom nodes standing for physical regunit ru.
+  std::unordered_map<int, std::unordered_set<int>> interf_graph;
+
+  // Live-range data: vreg → {block → {first, last, has_ref}}
+  // Filled by BuildInterfGraph from the same liveness data as the interference graph.
+  std::unordered_map<int, std::unordered_map<MachineBasicBlock*, BlockRange>> vr;
+
+  // ── LiveIntervalUnion: per-regunit segment cache ──
+  // Modelled on LLVM's LiveIntervalUnion; meant for fast allocation queries.
+  // Key: regunit id (0-30=GP, 100+=FP, 200+=Vec)
+  std::unordered_map<int, LiveIntervalUnion> regunit_liu;
+
+  // Block → index map (the LIU addresses blocks by int index)
+  std::unordered_map<MachineBasicBlock*, int> block_index;
+};
+
+// ============================================================================
+// 5. 辅助函数
+// ============================================================================
+
+static bool IsLeafFunc(MachineFunction &f) {  // true when no block holds a Call
+  auto is_call = [](auto &mi) { return mi.GetOpcode() == Opcode::Call; };
+  for (auto &blk : f.GetBlocks())
+    if (auto &mis = blk->GetInstructions(); std::any_of(mis.begin(), mis.end(), is_call)) return false;
+  return true;
+}
+
+// Direct-recursion check: does any Call carry a Symbol operand equal to this function's name?
+static bool IsRecursiveFunc(MachineFunction &f) {
+  auto &self = f.GetName();
+  for (auto &blk : f.GetBlocks())
+    for (auto &mi : blk->GetInstructions()) {
+      if (mi.GetOpcode() != Opcode::Call) continue;
+      for (auto &arg : mi.GetOperands())
+        if (arg.GetKind() == Operand::Kind::Symbol && arg.GetSymbol() == self) return true;
+    }
+  return false;
+}
+
+static std::vector<int> LoopDepths(MachineFunction &f) {  // returns one depth estimate per block, indexed by block position
+  auto &bs=f.GetBlocks(); size_t n=bs.size();
+  std::vector<int> d(n,0);  // result: d[i] is bumped once per back-edge region that reaches block i
+  std::unordered_map<int,size_t> l2b;  // label id → block position
+  for(size_t i=0;i<n;++i) l2b[bs[i]->GetLabelId()]=i;
+  std::vector<std::vector<size_t>> s(n);  // successor lists, built only from the branch operands below
+  for(size_t i=0;i<n;++i)
+    for(auto &inst:bs[i]->GetInstructions()){
+      if(inst.GetOpcode()==Opcode::Br&&inst.GetOperands().size()>=1&&inst.GetOperands()[0].GetKind()==Operand::Kind::Label)
+        if(auto it=l2b.find(inst.GetOperands()[0].GetLabel());it!=l2b.end()) s[i].push_back(it->second);
+      if(inst.GetOpcode()==Opcode::CondBr&&inst.GetOperands().size()>=2&&inst.GetOperands()[1].GetKind()==Operand::Kind::Label)
+        if(auto it=l2b.find(inst.GetOperands()[1].GetLabel());it!=l2b.end()) s[i].push_back(it->second);
+    }  // NOTE(review): only Br operand 0 and CondBr operand 1 become edges; any other CondBr target or fall-through is not modelled — confirm
+  if(n==0) return d;
+  std::vector<int> dn(n,-1); std::vector<bool> stk(n,false); int ctr=0;  // DFS number, on-stack flag, DFS counter
+  std::vector<std::pair<size_t,size_t>> be;  // back edges (source, target): target was on the DFS stack when the edge was seen
+  std::function<void(size_t)> dfs=[&](size_t u){dn[u]=ctr++;stk[u]=true;for(auto v:s[u]){if(dn[v]==-1)dfs(v);else if(stk[v])be.push_back({u,v});}stk[u]=false;};
+  dfs(0);  // blocks not reachable from block 0 through `s` are never visited
+  for(auto [src,tgt]:be){std::vector<bool> vis(n,false);std::vector<size_t> q{tgt};vis[tgt]=true;
+    while(!q.empty()){auto cur=q.back();q.pop_back();d[cur]++;if(cur==src)continue;for(auto nx:s[cur])if(!vis[nx]){vis[nx]=true;q.push_back(nx);}}}
+  return d;  // per back edge: every block reached by following successors from tgt (not expanding past src) got +1
+}
+
+static void ComputeCosts(FuncCtx &ctx) {  // fills ctx.costs and the remat flags of ctx.info
+  auto &f=*ctx.mf; auto &bs=f.GetBlocks();
+  // dc: defining-instruction count per vreg, ld: last def seen. Negative ids are skipped (as in the cost loop) so ctx.info is never indexed with them.
+  std::unordered_map<int,int> dc; std::unordered_map<int,MachineInstr*> ld;
+  for(auto &b:bs) for(auto &inst:b->GetInstructions()){auto du=MachineRegisterInfo::GetInstDefUse(inst);for(int d:du.defs) if(d>=0){dc[d]++;ld[d]=&inst;}}
+  for(auto &[v,c]:dc) if(c==1&&ld[v]&&ld[v]->IsRematerializable()){ctx.info[v].remat=true;ctx.info[v].remat_imm=ld[v]->GetRematImm();}
+  for(size_t i=0;i<bs.size();++i){
+    int w=1;for(int dd=0;dd<ctx.depths[i]&&w<1000000;++dd) w*=10;  // weight 10^depth, capped at 1e6
+    for(auto &inst:bs[i]->GetInstructions()){auto du=MachineRegisterInfo::GetInstDefUse(inst);
+      for(int u:du.uses) if(u>=0) ctx.costs[u]+=ctx.info[u].remat?1:w;  // a use of a remat vreg costs 1
+      for(int d:du.defs) if(d>=0&&!ctx.info[d].remat) ctx.costs[d]+=w;  // defs of remat vregs add nothing
+    }
+  }
+  for(int i=0;i<f.GetNumVRegs();++i) if(!ctx.costs.count(i)) ctx.costs[i]=1;  // vregs never referenced get cost 1
+}
+
+// Collects into ctx.cross_call_vregs every vreg with a live segment that covers a Call index.
+static void ComputeCrossCallVRegs(FuncCtx &ctx) {
+  // Block → instruction indices of the Call instructions it contains.
+  std::unordered_map<MachineBasicBlock*, std::vector<int>> call_sites;
+  for (auto &blk : ctx.mf->GetBlocks()) {
+    int pos = 0;
+    for (auto &mi : blk->GetInstructions()) {
+      if (mi.GetOpcode() == Opcode::Call) call_sites[blk.get()].push_back(pos);
+      ++pos;
+    }
+  }
+
+  // True when the half-open segment [seg.start, seg.end) of `block` contains a call index.
+  auto covers_call = [&call_sites](MachineBasicBlock *block, const auto &seg) {
+    auto found = call_sites.find(block);
+    if (found == call_sites.end()) return false;
+    return std::any_of(found->second.begin(), found->second.end(),
+                       [&seg](int cs) { return seg.start <= cs && cs < seg.end; });
+  };
+  const int num_vregs = ctx.mf->GetNumVRegs();
+  for (int v = 0; v < num_vregs; ++v) {
+    auto *intervals = ctx.li.GetIntervals(v);
+    if (!intervals) continue;
+    for (auto &[block, seg] : *intervals)
+      if (covers_call(block, seg)) { ctx.cross_call_vregs.insert(v); break; }
+  }
+}
+
+// Per-vreg register hints: kAnyGP for every vreg of a leaf function, otherwise
+// kCalleeOnly when the vreg is in ctx.cross_call_vregs and kPreferCaller if not.
+static void ComputeRegHints(FuncCtx &ctx) {
+  const int num_vregs = ctx.mf->GetNumVRegs();
+  for (int v = 0; v < num_vregs; ++v) {
+    RegHint hint = RegHint::kAnyGP;  // value used as-is for leaf functions
+    if (!ctx.leaf) {
+      // Non-leaf: reuse the precomputed cross-call set.
+      const bool crosses_call = ctx.cross_call_vregs.count(v) != 0;
+      hint = crosses_call ? RegHint::kCalleeOnly : RegHint::kPreferCaller;
+    }
+    ctx.info[v].reg_hint = hint;
+  }
+}
+
+// Rebuilds the LIUs: clears every existing regunit union, then re-adds the
+// phantom constraints from ctx.interf_graph using the ranges stored in ctx.vr.
+static void RebuildLIU(FuncCtx &ctx) {
+  for (auto &entry : ctx.regunit_liu) entry.second.Clear();
+
+  // Negative graph nodes are phantoms: node id -(ru + 1) stands for regunit ru.
+  for (auto &[node_id, neighbours] : ctx.interf_graph) {
+    if (node_id >= 0) continue;
+    const int ru = -(node_id + 1);
+    for (int nb : neighbours) {
+      if (nb < 0) continue;
+      auto ranges = ctx.vr.find(nb);
+      if (ranges == ctx.vr.end()) continue;
+      // One segment per block in which the neighbour has a recorded range.
+      for (auto &[block, range] : ranges->second) {
+        if (!range.has_ref) continue;
+        auto slot = ctx.block_index.find(block);
+        if (slot != ctx.block_index.end())
+          ctx.regunit_liu[ru].Add(slot->second, range.first, range.last + 1, node_id);
+      }
+    }
+  }
+}
+
+// Interference-graph construction: records, per vreg and per block, the first and last reference
+// index, treats [first, last] as the live range, and adds an edge only when two ranges overlap.
+// No dynamic live set is maintained and there is no O(nv × blocks) initialisation.
+static void BuildInterfGraph(FuncCtx &ctx) {
+  int nv = ctx.mf->GetNumVRegs();
+  // Pass 1: record the first and last reference of every vreg inside every block.
+  // Stored in ctx.vr so the LIU can reuse the same data.
+  ctx.vr.clear();  // NOTE(review): ctx.interf_graph is not cleared here — presumably the caller resets it; confirm
+  auto &vr = ctx.vr;
+
+  for (auto &block : ctx.mf->GetBlocks()) {
+    int ii = 0;
+    for (auto &inst : block->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int u : du.uses) {
+        if (u < 0) continue;
+        auto &r = vr[u][block.get()];
+        if (!r.has_ref) { r.first = ii; r.has_ref = true; }
+        r.last = ii;
+      }
+      for (int d : du.defs) {
+        if (d < 0) continue;
+        auto &r = vr[d][block.get()];
+        if (!r.has_ref || ii < r.first) r.first = ii;
+        r.last = ii;
+        r.has_ref = true;
+      }
+      ii++;
+    }
+  }
+
+  // Use LiveIntervals to add live-in/live-out information and widen first/last accordingly.
+  for (int v = 0; v < nv; ++v) {
+    auto *iv = ctx.li.GetIntervals(v);
+    if (!iv) continue;
+    for (auto &[block, seg] : *iv) {
+      auto &r = vr[v][block];
+      int block_sz = (int)block->GetInstructions().size();
+      // live-in: the segment starts at the top of the block
+      if (seg.start == 0 && (!r.has_ref || r.first > 0))
+        r.first = 0;
+      // live-out: the segment extends to the end of the block
+      if (seg.end >= block_sz) {
+        if (!r.has_ref || r.last < block_sz - 1)
+          r.last = block_sz - 1;
+        if (!r.has_ref) r.first = 0;
+      }
+      if (!r.has_ref) {  // no reference in this block: take the segment bounds as the range
+        r.first = seg.start;
+        r.last = seg.end - 1;
+        r.has_ref = true;
+      }
+    }
+  }
+
+  // Pass 2: for every block, gather the vregs that have a range there and detect overlaps.
+  auto add_edge = [&](int a, int b) {
+    if (a == b) return;
+    if (a < 0 || b < 0) {  // phantom (negative) endpoints: added without a class check
+      ctx.interf_graph[a].insert(b);
+      ctx.interf_graph[b].insert(a);
+      return;
+    }
+    VRegClass va = ctx.mf->GetVRegClass(a);
+    VRegClass vb = ctx.mf->GetVRegClass(b);
+    bool same_or_alias = (va == vb) ||
+      (va == VRegClass::Int && vb == VRegClass::Ptr) ||
+      (va == VRegClass::Ptr && vb == VRegClass::Int);
+    if (!same_or_alias) return;
+    ctx.interf_graph[a].insert(b);
+    ctx.interf_graph[b].insert(a);
+  };
+
+  for (auto &block : ctx.mf->GetBlocks()) {
+    std::vector<std::pair<int, BlockRange>> active;
+    for (auto &[v, bmap] : vr) {
+      auto it = bmap.find(block.get());
+      if (it != bmap.end()) active.push_back({v, it->second});
+    }
+    if (active.size() < 2) continue;
+    // Sort by range start
+    std::sort(active.begin(), active.end(),
+      [](const auto &a, const auto &b) { return a.second.first < b.second.first; });
+    struct ActiveEntry { int last; int vreg; };
+    std::vector<ActiveEntry> sweep_active;
+    for (size_t i = 0; i < active.size(); ++i) {
+      auto &[vi, ri] = active[i];
+      sweep_active.erase(
+        std::remove_if(sweep_active.begin(), sweep_active.end(),
+          [&ri](const ActiveEntry &a) { return a.last < ri.first; }),
+        sweep_active.end());
+      for (auto &ae : sweep_active) add_edge(vi, ae.vreg);
+      sweep_active.push_back({ri.last, vi});
+    }
+  }
+
+  // ---- Pre-coloured nodes: per-block phantom ranges in both directions ----
+  // Read direction (MovReg(vreg, PhysReg)): loading a parameter / return value; a vreg whose
+  //   first reference precedes the deadline interferes with the phantom (PhysReg value still in use)
+  // Write direction (MovReg(PhysReg, VReg)): setting up a call argument; the PhysReg is written,
+  //   so a vreg with last_ref >= def_idx interferes (the PhysReg value gets overwritten)
+  for (auto &block : ctx.mf->GetBlocks()) {
+    auto &einsts = block->GetInstructions();
+    std::unordered_map<int, int> ru_last_read;  // ru → index of the last instruction reading it
+    std::unordered_map<int, int> ru_first_def;  // ru → index of the first instruction writing it
+    for (int ii = 0; ii < (int)einsts.size(); ++ii) {
+      if (einsts[ii].GetOpcode() == Opcode::MovReg) {
+        auto &ops = einsts[ii].GetOperands();
+        if (ops.size() >= 2) {
+          if (ops[0].GetKind() == Operand::Kind::VReg &&
+              ops[1].GetKind() == Operand::Kind::Reg) {
+            int src_ru = ToRegUnit(ops[1].GetReg());
+            if (src_ru >= 0 && src_ru <= 30)
+              ru_last_read[src_ru] = std::max(ru_last_read[src_ru], ii);
+          }
+          if (ops[0].GetKind() == Operand::Kind::Reg &&
+              ops[1].GetKind() == Operand::Kind::VReg) {
+            int dst_ru = ToRegUnit(ops[0].GetReg());
+            if (dst_ru >= 0 && dst_ru <= 30) {
+              if (!ru_first_def.count(dst_ru) || ii < ru_first_def[dst_ru])
+                ru_first_def[dst_ru] = ii;
+            }
+          }
+        }
+      }
+    }
+    // Read direction: first_ref < deadline → interference
+    for (auto &[ru, deadline] : ru_last_read) {
+      int pc_id = -(ru + 1);
+      for (auto &[v, bmap] : vr) {
+        if (v < 0) continue;
+        auto it = bmap.find(block.get());
+        if (it == bmap.end()) continue;
+        auto &r = it->second;
+        if (r.has_ref && r.first < deadline) add_edge(pc_id, v);
+      }
+    }
+    // Write direction: last_ref >= def_idx → interference (writing the PhysReg would overwrite the vreg's value)
+    for (auto &[ru, def_idx] : ru_first_def) {
+      int pc_id = -(ru + 1);
+      for (auto &[v, bmap] : vr) {
+        if (v < 0) continue;
+        auto it = bmap.find(block.get());
+        if (it == bmap.end()) continue;
+        auto &r = it->second;
+        if (r.has_ref && r.last >= def_idx) add_edge(pc_id, v);
+      }
+    }
+  }
+
+  // ---- Call-clobber phantoms: protect values from caller-saved registers ----
+  // At every Call, phantom edges to regunits 0-17 are added for each vreg whose range covers it,
+  // with the intent that such a vreg is never given one of those registers.
+  // Meant to replace the binary cross_call flag. NOTE(review): only GP regunits 0-17 are covered here — confirm FP/Vec handling
+  for (auto &clob_block : ctx.mf->GetBlocks()) {
+    auto &einsts = clob_block->GetInstructions();
+    for (int ii = 0; ii < (int)einsts.size(); ++ii) {
+      if (einsts[ii].GetOpcode() != Opcode::Call) continue;
+      for (int ru = 0; ru <= 17; ++ru) {
+        int pc_id = -(ru + 1);
+        for (auto &[v, bmap] : vr) {
+          if (v < 0) continue;
+          auto it = bmap.find(clob_block.get());
+          if (it == bmap.end()) continue;
+          auto &r = it->second;
+          if (r.has_ref && r.first <= ii && ii <= r.last)
+            add_edge(pc_id, v);
+        }
+      }
+    }
+  }
+}
+
+// ============================================================================
+// 5.5 Register Coalescing —— 合并 copy-connected 的非干涉 vreg
+//
+// 扫描所有 MovReg(vreg, vreg) 指令，若 dst 和 src 不干涉则将 dst 的所有引用
+// 替换为 src 并删除 MovReg。消除冗余副本，直接减少 MOV 指令数。
+//
+// 保守条件：
+//   - dst 和 src 同 VRegClass
+//   - dst 仅有一个定义（MovReg 自身）
+//   - dst 和 src 的活跃区间不重叠（使用与 BuildInterfGraph 相同的区间重叠检测）
+//
+// 传递闭包：%0→%1 且 %1→%2 → %0→%2，一次扫描处理 copy chain
+// ============================================================================
+
+static bool Coalesce(MachineFunction &f, const LiveIntervals &li,
+                     const std::unordered_map<int, MachineBasicBlock*> *phi_block_map = nullptr) {
+  // Coalesces copy-connected vregs: for `dst = COPY src` whose live ranges do
+  // not interfere, every dst operand is rewritten to src and the copy removed.
+  // `li` is expected to describe `f` on entry; `phi_block_map` is never read.
+  // Returns true when at least one (dst, src) pair was accepted.
+  int nv = f.GetNumVRegs();
+
+  // Number of defining instructions per vreg.
+  std::vector<int> def_counts(nv, 0);
+  for (auto &b : f.GetBlocks())
+    for (auto &inst : b->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int d : du.defs) if (d >= 0 && d < nv) def_counts[d]++;
+    }
+
+  // Interference test on whole-function segments (SlotIndex based); `exclude` is a copy to ignore.
+  auto overlap = [&](int a, int b, const MachineInstr *exclude = nullptr) -> bool {
+    if (!exclude) return li.InterfereSegments(a, b);
+    SlotIndex ex_slot = li.GetInstSlot(exclude);
+    if (ex_slot.IsValid()) return li.InterfereSegmentsExcept(a, b, ex_slot);
+    return li.InterfereSegments(a, b); // fallback: stay conservative when there is no slot
+  };
+
+  // Scan every MovReg and collect the pairs that can be coalesced.
+  struct CP { int dst; int src; };
+  std::vector<CP> pairs;
+
+  // For multi-def vregs: every vreg-to-vreg MovReg definition, keyed by its source vreg.
+  // multi_def_info[dst] = {src1 → &MovReg1, src2 → &MovReg2, ...}
+  // The stored instruction is the copy to exclude in the interference test.
+  std::unordered_map<int, std::unordered_map<int, const MachineInstr*>> multi_def_info;
+
+  for (auto &block : f.GetBlocks()) {
+    for (auto &inst : block->GetInstructions()) {
+      if (inst.GetOpcode() != Opcode::MovReg) continue;
+      auto &ops = inst.GetOperands();
+      if (ops.size() < 2) continue;
+      if (ops[0].GetKind() != Operand::Kind::VReg) continue;
+      if (ops[1].GetKind() != Operand::Kind::VReg) continue;
+
+      int dst = ops[0].GetVRegId();
+      int src = ops[1].GetVRegId();
+      if (dst == src) continue;
+      if (dst >= nv || src >= nv) continue;
+
+      if (f.GetVRegClass(dst) != f.GetVRegClass(src)) continue;
+
+      if (def_counts[dst] == 1) {
+        // Both sides single-def → the MovReg itself can safely be excluded.
+        if (def_counts[src] == 1) {
+          if (overlap(dst, src, &inst)) continue;
+        } else {
+          if (overlap(dst, src)) continue;
+        }
+        pairs.push_back({dst, src});
+      } else {
+        multi_def_info[dst][src] = &inst;
+      }
+    }
+  }
+
+  // Multi-def dst fed by copies from exactly one source vreg: same test as above.
+  for (auto &[dst, src_to_inst] : multi_def_info) {
+    if (src_to_inst.size() == 1) {
+      auto [src, inst] = *src_to_inst.begin();
+      if (overlap(dst, src, inst)) continue;
+      pairs.push_back({dst, src});
+    }
+  }
+
+  // ---- Multi-source (phi-like) merge: apply first, then verify, else roll back ----
+  // NOTE(review): Step B and the rollback rebuild the instruction vectors, so the
+  // MachineInstr pointers held in multi_def_info (and any slot lookup keyed on them)
+  // may be stale afterwards; confirm GetInstSlot then takes the conservative path.
+  for (auto &[dst, src_to_inst] : multi_def_info) {
+    if (src_to_inst.size() <= 1) continue;
+    // At most one merge per dst: stop at the first source that verifies.
+    for (auto &[src_i, inst_i] : src_to_inst) {
+      // Check 0: dst and src_i must not interfere (ignoring their own copy).
+      if (overlap(dst, src_i, inst_i)) continue;
+
+      // Snapshot the current instructions for rollback.
+      std::vector<std::vector<MachineInstr>> saved;
+      for (auto &b : f.GetBlocks())
+        saved.push_back(b->GetInstructions());
+
+      // Apply the merge.
+      // Step A: rewrite every dst operand to src_i.
+      for (auto &b : f.GetBlocks())
+        for (auto &mi : b->GetInstructions())
+          for (auto &op : mi.GetOperands())
+            if (op.GetKind() == Operand::Kind::VReg && op.GetVRegId() == dst)
+              const_cast<Operand&>(op) = Operand::VReg(src_i, f.GetVRegClass(src_i));
+
+      // Step B: delete the resulting self-copies (dst=COPY src_i → src_i=COPY src_i).
+      for (auto &b : f.GetBlocks()) {
+        std::vector<MachineInstr> ni;
+        for (auto &mi : b->GetInstructions()) {
+          if (mi.GetOpcode() == Opcode::MovReg && mi.GetOperands().size() >= 2 &&
+              mi.GetOperands()[0].GetKind() == Operand::Kind::VReg &&
+              mi.GetOperands()[0].GetVRegId() == src_i &&
+              mi.GetOperands()[1].GetKind() == Operand::Kind::VReg &&
+              mi.GetOperands()[1].GetVRegId() == src_i) {
+            continue; // self-copy → drop
+          }
+          ni.push_back(std::move(const_cast<MachineInstr&>(mi)));
+        }
+        b->GetInstructions() = std::move(ni);
+      }
+
+      // Step C: recompute LiveIntervals on the rewritten function.
+      LiveIntervals new_li;
+      new_li.Compute(f);
+
+      // Step D: verify that src_i does not interfere with any other source (copy point excluded).
+      bool valid = true;
+      for (auto &[src_j, inst_j] : src_to_inst) {
+        if (src_j == src_i) continue;
+        SlotIndex sj = new_li.GetInstSlot(inst_j);
+        if (sj.IsValid()) {
+          if (new_li.InterfereSegmentsExcept(src_i, src_j, sj))
+            { valid = false; break; }
+        } else {
+          if (new_li.InterfereSegments(src_i, src_j))
+            { valid = false; break; }
+        }
+      }
+
+      if (valid) {
+        pairs.push_back({dst, src_i});
+        break; // merge accepted, move on to the next dst
+      } else {
+        // Roll back: restore the snapshot.
+        for (size_t bi = 0; bi < saved.size(); ++bi)
+          f.GetBlocks()[bi]->GetInstructions() = std::move(saved[bi]);
+      }
+    }
+  }
+
+  if (pairs.empty()) return false;
+
+  // Build the replacement map; each dst is linked to the current root of src (cycles skipped).
+  std::unordered_map<int, int> replace;
+  for (auto &[dst, src] : pairs) {
+    int ult = src;
+    std::unordered_set<int> visited;
+    visited.insert(dst);
+    while (replace.count(ult)) {
+      if (visited.count(ult)) { ult = dst; break; } // cycle → skip this pair
+      visited.insert(ult);
+      ult = replace[ult];
+    }
+    if (ult != dst) replace[dst] = ult;
+  }
+
+  // Follows a replacement chain to its root. Entries are only ever linked to a vreg that is
+  // not itself a key, so the map is acyclic; a later pair may still extend an earlier chain.
+  auto resolve = [&replace](int v) {
+    for (auto it = replace.find(v); it != replace.end(); it = replace.find(v)) v = it->second;
+    return v;
+  };
+
+  // Rewrite operands first, then drop only the copies that became `v = COPY v`.
+  for (auto &block : f.GetBlocks()) {
+    std::vector<MachineInstr> ni;
+    for (auto &inst : block->GetInstructions()) {
+      bool touched = false;
+      for (auto &op : inst.GetOperands()) {
+        if (op.GetKind() != Operand::Kind::VReg || !replace.count(op.GetVRegId())) continue;
+        int root = resolve(op.GetVRegId());
+        const_cast<Operand &>(op) = Operand::VReg(root, f.GetVRegClass(root));
+        touched = true;
+      }
+      auto &ops = inst.GetOperands();
+      if (touched && inst.GetOpcode() == Opcode::MovReg && ops.size() >= 2 &&
+          ops[0].GetKind() == Operand::Kind::VReg && ops[1].GetKind() == Operand::Kind::VReg &&
+          ops[0].GetVRegId() == ops[1].GetVRegId()) continue;
+      ni.push_back(std::move(const_cast<MachineInstr &>(inst)));
+    }
+    block->GetInstructions() = std::move(ni);
+  }
+
+  return true;
+}
+
+
+// ============================================================================
+// 5.6 SplitKit：活范围分裂——分配失败时的回退策略
+//
+// 当 AllocClass 无法为 vreg 分配寄存器时，在循环边界分裂活范围。
+// 高循环深度（hot）部分获得独立 vreg，低深度（cold）部分保留原 vreg。
+// 每段更短 → 更易分配 → 减少整体 spill。
+//
+// 分裂策略：
+//   1. 按循环深度划分 hot/cold 区域
+//   2. 在 hot↔cold 边界插入 MovReg 拷贝
+//   3. 使用 LiveRangeEdit 增量更新 LiveIntervals
+// ============================================================================
+
+// Live-range split at loop boundaries: divides vreg into a hot part (using blocks above a depth threshold) and a cold part (the rest).
+// Returns the new vreg id (hot region), or -1 on failure.
+static int SplitVRegAtLoopBoundary(FuncCtx &ctx, int vreg) {
+  MachineFunction &f = *ctx.mf;
+  auto &blocks = f.GetBlocks();
+  int n = (int)blocks.size();
+  if (n == 0 || ctx.depths.empty()) return -1;
+
+  VRegClass vc = f.GetVRegClass(vreg);
+
+  // ---- 1. Collect the usage pattern ----
+  std::vector<int> use_count(n, 0);
+  int def_block_idx = -1;
+  int total_uses = 0;
+
+  for (int i = 0; i < n; ++i) {
+    for (auto &inst : blocks[i]->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int u : du.uses) if (u == vreg) { use_count[i]++; total_uses++; }
+      for (int d : du.defs) if (d == vreg) def_block_idx = i;  // with several defs the last block visited wins
+    }
+  }
+
+  if (total_uses == 0 || def_block_idx < 0) return -1;
+
+  // ---- 2. Use-weighted average depth ----
+  int weighted = 0;
+  for (int i = 0; i < n; ++i) {
+    int d = (i < (int)ctx.depths.size()) ? ctx.depths[i] : 0;
+    weighted += d * use_count[i];
+  }
+  int avg_depth = total_uses > 0 ? weighted / total_uses : 0;
+
+  // Depth range over the blocks that use vreg
+  int max_d = 0, min_d = 9999;
+  for (int i = 0; i < n; ++i) {
+    if (use_count[i] == 0) continue;
+    int d = (i < (int)ctx.depths.size()) ? ctx.depths[i] : 0;
+    if (d > max_d) max_d = d;
+    if (d < min_d) min_d = d;
+  }
+  // Give up only when all uses share one depth AND there are at most 4 uses in total
+  if (max_d <= min_d && total_uses <= 4) return -1;
+
+  // Hot threshold: avg_depth - 1 but never below 1, so blocks of depth 0 or 1 are never hot
+  int threshold = std::max(1, avg_depth > 0 ? avg_depth - 1 : 0);
+
+  // ---- 3. Partition into hot / cold blocks ----
+  std::unordered_set<MachineBasicBlock*> hot_blocks;
+  for (int i = 0; i < n; ++i) {
+    int d = (i < (int)ctx.depths.size()) ? ctx.depths[i] : 0;
+    if (d > threshold && use_count[i] > 0)
+      hot_blocks.insert(blocks[i].get());
+  }
+
+  // Splitting is still attempted when the hot set covers most (but not all) blocks
+  if (hot_blocks.empty()) return -1;
+  if (hot_blocks.size() >= (size_t)n) return -1;  // every block is hot → nothing cold to split off
+
+  // ---- 4. Create the hot vreg + replace uses ----
+  int hot_vreg = f.CreateVReg(vc);
+  LiveRangeEdit lre(f, ctx.li);
+
+  // Replace the uses inside hot blocks
+  lre.ReplaceUsesInBlocks(vreg, hot_vreg, hot_blocks);
+
+  // ---- 5. Insert boundary COPYs ----
+  // Build the predecessor relation
+  std::unordered_map<int, MachineBasicBlock*> label2block;
+  for (auto &b : blocks) label2block[b->GetLabelId()] = b.get();
+
+  std::unordered_map<MachineBasicBlock*, std::vector<MachineBasicBlock*>> preds;
+  for (auto &b : blocks) {
+    for (auto &succ : b->GetSuccessors()) {
+      auto it = label2block.find(succ.label);
+      if (it != label2block.end()) preds[it->second].push_back(b.get());
+    }
+  }
+
+  // Insert a copy at the entry of a hot block that has a cold predecessor
+  for (auto *hb : hot_blocks) {
+    auto pit = preds.find(hb);
+    if (pit == preds.end()) continue;
+    bool has_cold_pred = false;
+    for (auto *p : pit->second)
+      if (!hot_blocks.count(p)) { has_cold_pred = true; break; }
+    if (has_cold_pred)
+      lre.InsertCopyAtEntry(hb, hot_vreg, vreg);
+  }
+
+  // Insert a copy at the exit of a hot block that has a cold successor
+  for (auto *hb : hot_blocks) {
+    bool has_cold_succ = false;
+    for (auto &succ : hb->GetSuccessors()) {
+      auto it = label2block.find(succ.label);
+      if (it != label2block.end() && !hot_blocks.count(it->second))
+        { has_cold_succ = true; break; }
+    }
+    if (has_cold_succ) {
+      // Insert the hot→cold copy at the end of the hot block, before its trailing Br/CondBr/Ret
+      auto &insts = const_cast<std::vector<MachineInstr>&>(hb->GetInstructions());
+      size_t insert_pos = insts.size();
+      while (insert_pos > 0) {
+        auto op = insts[insert_pos - 1].GetOpcode();
+        if (op == Opcode::Br || op == Opcode::CondBr || op == Opcode::Ret)
+          --insert_pos;
+        else
+          break;
+      }
+      insts.insert(insts.begin() + insert_pos,
+        MachineInstr(Opcode::MovReg, {
+          Operand::VReg(vreg, vc),
+          Operand::VReg(hot_vreg, vc)
+        }));
+    }
+  }
+
+  // ---- 6. Cold definition → hot use ----
+  if (def_block_idx >= 0) {
+    auto *db = blocks[def_block_idx].get();
+    if (!hot_blocks.count(db)) {
+      // The definition sits in a cold block, so a cold→hot copy is placed right after it
+      auto &insts = const_cast<std::vector<MachineInstr>&>(db->GetInstructions());
+      for (size_t ii = 0; ii < insts.size(); ++ii) {
+        auto du = MachineRegisterInfo::GetInstDefUse(insts[ii]);
+        bool defines = false;
+        for (int d : du.defs) if (d == vreg) { defines = true; break; }
+        if (defines) {
+          insts.insert(insts.begin() + ii + 1,
+            MachineInstr(Opcode::MovReg, {
+              Operand::VReg(hot_vreg, vc),
+              Operand::VReg(vreg, vc)
+            }));
+          break;
+        }
+      }
+    }
+  }
+
+  // ---- 7. Commit ---- NOTE(review): the copies of steps 5 (exit) and 6 bypass `lre`; confirm Commit() accounts for them
+  lre.Commit();
+
+  if (kDebugGreedy)
+    std::cerr << "[SplitKit] vreg %" << vreg << " split: hot=%" << hot_vreg
+              << " in " << hot_blocks.size() << " blocks (avg_depth="
+              << avg_depth << ")" << std::endl;
+
+  return hot_vreg;
+}
+
+// Call-boundary live-range split: divides the uses of `vreg` into a "caller" part and a "cross-call" part.
+// Caller part: uses in blocks that contain no Call; renamed to a fresh vreg (intent: it may live in caller-saved registers).
+// Cross-call part: uses in blocks that contain a Call; they keep `vreg` (intent: it must live in callee-saved registers).
+// Returns the id of the new caller-part vreg, or -1 when no split is performed.
+static int SplitVRegAtCallBoundary(FuncCtx &ctx, int vreg) {
+  MachineFunction &f = *ctx.mf;
+  auto &blocks = f.GetBlocks();
+  int n = (int)blocks.size();
+  if (n == 0) return -1;
+
+  VRegClass vc = f.GetVRegClass(vreg);
+
+  // 1. Collect every block that contains at least one Call instruction
+  std::unordered_set<MachineBasicBlock*> call_blocks;
+  for (auto &b : blocks) {
+    for (auto &inst : b->GetInstructions()) {
+      if (inst.GetOpcode() == Opcode::Call) {
+        call_blocks.insert(b.get());
+        break;
+      }
+    }
+  }
+  if (call_blocks.empty()) return -1;
+
+  // 2. Scan all instructions: which blocks use vreg, how many uses there are, and which block defines it
+  std::vector<bool> used_in_block(n, false);
+  int def_block_idx = -1;
+  int total_uses = 0;
+  for (int i = 0; i < n; ++i) {
+    for (auto &inst : blocks[i]->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int u : du.uses) if (u == vreg) { used_in_block[i] = true; total_uses++; }
+      for (int d : du.defs) if (d == vreg) def_block_idx = i;  // with several defining blocks, the last one scanned wins
+    }
+  }
+  if (total_uses == 0) return -1;
+
+  // 3. Classify the using blocks by the mere presence of a Call (not by liveness across it): none -> caller, some -> cross-call
+  std::unordered_set<MachineBasicBlock*> caller_blocks, cross_call_blocks;
+  for (int i = 0; i < n; ++i) {
+    if (!used_in_block[i]) continue;
+    if (call_blocks.count(blocks[i].get()))
+      cross_call_blocks.insert(blocks[i].get());
+    else
+      caller_blocks.insert(blocks[i].get());
+  }
+  if (caller_blocks.empty() || cross_call_blocks.empty()) return -1;  // a split needs uses on both sides
+
+  // 4. Create the caller-part vreg and ask the edit object to redirect the uses inside caller blocks to it
+  int caller_vreg = f.CreateVReg(vc);
+  LiveRangeEdit lre(f, ctx.li);
+  lre.ReplaceUsesInBlocks(vreg, caller_vreg, caller_blocks);
+
+  // 5. Build the predecessor map from the successor lists, then insert the boundary copies
+  std::unordered_map<int, MachineBasicBlock*> label2block;
+  for (auto &b : blocks) label2block[b->GetLabelId()] = b.get();
+  std::unordered_map<MachineBasicBlock*, std::vector<MachineBasicBlock*>> preds;
+  for (auto &b : blocks) {
+    for (auto &succ : b->GetSuccessors()) {
+      auto it = label2block.find(succ.label);
+      if (it != label2block.end()) preds[it->second].push_back(b.get());
+    }
+  }
+
+  // Caller block with at least one cross-call predecessor: entry copy (caller_vreg, vreg) - presumably (dst, src); TODO confirm
+  for (auto *cb : caller_blocks) {
+    auto pit = preds.find(cb);
+    if (pit == preds.end()) continue;
+    bool has_cross_pred = false;
+    for (auto *p : pit->second)
+      if (cross_call_blocks.count(p)) { has_cross_pred = true; break; }
+    if (has_cross_pred)
+      lre.InsertCopyAtEntry(cb, caller_vreg, vreg);
+  }
+  // Cross-call block with at least one caller predecessor: entry copy (vreg, caller_vreg) - presumably (dst, src); TODO confirm
+  for (auto *ccb : cross_call_blocks) {
+    auto pit = preds.find(ccb);
+    if (pit == preds.end()) continue;
+    bool has_caller_pred = false;
+    for (auto *p : pit->second)
+      if (caller_blocks.count(p)) { has_caller_pred = true; break; }
+    if (has_caller_pred)
+      lre.InsertCopyAtEntry(ccb, vreg, caller_vreg);
+  }
+
+  // 6. Definition block: insert a copy directly after the first instruction in that block that defines vreg
+  if (def_block_idx >= 0) {
+    auto *db = blocks[def_block_idx].get();
+    auto &insts = const_cast<std::vector<MachineInstr>&>(db->GetInstructions());
+    for (size_t ii = 0; ii < insts.size(); ++ii) {
+      auto du = MachineRegisterInfo::GetInstDefUse(insts[ii]);
+      bool defines = false;
+      for (int d : du.defs) if (d == vreg) { defines = true; break; }
+      if (defines) {
+        if (!caller_blocks.count(db) && cross_call_blocks.count(db))
+          // Defined in a cross-call block: MovReg with operands {caller_vreg, vreg} hands the value to the caller part
+          insts.insert(insts.begin() + ii + 1,
+            MachineInstr(Opcode::MovReg, {
+              Operand::VReg(caller_vreg, vc),
+              Operand::VReg(vreg, vc)}));
+        else if (caller_blocks.count(db) && !cross_call_blocks.count(db))
+          // Defined in a caller block: MovReg {vreg, caller_vreg}. NOTE(review): only right if the def is renamed to caller_vreg at Commit - confirm
+          insts.insert(insts.begin() + ii + 1,
+            MachineInstr(Opcode::MovReg, {
+              Operand::VReg(vreg, vc),
+              Operand::VReg(caller_vreg, vc)}));
+        break;
+      }
+    }
+  }
+
+  lre.Commit();
+
+  if (kDebugGreedy)
+    std::cerr << "[SplitKit-call] vreg %" << vreg << " split at call boundary: caller=%"
+              << caller_vreg << " in " << caller_blocks.size()
+              << " blocks, cross-call=" << cross_call_blocks.size() << " blocks" << std::endl;
+
+  return caller_vreg;
+}
+
+// ============================================================================
+// 6. Core allocation: LLVM-Greedy style (priority driven, direct PhysReg assignment, eviction)
+//
+// Algorithm:
+//   1. Order vregs by descending spill weight = cost / (live-range length * max(1, degree))
+//   2. Heavier vregs are allocated first, so the most important ones pick their register first
+//   3. For each vreg:
+//      a. A regunit is blocked when the vreg overlaps its LiveIntervalUnion or a precolored neighbour owns it
+//      b. Try a free regunit (kCalleeOnly hint: callee-saved only; otherwise caller-saved first, then callee-saved)
+//      c. Nothing free -> evict: choose the regunit whose interfering assigned neighbours have the lowest total cost
+//      d. Eviction impossible -> mark the vreg as spilled
+//
+// RegUnit constraint: Wn and Xn share regunit n, so a neighbour holding either of them conflicts
+// Precolored nodes: negative ids in the interference graph; node id k pins regunit -(k + 1)
+// Cross-call vregs: callee-saved registers only (a call destroys values kept in caller-saved ones)
+//
+// Differences from the former Briggs colouring: priority driven rather than degree driven, PhysRegs chosen
+// directly rather than through abstract colour numbers, and active eviction rather than passive spilling.
+// AllocClass(vregs, cands, ctx, phi_pairs): vregs = ids to allocate; cands = the PhysRegs they may receive;
+// ctx = per-function state, results are written to ctx.info[v].phys_reg / .spilled and ctx.regunit_liu;
+// phi_pairs (optional) = vreg -> phi partners, merged into the copy hints with extra eviction preference.
+// ============================================================================
+
+static void AllocClass(const std::vector<int> &vregs, const std::vector<PhysReg> &cands, FuncCtx &ctx,
+                       const std::unordered_map<int, std::vector<int>> *phi_pairs = nullptr) {
+  if (vregs.empty()) return;
+
+  // ---- 1. regunit -> PhysReg maps, one per register width (picked later according to the vreg class) ----
+  std::unordered_map<int, PhysReg> ru_to_wreg, ru_to_xreg, ru_to_sreg, ru_to_qreg;
+  std::set<int> all_rus;
+  for (auto r : cands) {
+    int ru = ToRegUnit(r);
+    all_rus.insert(ru);
+    if (r >= PhysReg::W0 && r <= PhysReg::W30)      ru_to_wreg[ru] = r;
+    else if (r >= PhysReg::X0 && r <= PhysReg::X30) ru_to_xreg[ru] = r;
+    else if (r >= PhysReg::S0 && r <= PhysReg::S31) ru_to_sreg[ru] = r;
+    else if (r >= PhysReg::Q0 && r <= PhysReg::Q31) ru_to_qreg[ru] = r;
+  }
+
+  auto find_reg = [&](int ru, VRegClass vc) -> PhysReg {  // falls back to register 0 of the bank if ru has no candidate of that width
+    if (vc == VRegClass::Int)   { auto it = ru_to_wreg.find(ru); return it != ru_to_wreg.end() ? it->second : PhysReg::W0; }
+    if (vc == VRegClass::Ptr)   { auto it = ru_to_xreg.find(ru); return it != ru_to_xreg.end() ? it->second : PhysReg::X0; }
+    if (vc == VRegClass::Float) { auto it = ru_to_sreg.find(ru); return it != ru_to_sreg.end() ? it->second : PhysReg::S0; }
+    if (vc == VRegClass::Vec)   { auto it = ru_to_qreg.find(ru); return it != ru_to_qreg.end() ? it->second : PhysReg::Q0; }
+    return PhysReg::W0;
+  };
+
+  // ---- 2. Split the regunits into caller-saved and callee-saved ----
+  std::vector<int> caller_rus, callee_rus, caller_rus_safe;  // NOTE(review): caller_rus_safe is filled but never read here
+  for (int ru : all_rus) {
+    bool is_callee = false;
+    if (ru < 100) {
+      is_callee = (ru >= 19);  // x19-x28 = callee-saved
+    } else if (ru < 200) {
+      is_callee = ((ru - 100) >= 16);  // NOTE(review): AAPCS64 preserves v8-v15 (low 64 bits), not v16-v31 - confirm this convention
+    } else {
+      is_callee = ((ru - 200) >= 16);
+    }
+    if (is_callee) callee_rus.push_back(ru);
+    else {
+      caller_rus.push_back(ru);
+      if (ru != 16 && ru != 17) caller_rus_safe.push_back(ru);
+    }
+  }
+
+  // ---- 3. Priorities (LLVM-style spill weight) ----
+  // spillWeight = cost / (liveRangeLen * degree)
+  // short live range + low interference + high cost -> allocated early
+  // long live range + high interference + low cost  -> allocated late, first candidate for spilling
+  std::unordered_map<int, int> range_len;
+  for (int v : vregs) {
+    int total_len = 0;
+    auto *iv = ctx.li.GetIntervals(v);
+    if (iv) for (auto &[block, seg] : *iv) total_len += seg.end - seg.start;
+    range_len[v] = std::max(1, total_len);
+  }
+
+  // Copy partners: vregs connected by a vreg-to-vreg MovReg anywhere in the function (all classes)
+  std::unordered_map<int, std::vector<int>> copy_partners;
+  for (auto &block : ctx.mf->GetBlocks()) {
+    for (auto &inst : block->GetInstructions()) {
+      if (inst.GetOpcode() != Opcode::MovReg) continue;
+      auto &ops = inst.GetOperands();
+      if (ops.size() >= 2 &&
+          ops[0].GetKind() == Operand::Kind::VReg &&
+          ops[1].GetKind() == Operand::Kind::VReg) {
+        int dst = ops[0].GetVRegId();
+        int src = ops[1].GetVRegId();
+        if (dst != src) {
+          copy_partners[dst].push_back(src);
+          copy_partners[src].push_back(dst);
+        }
+      }
+    }
+  }
+
+  // Merge the phi connections into the copy partners (both ends of a phi should end up in the same register)
+  if (phi_pairs) {
+    for (auto &[v, partners] : *phi_pairs) {
+      for (int p : partners) {
+        // skip partners that are already listed
+        auto &cp = copy_partners[v];
+        if (std::find(cp.begin(), cp.end(), p) == cp.end())
+          cp.push_back(p);
+      }
+    }
+  }
+
+  struct VRegPriority { int vreg; double weight; int cost; };
+  std::vector<VRegPriority> queue;
+  for (int v : vregs) {
+    int cost = ctx.costs.count(v) ? ctx.costs.at(v) : 1;
+    int deg = 0;
+    auto it = ctx.interf_graph.find(v);
+    if (it != ctx.interf_graph.end())
+      for (int nb : it->second) if (nb >= 0) deg++;
+    // spill weight = cost / (range_len * degree); precolored (negative) neighbours do not count towards the degree
+    // short live range + low interference + high cost = first in line for a register
+    double weight = (double)cost / (std::max(1.0, (double)range_len[v]) * std::max(1, deg));
+    queue.push_back({v, weight, cost});
+  }
+  std::sort(queue.begin(), queue.end(),
+    [](const VRegPriority &a, const VRegPriority &b) { return a.weight > b.weight; });
+
+  // Copy-group affinity: place copy-connected vregs next to each other in the queue
+  if (!copy_partners.empty()) {
+    std::unordered_map<int, int> group_id;
+    std::vector<std::vector<int>> groups;
+    std::unordered_set<int> visited;
+    for (auto &[v, _] : copy_partners) {
+      if (visited.count(v)) continue;
+      std::vector<int> comp, stack;
+      stack.push_back(v); visited.insert(v);
+      while (!stack.empty()) {
+        int cur = stack.back(); stack.pop_back();
+        comp.push_back(cur);
+        auto it = copy_partners.find(cur);
+        if (it != copy_partners.end())
+          for (int p : it->second)
+            if (!visited.count(p)) { visited.insert(p); stack.push_back(p); }
+      }
+      if (comp.size() > 1) {
+        int gid = (int)groups.size();
+        groups.push_back(std::move(comp));
+        for (int m : groups.back()) group_id[m] = gid;
+      }
+    }
+    if (!groups.empty()) {
+      std::unordered_map<int, int> vreg_to_queue_idx;
+      for (size_t i = 0; i < queue.size(); ++i) vreg_to_queue_idx[queue[i].vreg] = (int)i;
+      std::vector<VRegPriority> nq;
+      std::unordered_set<int> placed;
+      for (auto &vp : queue) {
+        if (placed.count(vp.vreg)) continue;
+        auto gi = group_id.find(vp.vreg);
+        if (gi != group_id.end()) {
+          auto &members = groups[gi->second];
+          std::vector<VRegPriority> grp;  // group members that are in this queue; others (other class, already spilled) are skipped
+          for (int m : members) { auto qi = vreg_to_queue_idx.find(m); if (qi != vreg_to_queue_idx.end()) grp.push_back(queue[qi->second]); }
+          std::sort(grp.begin(), grp.end(),
+              [](const VRegPriority &a, const VRegPriority &b) { return a.weight > b.weight; });
+          for (auto &gvp : grp) { nq.push_back(gvp); placed.insert(gvp.vreg); }
+        } else {
+          nq.push_back(vp);
+          placed.insert(vp.vreg);
+        }
+      }
+      queue = std::move(nq);
+    }
+  }
+
+  // ---- 4. Set of vregs handled by this call + fixed constraints of the precolored nodes ----
+  std::unordered_set<int> vreg_set(vregs.begin(), vregs.end());
+  std::unordered_map<int, int> assigned_ru; // vreg -> regunit (precolored nodes included)
+
+  // Precolored nodes that touch one of our vregs: record their fixed regunit
+  for (auto &[node_id, edges] : ctx.interf_graph) {
+    if (node_id >= 0) continue;
+    for (int nb : edges) {
+      if (vreg_set.count(nb)) {
+        assigned_ru[node_id] = -(node_id + 1);
+        break;
+      }
+    }
+  }
+
+  // Copy hint: the regunit suggested for v by an already-assigned related vreg, or -1
+  auto get_copy_hint = [&](int v) -> int {
+    // First choice: a copy/phi partner that already has a regunit
+    auto ch = copy_partners.find(v);
+    if (ch != copy_partners.end()) {
+      for (int partner : ch->second) {
+        auto ait = assigned_ru.find(partner);
+        if (ait != assigned_ru.end()) return ait->second;
+      }
+    }
+    // Fallback: any assigned same-class interference neighbour. NOTE(review): it interferes with v, so its unit is normally blocked
+    auto it = ctx.interf_graph.find(v);
+    if (it == ctx.interf_graph.end()) return -1;
+    for (int nb : it->second) {
+      if (nb < 0) continue;
+      auto ait = assigned_ru.find(nb);
+      if (ait != assigned_ru.end() &&
+          ctx.mf->GetVRegClass(nb) == ctx.mf->GetVRegClass(v))
+        return ait->second;
+    }
+    return -1;
+  };
+
+  // ---- 5. Core allocation loop: priority order, direct PhysReg assignment, eviction ----
+  // A regunit is unusable for a vreg when either the LIU overlap check or the precolored-neighbour check fires
+  auto is_ru_blocked = [&](int ru, int vreg) -> bool {
+    // 1. LIU check: does any referenced block range of vreg (ctx.vr) overlap what is recorded on this regunit?
+    auto liu_it = ctx.regunit_liu.find(ru);
+    if (liu_it != ctx.regunit_liu.end()) {
+      auto vr_it = ctx.vr.find(vreg);
+      if (vr_it != ctx.vr.end()) {
+        for (auto &[block, r] : vr_it->second) {
+          if (!r.has_ref) continue;
+          auto bi = ctx.block_index.find(block);
+          if (bi == ctx.block_index.end()) continue;
+          if (liu_it->second.Overlaps(bi->second, r.first, r.last + 1))
+            return true;
+        }
+      }
+    }
+    // 2. Precolored (phantom) neighbours of vreg: each one pins regunit -(id + 1)
+    auto itg = ctx.interf_graph.find(vreg);
+    if (itg != ctx.interf_graph.end()) {
+      for (int nb : itg->second) {
+        if (nb >= 0) continue;  // phantom nodes only
+        if (-(nb + 1) == ru) return true;
+      }
+    }
+    return false;
+  };
+
+  for (const VRegPriority &entry : queue) {
+    const int v = entry.vreg;    // plain locals (not structured bindings) so the lambdas below may capture them in C++17
+    const int cost = entry.cost;
+    if (ctx.info[v].spilled) continue;  // already marked spilled, e.g. evicted by an earlier iteration
+    VRegClass vc = ctx.mf->GetVRegClass(v);
+    RegHint hint = ctx.info[v].reg_hint;
+    // Interference-graph neighbours of v: still needed below to collect the eviction candidates
+    auto it = ctx.interf_graph.find(v);
+
+    // Candidate lists: kCalleeOnly vregs get callee-saved units only; everything else tries caller-saved first
+    const auto &primary   = (hint == RegHint::kCalleeOnly) ? callee_rus : caller_rus;
+    static const std::vector<int> empty_rus;
+    const auto &secondary = (hint == RegHint::kCalleeOnly) ? empty_rus : callee_rus;
+
+    // Helpers for taking a free regunit (copy hint first)
+    // liu_add / liu_remove: record or drop the referenced block ranges of a vreg (from ctx.vr) on a regunit's LIU
+    auto liu_add = [&](int vreg, int ru) {
+      auto vr_it = ctx.vr.find(vreg);
+      if (vr_it == ctx.vr.end()) return;
+      for (auto &[block, r] : vr_it->second) {
+        if (!r.has_ref) continue;
+        auto bi_it = ctx.block_index.find(block);
+        if (bi_it != ctx.block_index.end())
+          ctx.regunit_liu[ru].Add(bi_it->second, r.first, r.last + 1, vreg);
+      }
+    };
+    auto liu_remove = [&](int vreg, int ru) {
+      auto liu_it = ctx.regunit_liu.find(ru);
+      if (liu_it != ctx.regunit_liu.end())
+        liu_it->second.Remove(vreg);
+    };
+    auto try_assign = [&](const std::vector<int> &ru_list) -> bool {
+      int hint_ru = get_copy_hint(v);
+      const bool hint_usable = hint_ru >= 0 && all_rus.count(hint_ru) != 0 && (hint != RegHint::kCalleeOnly || std::find(ru_list.begin(), ru_list.end(), hint_ru) != ru_list.end());
+      if (hint_usable && !is_ru_blocked(hint_ru, v)) {  // the hint must be a candidate unit and must respect kCalleeOnly
+        PhysReg reg = find_reg(hint_ru, vc);
+        if (Compat(reg, vc)) {
+          assigned_ru[v] = hint_ru;
+          ctx.info[v].phys_reg = reg;
+          ctx.info[v].spilled = false;
+          liu_add(v, hint_ru);
+          return true;
+        }
+      }
+      // Otherwise take the first usable regunit of the list
+      for (int ru : ru_list) {
+        if (is_ru_blocked(ru, v)) continue;
+        PhysReg reg = find_reg(ru, vc);
+        if (!Compat(reg, vc)) continue;
+        assigned_ru[v] = ru;
+        ctx.info[v].phys_reg = reg;
+        ctx.info[v].spilled = false;
+        liu_add(v, ru);
+        return true;
+      }
+      return false;
+    };
+
+    if (try_assign(primary)) continue;
+    if (hint != RegHint::kCalleeOnly && try_assign(secondary)) continue;
+
+    // ---- Eviction: free a regunit by spilling the cheapest set of interfering neighbours ----
+    // Strategy (modelled on the eviction step of LLVM's RegAllocGreedy):
+    //   1. For each blocked regunit, collect the interference neighbours of v that are assigned to it
+    //   2. Skip the regunit if one of them costs strictly more than v, or if a precolored neighbour owns it
+    //   3. Otherwise the eviction cost is the sum of those neighbours' costs
+    //   4. Keep the regunit with the smallest (hint-discounted) cost
+    //   5. Evict only if the undiscounted total is strictly below v's own cost
+    int best_ru = -1;
+    double best_evict_cost = 1e18;
+    double best_raw_cost = 1e18;  // cost without the copy-hint discount, used by the benefit check
+    std::vector<int> best_evictees;
+    int hint_ru = get_copy_hint(v);
+
+    // Regunits that may be considered for eviction
+    std::vector<int> evict_candidates;
+    if (hint == RegHint::kCalleeOnly) evict_candidates = callee_rus;
+    else { evict_candidates.assign(all_rus.begin(), all_rus.end()); }
+
+    for (int ru : evict_candidates) {
+      if (!is_ru_blocked(ru, v)) continue; // not blocked: there is nothing to evict on this unit
+
+      // Evictees come from the interference graph; the LIU is updated when the eviction is carried out
+      std::vector<int> evictees;
+      double total_cost = 0;
+      bool unevictable = false;
+
+      if (it != ctx.interf_graph.end()) {
+        for (int nb : it->second) {
+          if (nb < 0) { if (-(nb + 1) == ru) { unevictable = true; break; } continue; }  // a precolored owner of ru cannot be evicted
+          auto ait = assigned_ru.find(nb);
+          if (ait != assigned_ru.end() && ait->second == ru) {
+            int nb_cost = ctx.costs.count(nb) ? ctx.costs.at(nb) : 1;
+            // Only a strictly more expensive neighbour protects the unit; equal cost may be evicted
+            if (nb_cost > cost) { unevictable = true; break; }
+            evictees.push_back(nb);
+            total_cost += nb_cost;
+          }
+        }
+      }
+
+      if (unevictable || evictees.empty()) continue;
+      // Eviction cost = sum of the neighbours' costs, with no extra penalty for their number
+      double effective_cost = total_cost;
+
+      // Copy-hint bias when this unit is the hinted one:
+      // - a phi partner already sits on it: 0.25x (strongest preference)
+      // - ordinary copy hint: 0.5x
+      if (ru == hint_ru) {
+        bool is_phi = false;
+        if (phi_pairs) {
+          auto pit = phi_pairs->find(v);
+          if (pit != phi_pairs->end()) {
+            for (int p : pit->second) {
+              auto ait = assigned_ru.find(p);
+              if (ait != assigned_ru.end() && ait->second == ru) {
+                is_phi = true; break;
+              }
+            }
+          }
+        }
+        effective_cost *= is_phi ? 0.25 : 0.5;
+      }
+
+      if (effective_cost < best_evict_cost) {
+        best_evict_cost = effective_cost;
+        best_raw_cost = total_cost;  // undiscounted cost for the benefit check
+        best_ru = ru;
+        best_evictees = std::move(evictees);
+      }
+    }
+
+    // Benefit check: evict only when the neighbours' total cost is strictly lower than v's own cost,
+    // so a cheap vreg never pushes out a more valuable set of neighbours
+    if (best_ru >= 0 && best_raw_cost >= (double)cost) {
+      best_ru = -1;
+      best_evictees.clear();
+    }
+
+    if (best_ru >= 0) {
+      // Evict the neighbours: drop them from the LIU and mark them spilled
+      for (int ev : best_evictees) {
+        auto ev_ait = assigned_ru.find(ev);
+        if (ev_ait != assigned_ru.end()) liu_remove(ev, ev_ait->second);
+        assigned_ru.erase(ev);
+        ctx.info[ev].spilled = true;
+      }
+      // Give the freed regunit to v and record its range in the LIU
+      assigned_ru[v] = best_ru;
+      ctx.info[v].phys_reg = find_reg(best_ru, vc);
+      ctx.info[v].spilled = false;
+      liu_add(v, best_ru);
+      continue;
+    }
+
+    // Neither a free unit nor a worthwhile eviction: spill v
+    ctx.info[v].spilled = true;
+  }
+}
+
+// ============================================================================
+// 7. Instruction rewriting: reuses the previously validated RewriteWithAllocation logic
+// ============================================================================
+
+static PhysReg NumberToPhysReg(int num, VRegClass vc) {
+  // Register `num` of the bank selected by `vc`: S for Float, Q for Vec, X for Ptr, W for every other class.
+  const PhysReg bank_base = (vc == VRegClass::Float) ? PhysReg::S0 : (vc == VRegClass::Vec) ? PhysReg::Q0
+                          : (vc == VRegClass::Ptr)   ? PhysReg::X0 : PhysReg::W0;
+  return static_cast<PhysReg>(static_cast<int>(bank_base) + num);
+}
+
+static bool IsGPReg(PhysReg r) { const bool is_w = PhysReg::W0 <= r && r <= PhysReg::W30; return is_w || (PhysReg::X0 <= r && r <= PhysReg::X30); }  // W0..W30 or X0..X30
+static bool IsFPReg(PhysReg r) { return PhysReg::S0 <= r && r <= PhysReg::S31; }  // S0..S31
+static bool IsVecReg(PhysReg r){ return PhysReg::Q0 <= r && r <= PhysReg::Q31; }  // Q0..Q31
+
+static const int GP_ALLOCATABLE[] = {8,9,10,11,12,13,14,15,19,20,21,22,23,24,25,26,27,28};
+static const int GP_NUM_ALLOCATABLE = 18;
+
+static int PickGPScratchReg(const std::set<int> &used,
+    const MachineRegisterInfo::InstDefUse &du,
+    const std::unordered_map<int,int> &gp_assign, int skip=-1) {
+  // Picks the GP register number that serves as scratch for one instruction. Preference order is 17, 16, 15 ... 8
+  // (lower numbers only matter when several spilled operands need a scratch at once), then the GP_ALLOCATABLE list.
+  static const int kPreferred[] = {17,16,15,14,13,12,11,10,9,8};
+  // True when gp_assign maps `vreg` to register number `reg`.
+  auto mapped_to = [&](int vreg, int reg) { auto hit = gp_assign.find(vreg); return hit != gp_assign.end() && hit->second == reg; };
+  // True when `reg` is already reserved in `used` or assigned to an operand of the instruction (the def `skip` is ignored).
+  auto busy = [&](int reg) {
+    if (used.count(reg)) return true;
+    for (int d : du.defs) if (d != skip && mapped_to(d, reg)) return true;
+    for (int u : du.uses) if (mapped_to(u, reg)) return true;
+    return false; };
+  for (int reg : kPreferred) if (!busy(reg)) return reg;
+  for (int reg : GP_ALLOCATABLE) if (!busy(reg)) return reg;
+  return GP_ALLOCATABLE[0];  // everything is busy: last-resort answer
+}
+
+static int PickFPScratchReg(const std::set<int> &used,
+    const MachineRegisterInfo::InstDefUse &du,
+    const std::unordered_map<int,int> &fp_assign, int skip=-1) {
+  // Lowest number in 8..31 that is neither in `used` nor assigned by fp_assign to a def (other than `skip`) or a use of `du`.
+  auto mapped_to = [&](int vreg, int reg) { auto hit = fp_assign.find(vreg); return hit != fp_assign.end() && hit->second == reg; };
+  auto taken = [&](int reg) { for (int d : du.defs) if (d != skip && mapped_to(d, reg)) return true;
+                              for (int u : du.uses) if (mapped_to(u, reg)) return true; return false; };
+  for (int reg = 8; reg <= 31; ++reg) if (!used.count(reg) && !taken(reg)) return reg;
+  return 8;  // every number is busy: last-resort answer
+}
+
+static int PickVecScratchReg(const std::set<int> &used,
+    const MachineRegisterInfo::InstDefUse &du,
+    const std::unordered_map<int,int> &vec_assign, int skip=-1) {
+  // Picks the vector register number used as scratch for one instruction: 16..31 are tried first, then 0..15.
+  // A number is rejected when it is in `used` or when vec_assign maps a def (other than `skip`) or a use of `du` to it.
+  auto mapped_to = [&](int vreg, int reg) { auto hit = vec_assign.find(vreg); return hit != vec_assign.end() && hit->second == reg; };
+  auto busy = [&](int reg) {
+    if (used.count(reg)) return true;
+    for (int d : du.defs) if (d != skip && mapped_to(d, reg)) return true;
+    for (int u : du.uses) if (mapped_to(u, reg)) return true;
+    return false; };
+  for (int reg = 16; reg <= 31; ++reg) if (!busy(reg)) return reg;
+  for (int reg = 0; reg <= 15; ++reg) if (!busy(reg)) return reg;
+  return 16;  // everything is busy: last-resort answer
+}
+
+// Rewrites every virtual-register operand of `function` to a physical register and inserts the spill code.
+//   gp_assign / fp_assign / vec_assign : vreg id -> register number inside its bank (see NumberToPhysReg)
+//   spilled : vregs without a register; uses are reloaded into scratch registers, defs are stored to a stack slot
+//   ctx     : not referenced by this function
+// A spilled vreg whose single definition is a rematerializable MovImm gets no slot: the MovImm is re-emitted at its uses.
+static void RewriteWithAllocation(MachineFunction &function,
+    const std::unordered_map<int,int> &gp_assign,
+    const std::unordered_map<int,int> &fp_assign,
+    const std::unordered_map<int,int> &vec_assign,
+    const std::set<int> &spilled, FuncCtx &ctx) {
+  // Remat analysis (dc = defs per vreg, ld = last defining instruction). Only MovImm qualifies: it is the only opcode re-emitted below.
+  struct RematInfo { Opcode opcode; int imm; VRegClass vreg_class; };
+  std::unordered_map<int,RematInfo> remat;
+  { std::unordered_map<int,int> dc; std::unordered_map<int,MachineInstr*> ld;
+    for(auto &b:function.GetBlocks()) for(auto &inst:b->GetInstructions()){
+      auto du=MachineRegisterInfo::GetInstDefUse(inst);for(int d:du.defs){dc[d]++;ld[d]=&inst;}}
+    for(auto &[v,c]:dc) if(c==1&&spilled.count(v)&&ld[v]&&ld[v]->IsRematerializable()&&ld[v]->GetOpcode()==Opcode::MovImm)
+      remat[v]={ld[v]->GetOpcode(),ld[v]->GetRematImm(),function.GetVRegClass(v)};
+  }
+
+  std::unordered_map<int,int> slots;  // spilled, non-rematerialized vreg -> frame index (4 bytes, 8 for Ptr, 16 for Vec)
+  for(int v:spilled){if(remat.count(v))continue;
+    int sz=4;auto vc=function.GetVRegClass(v);if(vc==VRegClass::Ptr)sz=8;else if(vc==VRegClass::Vec)sz=16;
+    slots[v]=function.CreateFrameIndex(sz);
+  }
+
+  // Identity of the hardware register behind a PhysReg: Wn/Xn -> n, Sn/Qn -> 100+n (the two names of a pair alias each other).
+  auto hw_id=[](PhysReg r){const int v=static_cast<int>(r);
+    if(r>=PhysReg::W0&&r<=PhysReg::W30) return v-static_cast<int>(PhysReg::W0);
+    if(r>=PhysReg::X0&&r<=PhysReg::X30) return v-static_cast<int>(PhysReg::X0);
+    if(IsFPReg(r)) return 100+v-static_cast<int>(PhysReg::S0);
+    return 100+v-static_cast<int>(PhysReg::Q0);};
+
+  for(auto &block:function.GetBlocks()){
+    // Per-block reload cache: key -> register currently holding that value (saves repeated LoadStack / MovImm).
+    // Keys: 2*frame_index for a spill slot, 2*vreg+1 for a rematerialized vreg, so the two kinds never collide.
+    std::unordered_map<int, PhysReg> slot_cache;
+    auto forget_reg=[&](PhysReg r){for(auto it=slot_cache.begin();it!=slot_cache.end();)
+      if(hw_id(it->second)==hw_id(r)) it=slot_cache.erase(it); else ++it;};  // drops what was cached in the register behind r
+    std::vector<MachineInstr> ni;
+    for(auto &inst:block->GetInstructions()){
+      auto du=MachineRegisterInfo::GetInstDefUse(inst);
+      std::set<int> usg,usf,usv;     // register numbers already claimed by this instruction (GP / FP / Vec bank)
+      std::vector<PhysReg> written;  // registers given to this instruction's defs
+      for(int u:du.uses){if(!spilled.count(u))continue;
+        auto vc=function.GetVRegClass(u);const bool is_remat=remat.count(u)!=0;int rn=-1;
+        // A rematerialized vreg owns no slot, so `slots` is consulted for the others only (never via operator[]).
+        const int slot_id=is_remat?-1:slots.at(u);const int cache_key=is_remat?2*u+1:2*slot_id;
+        std::set<int> &claimed=(vc==VRegClass::Float)?usf:(vc==VRegClass::Vec)?usv:usg;
+        PhysReg cached_reg = PhysReg::W0; bool use_cache = false;
+        auto sc_it = slot_cache.find(cache_key);  // is the value still in a register from an earlier reload/store?
+        if(sc_it != slot_cache.end()) {
+          cached_reg = sc_it->second;
+          rn = hw_id(cached_reg)%100;  // number inside the bank: the unit usg/usf/usv are expressed in
+          const bool conflict = (vc==VRegClass::Float||vc==VRegClass::Vec) ? (usf.count(rn)||usv.count(rn)) : (usg.count(rn)!=0);
+          if(!conflict) { use_cache = true; claimed.insert(rn); }  // claim it so a later reload here cannot overwrite it
+        }
+        if(!use_cache) {
+          if(vc==VRegClass::Float){rn=PickFPScratchReg(usf,du,fp_assign);usf.insert(rn);}
+          else if(vc==VRegClass::Vec){rn=PickVecScratchReg(usv,du,vec_assign);usv.insert(rn);}
+          else{rn=PickGPScratchReg(usg,du,gp_assign);usg.insert(rn);}
+          PhysReg rr=NumberToPhysReg(rn,vc);
+          if(is_remat) ni.push_back(MachineInstr(Opcode::MovImm,{Operand::Reg(rr),Operand::Imm(remat.at(u).imm)}));
+          else ni.push_back(MachineInstr(Opcode::LoadStack,{Operand::Reg(rr),Operand::FrameIndex(slot_id)}));
+          forget_reg(rr);  // whatever rr (or its alias) cached before is gone now
+          slot_cache[cache_key] = cached_reg = rr;
+        }
+        for(auto &op:inst.GetOperands()) if(op.GetKind()==Operand::Kind::VReg&&op.GetVRegId()==u) const_cast<Operand&>(op)=Operand::Reg(cached_reg);
+      }
+      for(auto &op:inst.GetOperands()){if(op.GetKind()!=Operand::Kind::VReg)continue;
+        int v=op.GetVRegId();auto avc=function.GetVRegClass(v);int rn=-1;
+        if(avc==VRegClass::Float){auto it=fp_assign.find(v);if(it!=fp_assign.end())rn=it->second;}
+        else if(avc==VRegClass::Vec){auto it=vec_assign.find(v);if(it!=vec_assign.end())rn=it->second;}
+        else{auto it=gp_assign.find(v);if(it!=gp_assign.end())rn=it->second;}
+        if(rn<0){if(!spilled.count(v))continue;  // neither assigned nor spilled: the operand is left untouched
+          if(avc==VRegClass::Float){rn=PickFPScratchReg(usf,du,fp_assign,v);usf.insert(rn);}
+          else if(avc==VRegClass::Vec){rn=PickVecScratchReg(usv,du,vec_assign,v);usv.insert(rn);}
+          else{rn=PickGPScratchReg(usg,du,gp_assign,v);usg.insert(rn);}
+        }
+        const PhysReg pr=NumberToPhysReg(rn,avc);bool is_def=false;for(int d:du.defs)if(d==v){is_def=true;break;}
+        if(is_def) written.push_back(pr);
+        const_cast<Operand&>(op)=Operand::Reg(pr);
+      }
+      const bool is_call=inst.GetOpcode()==Opcode::Call;  // read before inst is moved from
+      ni.push_back(std::move(const_cast<MachineInstr&>(inst)));
+      if(is_call) slot_cache.clear();  // after the call: also drops the entries created by the call's own reloads
+      for(PhysReg r:written) forget_reg(r);  // a register this instruction defines no longer holds a cached value
+      for(int d:du.defs){if(!spilled.count(d)||remat.count(d))continue;
+        auto vc=function.GetVRegClass(d);PhysReg sr=PhysReg::W0;
+        for(auto &op:ni.back().GetOperands()){if(op.GetKind()==Operand::Kind::Reg){PhysReg r=op.GetReg();
+          if((vc==VRegClass::Float&&IsFPReg(r))||(vc==VRegClass::Vec&&IsVecReg(r))||(vc!=VRegClass::Float&&vc!=VRegClass::Vec&&IsGPReg(r))){sr=r;break;}}}
+        const int slot=slots.at(d); forget_reg(sr);  // sr = first register operand of the def's bank
+        ni.push_back(MachineInstr(Opcode::StoreStack,{Operand::Reg(sr),Operand::FrameIndex(slot)}));
+        slot_cache[2*slot]=sr; // sr now holds the newest value of this slot
+      }
+    }
+    block->GetInstructions()=std::move(ni);
+  }
+
+  // Callee-saved bookkeeping: x19-x28 for Int/Ptr, numbers 16-31 for Float; Vec is never reported. NOTE(review): AAPCS64 preserves v8-v15 - confirm.
+  for(int v=0;v<function.GetNumVRegs();++v){if(spilled.count(v))continue;
+    int num=-1;auto vc=function.GetVRegClass(v);
+    if(vc==VRegClass::Float){auto it=fp_assign.find(v);if(it!=fp_assign.end())num=it->second;}
+    else if(vc==VRegClass::Vec){auto it=vec_assign.find(v);if(it!=vec_assign.end())num=it->second;}
+    else{auto it=gp_assign.find(v);if(it!=gp_assign.end())num=it->second;}
+    if(num<0)continue;
+    if((vc==VRegClass::Int||vc==VRegClass::Ptr)&&num>=19&&num<=28)
+      function.AddCalleeSavedReg(static_cast<PhysReg>(static_cast<int>(PhysReg::X0)+num));
+    else if(vc==VRegClass::Float&&num>=16&&num<=31)
+      function.AddCalleeSavedReg(NumberToPhysReg(num,vc));
+  }
+}
+
+// ============================================================================
+// 8. Main entry point
+// ============================================================================
+
+static void Allocate(MachineFunction &f) {
+  if (f.GetNumVRegs()==0) return;
+  FuncCtx ctx; ctx.mf=&f; ctx.leaf=IsLeafFunc(f);
+
+  // ---- Phi 连接跟踪 ----
+  // Phi 元数据已由 PhiElimination 在 SSA 销毁前收集并存储在 MachineFunction 上。
+  // phi_block_arg_block：用于 phi-aware Coalesce（排除 successor 块中的伪干涉）
+  // phi_pairs：在 AllocClass 中获得最强 copy hint（phi 两端同寄存器）
+  const auto &phi_block_arg_block = f.GetPhiBlockArgBlock();
+  const auto &phi_pairs = f.GetPhiPairs();
+
+  ctx.li.Compute(f);
+
+  // ---- Split-for-Coalesce：已禁用 ----
+  // 预分裂创建额外副本，在非 SSA MIR 中净增指令数。
+  // 正确方案：分配时 SplitKit（按需分裂，仅在分配失败时触发）。
+
+  // ---- Coalescing: 合并 copy-connected 的非干涉 vreg ----
+  if (Coalesce(f, ctx.li, &phi_block_arg_block)) {
+    ctx.li.Compute(f);
+  }
+
+  // ---- PHI copy 前向传播：消除单次使用的副本 vreg ----
+  // 对于 MovReg(dst, src)，若 src 仅此一次使用且 dst 仅此一次定义，
+  // 将 dst 的所有使用替换为 src → 减少 vreg 数量和 MOV 指令
+  {
+    int nv = f.GetNumVRegs();
+    std::vector<int> use_counts(nv, 0), def_counts(nv, 0);
+    for (auto &block : f.GetBlocks())
+      for (auto &inst : block->GetInstructions()) {
+        auto du = MachineRegisterInfo::GetInstDefUse(inst);
+        for (int u : du.uses) if (u >= 0 && u < nv) use_counts[u]++;
+        for (int d : du.defs) if (d >= 0 && d < nv) def_counts[d]++;
+      }
+
+    struct CopyPair { int dst; int src; };
+    std::vector<CopyPair> prop_copies;
+    for (auto &block : f.GetBlocks())
+      for (auto &inst : block->GetInstructions()) {
+        if (inst.GetOpcode() != Opcode::MovReg) continue;
+        auto &ops = inst.GetOperands();
+        if (ops.size() >= 2 &&
+            ops[0].GetKind() == Operand::Kind::VReg &&
+            ops[1].GetKind() == Operand::Kind::VReg) {
+          int dst = ops[0].GetVRegId();
+          int src = ops[1].GetVRegId();
+          if (dst >= 0 && src >= 0 && dst < nv && src < nv &&
+              use_counts[src] == 1 && def_counts[dst] == 1) {
+            prop_copies.push_back({dst, src});
+          }
+        }
+      }
+
+    if (!prop_copies.empty()) {
+      // 构建传递闭包替换映射：dst → ultimate_src
+      std::unordered_map<int, int> replace;
+      for (auto &[dst, src] : prop_copies) {
+        int ult = src;
+        while (replace.count(ult)) ult = replace[ult];
+        if (ult != dst) replace[dst] = ult;
+      }
+
+      // 应用替换 + 删除被传播的 MovReg
+      for (auto &block : f.GetBlocks()) {
+        std::vector<MachineInstr> ni;
+        for (auto &inst : block->GetInstructions()) {
+          if (inst.GetOpcode() == Opcode::MovReg) {
+            auto &ops = inst.GetOperands();
+            if (ops.size() >= 1 && ops[0].GetKind() == Operand::Kind::VReg &&
+                replace.count(ops[0].GetVRegId())) continue;
+          }
+          for (auto &op : inst.GetOperands()) {
+            if (op.GetKind() == Operand::Kind::VReg) {
+              auto it = replace.find(op.GetVRegId());
+              if (it != replace.end())
+                const_cast<Operand &>(op) = Operand::VReg(it->second, f.GetVRegClass(it->second));
+            }
+          }
+          ni.push_back(std::move(const_cast<MachineInstr &>(inst)));
+        }
+        block->GetInstructions() = std::move(ni);
+      }
+      ctx.li.Compute(f);
+    }
+  }
+
+  ctx.depths=LoopDepths(f);
+  ctx.info.resize(f.GetNumVRegs());
+  ComputeCosts(ctx);
+  ComputeCrossCallVRegs(ctx);
+  ComputeRegHints(ctx);     // per-vreg 寄存器类提示
+  BuildInterfGraph(ctx);
+
+  // ── LiveIntervalUnion 初始化 ──
+  // Block → index 映射
+  {
+    int idx = 0;
+    for (auto &b : f.GetBlocks())
+      ctx.block_index[b.get()] = idx++;
+  }
+  // 为每个候选 regunit 创建 LIU 实例
+  for (auto &r : ctx.gp_cands) {
+    int ru = ToRegUnit(r);
+    if (!ctx.regunit_liu.count(ru)) ctx.regunit_liu[ru] = LiveIntervalUnion();
+  }
+  for (auto &r : ctx.fp_cands) {
+    int ru = ToRegUnit(r);
+    if (!ctx.regunit_liu.count(ru)) ctx.regunit_liu[ru] = LiveIntervalUnion();
+  }
+  for (auto &r : ctx.vec_cands) {
+    int ru = ToRegUnit(r);
+    if (!ctx.regunit_liu.count(ru)) ctx.regunit_liu[ru] = LiveIntervalUnion();
+  }
+  // 预着色 phantom 节点预填 LIU（使用 ctx.vr——与干涉图完全相同的数据源）
+  for (auto &[node_id, edges] : ctx.interf_graph) {
+    if (node_id >= 0) continue;  // 仅 phantom 节点
+    int ru = -(node_id + 1);
+    for (int nb : edges) {
+      if (nb < 0) continue;
+      auto vr_it = ctx.vr.find(nb);
+      if (vr_it == ctx.vr.end()) continue;
+      for (auto &[block, r] : vr_it->second) {
+        if (!r.has_ref) continue;
+        auto bi = ctx.block_index.find(block);
+        if (bi == ctx.block_index.end()) continue;
+        ctx.regunit_liu[ru].Add(bi->second, r.first, r.last + 1, node_id);
+      }
+    }
+  }
+
+  // 候选寄存器：
+  // - 叶函数：LEAF_GP (23 regs, 含 x0-x7) —— 无 Call，x0-x7 无需保护
+  // - 非叶函数：EXT_GP (26 regs, 含 x0-x7) —— Call Clobber phantom + RegHint 保护
+  //   （递归函数同样使用 EXT_GP——phantom + RegHint 机制覆盖自递归场景）
+  const int *gn = ctx.leaf ? LEAF_GP_NUMS : EXT_GP_NUMS;
+  int gc = ctx.leaf ? LEAF_GP_COUNT : EXT_GP_COUNT;
+  for(int i=0;i<gc;++i){
+    ctx.gp_cands.push_back(static_cast<PhysReg>(static_cast<int>(PhysReg::W0)+gn[i]));
+    ctx.gp_cands.push_back(static_cast<PhysReg>(static_cast<int>(PhysReg::X0)+gn[i]));
+  }
+  for(int r:FP_NUMS)  ctx.fp_cands.push_back(static_cast<PhysReg>(static_cast<int>(PhysReg::S0)+r));
+  for(int r:VEC_NUMS) ctx.vec_cands.push_back(static_cast<PhysReg>(static_cast<int>(PhysReg::Q0)+r));
+
+  // 分组
+  int nv=f.GetNumVRegs();
+  std::vector<int> gp,fp,vec;
+  auto regroup=[&](){gp.clear();fp.clear();vec.clear();
+    for(int v=0;v<nv;++v){if(ctx.info[v].spilled)continue;
+      auto vc=f.GetVRegClass(v);
+      if(vc==VRegClass::Float)fp.push_back(v);else if(vc==VRegClass::Vec)vec.push_back(v);else gp.push_back(v);
+    }
+  };
+  regroup();
+
+  // 注：主动活范围分裂实验结果显示净负收益（-915 vs -948 基线）。
+  // 分裂创建的副本指令 > 减少的 spill 指令。保留代码以供将来优化。
+
+  std::unordered_map<int,int> slots;
+
+  const int MAX_ROUNDS=10;
+  for(int round=0;round<MAX_ROUNDS;++round){
+    AllocClass(gp,ctx.gp_cands,ctx,&phi_pairs); AllocClass(fp,ctx.fp_cands,ctx,&phi_pairs); AllocClass(vec,ctx.vec_cands,ctx,&phi_pairs);
+
+    // 收集 spilled
+    std::set<int> spilled;
+    for(int v=0;v<nv;++v) if(ctx.info[v].spilled) spilled.insert(v);
+    if(spilled.empty()) break;
+
+    // ---- SplitKit：spill 之前尝试分裂 ----
+    // 当前暂时禁用——在综合门禁验证稳定后再启用
+    // 分裂策略已验证正确性（结构完整），但 use 替换的跨块语义需要更细致的 CFG 分析
+    const bool ENABLE_SPLITKIT = false;
+    if (ENABLE_SPLITKIT) {
+      int split_count = 0;
+      // LLVM 风格：对全部溢出 vreg 尝试分裂（不设上限），优先级排序
+      std::vector<int> spilled_list(spilled.begin(), spilled.end());
+      std::sort(spilled_list.begin(), spilled_list.end(), [&](int a, int b) {
+        int ca = ctx.costs.count(a) ? ctx.costs.at(a) : 1;
+        int cb = ctx.costs.count(b) ? ctx.costs.at(b) : 1;
+        return ca > cb;
+      });
+      for (int v : spilled_list) {
+        if (ctx.info[v].remat) continue;
+        // 策略 1: 循环边界分裂
+        int hot_vreg = SplitVRegAtLoopBoundary(ctx, v);
+        // 策略 2: 调用边界分裂（循环分裂失败时尝试，对齐 LLVM 多策略）
+        if (hot_vreg < 0)
+          hot_vreg = SplitVRegAtCallBoundary(ctx, v);
+        if (hot_vreg >= 0) {
+          split_count++;
+          ctx.info[v].spilled = false;
+          ctx.info.resize(ctx.mf->GetNumVRegs());
+        }
+      }
+
+      if (split_count > 0) {
+        spilled.clear();
+        for(int v=0;v<nv;++v) if(ctx.info[v].spilled) spilled.insert(v);
+      }
+      if(spilled.empty()) {
+        nv = f.GetNumVRegs();
+        ctx.li.Compute(f); ctx.depths=LoopDepths(f);
+        ctx.info.resize(nv); ComputeCosts(ctx);
+        ctx.cross_call_vregs.clear(); ComputeCrossCallVRegs(ctx);
+        ComputeRegHints(ctx);
+        ctx.interf_graph.clear(); BuildInterfGraph(ctx); RebuildLIU(ctx);
+        regroup();
+        continue;
+      }
+    }
+    // ---- SplitKit 结束 ----
+
+    // 创建槽位
+    for(int v:spilled){if(ctx.info[v].remat||slots.count(v))continue;
+      int sz=4;auto vc=f.GetVRegClass(v);if(vc==VRegClass::Ptr)sz=8;else if(vc==VRegClass::Vec)sz=16;
+      slots[v]=f.CreateFrameIndex(sz);
+    }
+
+    // Spill 重写：为新 vreg 创建 Load/Store
+    for(auto &block:f.GetBlocks()){std::vector<MachineInstr> ni;
+      for(auto &inst:block->GetInstructions()){auto du=MachineRegisterInfo::GetInstDefUse(inst);
+        for(int u:du.uses){if(!spilled.count(u))continue;
+          auto vc=f.GetVRegClass(u);int nv2=f.CreateVReg(vc);
+          if(!ctx.info[u].remat) ni.push_back(MachineInstr(Opcode::LoadStack,{Operand::VReg(nv2,vc),Operand::FrameIndex(slots[u])}));
+          else ni.push_back(MachineInstr(Opcode::MovImm,{Operand::VReg(nv2,vc),Operand::Imm(ctx.info[u].remat_imm)}));
+          for(auto &op:inst.GetOperands()) if(op.GetKind()==Operand::Kind::VReg&&op.GetVRegId()==u) const_cast<Operand&>(op)=Operand::VReg(nv2,vc);
+        }
+        ni.push_back(std::move(const_cast<MachineInstr&>(inst)));
+        for(int d:du.defs){if(!spilled.count(d)||ctx.info[d].remat)continue;
+          auto vc=f.GetVRegClass(d);int nv2=f.CreateVReg(vc);
+          for(auto &op:ni.back().GetOperands()) if(op.GetKind()==Operand::Kind::VReg&&op.GetVRegId()==d) const_cast<Operand&>(op)=Operand::VReg(nv2,vc);
+          ni.push_back(MachineInstr(Opcode::StoreStack,{Operand::VReg(nv2,vc),Operand::FrameIndex(slots[d])}));
+        }
+      }
+      block->GetInstructions()=std::move(ni);
+    }
+
+    int old_nv=nv; nv=f.GetNumVRegs();
+    ctx.li.Compute(f); ctx.depths=LoopDepths(f);
+    ctx.info.resize(nv); ComputeCosts(ctx);
+    for(int v=old_nv;v<nv;++v) ctx.costs[v]=1000000;
+    ctx.cross_call_vregs.clear(); ComputeCrossCallVRegs(ctx);
+    ComputeRegHints(ctx);
+    ctx.interf_graph.clear(); BuildInterfGraph(ctx); RebuildLIU(ctx);
+    regroup();
+  }
+
+  // 分配后冲突检测+修复：Interfere 验证同 regunit 的 vreg 对
+  // 防止 Briggs 着色的 spurious non-interference
+  for (int attempt = 0; attempt < 2; ++attempt) {
+    std::set<int> conflicts;
+    for (int a = 0; a < nv; ++a) {
+      if (ctx.info[a].spilled) continue;
+      int ra = ToRegUnit(ctx.info[a].phys_reg);
+      for (int b = a + 1; b < nv; ++b) {
+        if (ctx.info[b].spilled) continue;
+        if (ToRegUnit(ctx.info[b].phys_reg) != ra) continue;
+        if (ctx.li.Interfere(a, b)) {
+          int ca = ctx.costs.count(a) ? ctx.costs.at(a) : 1;
+          int cb = ctx.costs.count(b) ? ctx.costs.at(b) : 1;
+          conflicts.insert(ca < cb ? a : b);
+        }
+      }
+    }
+    if (conflicts.empty()) break;
+    for (int v : conflicts) ctx.info[v].spilled = true;
+  }
+
+  // 转换为 reg_number 格式
+  std::unordered_map<int,int> gp_assign, fp_assign, vec_assign;
+  std::set<int> spilled_set;
+  auto get_assign=[&](int v)->int{
+    if(gp_assign.count(v))return gp_assign.at(v);
+    if(fp_assign.count(v))return fp_assign.at(v);
+    if(vec_assign.count(v))return vec_assign.at(v);
+    return -1;
+  };
+  for(int v=0;v<nv;++v){
+    if(ctx.info[v].spilled){spilled_set.insert(v);continue;}
+    int num=RegNum(ctx.info[v].phys_reg);auto vc=f.GetVRegClass(v);
+    if(vc==VRegClass::Float) fp_assign[v]=num;
+    else if(vc==VRegClass::Vec) vec_assign[v]=num;
+    else gp_assign[v]=num;
+  }
+
+  // ---- 分配后冗余 MovReg 消除 ----
+  // 若 MovReg(vreg, vreg) 两端映射到同一 PhysReg，该拷贝为死代码
+  // 这主要消除 PhiElimination 产生的 phi 连接拷贝——当 phi 两端被分配
+  // 到同一寄存器时，中间的 MovReg 是冗余的。
+  for (auto &block : f.GetBlocks()) {
+    std::vector<MachineInstr> ni;
+    for (auto &inst : block->GetInstructions()) {
+      if (inst.GetOpcode() == Opcode::MovReg) {
+        auto &ops = inst.GetOperands();
+        if (ops.size() >= 2 &&
+            ops[0].GetKind() == Operand::Kind::VReg &&
+            ops[1].GetKind() == Operand::Kind::VReg) {
+          int rd = get_assign(ops[0].GetVRegId());
+          int rs = get_assign(ops[1].GetVRegId());
+          if (rd >= 0 && rd == rs && !spilled_set.count(ops[0].GetVRegId())
+              && !spilled_set.count(ops[1].GetVRegId()))
+            continue; // 死 MovReg → 丢弃
+        }
+      }
+      ni.push_back(std::move(const_cast<MachineInstr &>(inst)));
+    }
+    block->GetInstructions() = std::move(ni);
+  }
+
+  RewriteWithAllocation(f, gp_assign, fp_assign, vec_assign, spilled_set, ctx);
+}
+
+} // anonymous namespace
+
+void RunGreedyRegAlloc(MachineFunction &function) { Allocate(function); }  // runs Allocate() on a single function
+void RunGreedyRegAlloc(MachineModule &module) { for(auto &func:module.GetFunctions()) Allocate(*func); }  // runs Allocate() on each function of the module, in GetFunctions() order
+
+} // namespace mir
diff --git a/src/mir/LiveIntervals.cpp b/src/mir/LiveIntervals.cpp
new file mode 100644
index 00000000..2b9f3d6d
--- /dev/null
+++ b/src/mir/LiveIntervals.cpp
@@ -0,0 +1,719 @@
+#include "mir/LiveIntervals.h"
+#include "mir/MachineRegisterInfo.h"
+
+#include <algorithm>
+
+namespace mir {
+
+// Recomputes all liveness data for `mf` from scratch: slot numbering, per-block def/use and
+// live-in/live-out sets, per-block intervals, def/use extents and the global segment lists.
+void LiveIntervals::Compute(MachineFunction &mf) {
+  num_vregs_ = mf.GetNumVRegs();
+  intervals_.clear();
+  live_blocks_.clear();
+  block_to_idx_.clear();
+  block_def_use_.clear();
+  segments_.clear();
+  inst_to_slot_.clear();
+  slot_to_inst_.clear();
+  block_live_.clear();  // otherwise def/use/live sets from an earlier Compute() would leak into this run
+
+  auto &blocks = mf.GetBlocks();
+  const size_t num_blocks = blocks.size();
+
+  // ---- Global instruction numbering: one slot per instruction, in block order ----
+  block_start_slots_.resize(num_blocks);
+  block_end_slots_.resize(num_blocks);
+  int global_slot = 0;
+  for (size_t i = 0; i < num_blocks; ++i) {
+    block_start_slots_[i] = global_slot;
+    for (const auto &inst : blocks[i]->GetInstructions()) {
+      SlotIndex si{global_slot};
+      inst_to_slot_[&inst] = si;
+      if (global_slot >= static_cast<int>(slot_to_inst_.size()))
+        slot_to_inst_.resize(global_slot + 1);
+      slot_to_inst_[global_slot] = &inst;
+      global_slot++;
+    }
+    block_end_slots_[i] = global_slot;
+  }
+  total_slots_ = global_slot;
+
+  // Block -> index map
+  for (size_t i = 0; i < num_blocks; ++i)
+    block_to_idx_[blocks[i].get()] = static_cast<int>(i);
+
+  // Label -> block-index map (used to follow branch targets)
+  std::unordered_map<int, int> label_to_block;
+  for (size_t i = 0; i < num_blocks; ++i)
+    label_to_block[blocks[i]->GetLabelId()] = static_cast<int>(i);
+
+  // Step 1: per-block def/use sets from instruction operands (a use counts only if no earlier def in the block)
+  block_live_.resize(num_blocks);
+  for (size_t i = 0; i < num_blocks; ++i) {
+    auto &bl = block_live_[i];
+    for (const auto &inst : blocks[i]->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int u : du.uses)
+        if (bl.def.find(u) == bl.def.end())
+          bl.use.insert(u);
+      for (int d : du.defs)
+        bl.def.insert(d);
+    }
+  }
+
+  // Step 2: iterate live_in / live_out until nothing changes
+  bool changed = true;
+  while (changed) {
+    changed = false;
+    // Visit blocks in reverse order (speeds up convergence)
+    for (int i = static_cast<int>(num_blocks) - 1; i >= 0; --i) {
+      auto &bl = block_live_[i];
+
+      // live_out = union of live_in over the Br / CondBr label targets found in this block
+      std::unordered_set<int> new_out;  // NOTE(review): no fall-through edge is added here, unlike RecomputeVReg - confirm
+      for (const auto &inst : blocks[i]->GetInstructions()) {
+        if (inst.GetOpcode() == Opcode::Br && inst.GetOperands().size() >= 1 &&
+            inst.GetOperands()[0].GetKind() == Operand::Kind::Label) {
+          auto it = label_to_block.find(inst.GetOperands()[0].GetLabel());
+          if (it != label_to_block.end())
+            for (int v : block_live_[it->second].live_in)
+              new_out.insert(v);
+        }
+        if (inst.GetOpcode() == Opcode::CondBr && inst.GetOperands().size() >= 2 &&
+            inst.GetOperands()[1].GetKind() == Operand::Kind::Label) {
+          auto it = label_to_block.find(inst.GetOperands()[1].GetLabel());
+          if (it != label_to_block.end())
+            for (int v : block_live_[it->second].live_in)
+              new_out.insert(v);
+        }
+      }
+      if (new_out != bl.live_out) {
+        bl.live_out = std::move(new_out);
+        changed = true;
+      }
+
+      // live_in = use union (live_out minus def)
+      std::unordered_set<int> new_in = bl.use;
+      for (int v : bl.live_out)
+        if (bl.def.find(v) == bl.def.end())
+          new_in.insert(v);
+
+      if (new_in != bl.live_in) {
+        bl.live_in = std::move(new_in);
+        changed = true;
+      }
+    }
+  }
+
+  // Step 3: compute each vreg's interval inside every block (block-local instruction indices)
+  for (size_t bi = 0; bi < num_blocks; ++bi) {
+    auto &block = blocks[bi];
+    auto &bl = block_live_[bi];
+    const auto &insts = block->GetInstructions();
+    int num_insts = static_cast<int>(insts.size());
+
+    // Candidate vregs: live-in, defined or used in this block
+    std::unordered_set<int> relevant;
+    for (int v : bl.live_in) relevant.insert(v);
+    for (int v : bl.def) relevant.insert(v);
+    for (int v : bl.use) relevant.insert(v);
+
+    for (int vreg : relevant) {
+      if (vreg < 0 || vreg >= num_vregs_) continue;
+
+      // Interval bounds: from the first live instruction to the last one
+      int first = num_insts;  // sentinel: nothing found yet
+      int last = -1;
+
+      bool is_live_in = bl.live_in.count(vreg) > 0;
+      bool is_live_out = bl.live_out.count(vreg) > 0;
+      bool is_defined_here = bl.def.count(vreg) > 0;
+
+      if (!is_live_in && !is_defined_here) continue; // not live in this block
+
+      // A live-in vreg is live from the first instruction
+      if (is_live_in) first = 0;
+
+      // Scan the instructions for defs and the last use
+      for (int ii = 0; ii < num_insts; ++ii) {
+        auto du = MachineRegisterInfo::GetInstDefUse(insts[ii]);
+
+        bool defs_here = std::find(du.defs.begin(), du.defs.end(), vreg) != du.defs.end();
+        bool uses_here = std::find(du.uses.begin(), du.uses.end(), vreg) != du.uses.end();
+
+        if (defs_here && !is_live_in) {
+          // Local definition: the range starts at this instruction
+          first = std::min(first, ii);
+        }
+        if (uses_here) {
+          last = std::max(last, ii);
+        }
+        if (defs_here && is_live_in) {
+          // A live-in value is redefined here: strictly, the old range ends and a new one starts.
+          // Simplification: stretch a single interval over the whole span
+          last = std::max(last, ii);
+        }
+      }
+
+      if (is_live_out) last = num_insts - 1;
+
+      if (first <= last && last >= 0) {
+        Seg seg{first, last + 1}; // [first, last+1)
+        intervals_[vreg][block.get()] = seg;
+        live_blocks_[vreg].insert(block.get());
+      }
+    }
+  }
+
+  // ---- Build block_def_use_ (independent post-pass; does not modify intervals_) ----
+  // Scan the instructions to record the first reference (first_def) and last reference (last_use) positions
+  for (size_t bi = 0; bi < num_blocks; ++bi) {
+    auto &block = blocks[bi];
+    int ii = 0;
+    for (const auto &inst : block->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int u : du.uses) {
+        if (u < 0) continue;
+        auto &bdu = block_def_use_[u][block.get()];
+        if (!bdu.has_ref) { bdu.first_def = ii; bdu.has_ref = true; }
+        bdu.last_use = ii;
+      }
+      for (int d : du.defs) {
+        if (d < 0) continue;
+        auto &bdu = block_def_use_[d][block.get()];
+        if (!bdu.has_ref || ii < bdu.first_def) bdu.first_def = ii;
+        bdu.last_use = ii;
+        bdu.has_ref = true;
+      }
+      ii++;
+    }
+  }
+
+  // Extend the extents with live_in / live_out so live-through ranges are covered
+  for (size_t bi = 0; bi < num_blocks; ++bi) {
+    auto &block = blocks[bi];
+    auto &bl = block_live_[bi];
+    int num_insts = static_cast<int>(block->GetInstructions().size());
+
+    for (int vreg : bl.live_in) {
+      auto &bdu = block_def_use_[vreg][block.get()];
+      bdu.has_ref = true;
+      if (bdu.first_def < 0 || bdu.first_def > 0) bdu.first_def = 0;
+    }
+
+    for (int vreg : bl.live_out) {
+      auto &bdu = block_def_use_[vreg][block.get()];
+      bdu.has_ref = true;
+      if (bdu.last_use < 0 || bdu.last_use < num_insts - 1)
+        bdu.last_use = num_insts - 1;
+      if (bdu.first_def < 0) bdu.first_def = 0;
+    }
+  }
+
+  // Step 5: build the global segments
+  BuildGlobalSegments(mf);
+}
+
+bool LiveIntervals::IsLiveAfter(int vreg, MachineBasicBlock *block, int inst_idx) const {
+  const auto per_vreg = intervals_.find(vreg);
+  if (per_vreg == intervals_.end()) return false;  // no interval recorded for this vreg
+  const auto in_block = per_vreg->second.find(block);
+  if (in_block == per_vreg->second.end()) return false;  // vreg has no interval in this block
+  // Intervals are half-open [start, end): "live after inst_idx" means point inst_idx + 1 lies inside.
+  return in_block->second.start <= inst_idx + 1 && inst_idx + 1 < in_block->second.end;
+}
+
+bool LiveIntervals::Interfere(int a, int b) const {  // InterfereExcept with no instruction excluded
+  return InterfereExcept(a, b, nullptr);
+}
+
+bool LiveIntervals::InterfereExcept(int a, int b, const MachineInstr *exclude) const {
+  // Wrap the optional single instruction in a set and defer to the set-based overload.
+  std::unordered_set<const MachineInstr *> excluded;
+  if (exclude) excluded.insert(exclude);
+  return InterfereExcept(a, b, excluded);
+}
+
+bool LiveIntervals::InterfereExcept(int a, int b,
+                                     const std::unordered_set<const MachineInstr *> &exclude) const {
+  // A vreg never interferes with itself, and a vreg without intervals interferes with nothing.
+  if (a == b) return false;
+  const auto found_a = intervals_.find(a);
+  const auto found_b = intervals_.find(b);
+  if (found_a == intervals_.end() || found_b == intervals_.end()) return false;
+  const auto &per_block_b = found_b->second;
+  for (const auto &[block, seg_a] : found_a->second) {
+    const auto match = per_block_b.find(block);
+    if (match == per_block_b.end()) continue;  // b has no interval in this block
+
+    // Intersect the two half-open intervals; an empty intersection means no overlap here.
+    const auto &seg_b = match->second;
+    const int lo = std::max(seg_a.start, seg_b.start);
+    const int hi = std::min(seg_a.end, seg_b.end);
+    if (lo >= hi) continue;
+
+    // Interference needs at least one in-range overlap point that is not an excluded instruction.
+    const auto &insts = block->GetInstructions();
+    const int count = static_cast<int>(insts.size());
+    for (int pos = lo; pos < hi; ++pos) {
+      if (pos < 0 || pos >= count) continue;
+      if (exclude.find(&insts[pos]) == exclude.end()) return true;
+    }
+  }
+  return false;
+}
+
+// ---- Added: instruction-level interference checks (based on block_def_use_) ----
+
+bool LiveIntervals::InterferePrecise(int a, int b) const {  // InterferePreciseExcept with no instruction excluded
+  return InterferePreciseExcept(a, b, nullptr);
+}
+
+bool LiveIntervals::InterferePreciseExcept(int a, int b, const MachineInstr *exclude) const {  // true if a and b overlap in some shared block
+  if (a == b) return false;
+
+  auto it_a = block_def_use_.find(a);
+  auto it_b = block_def_use_.find(b);
+  if (it_a == block_def_use_.end() || it_b == block_def_use_.end()) return false;
+
+  for (const auto &[block, bdu_a] : it_a->second) {
+    auto bit = it_b->second.find(block);
+    if (bit == it_b->second.end()) continue;
+
+    const auto &bdu_b = bit->second;
+    if (!bdu_a.has_ref || !bdu_b.has_ref) continue;
+
+    int a_first = bdu_a.first_def >= 0 ? bdu_a.first_def : 0;  // negative positions are clamped to 0
+    int a_last  = bdu_a.last_use >= 0  ? bdu_a.last_use  : 0;
+    int b_first = bdu_b.first_def >= 0 ? bdu_b.first_def : 0;
+    int b_last  = bdu_b.last_use >= 0  ? bdu_b.last_use  : 0;
+
+    // Overlap test on the inclusive ranges [first, last]
+    if (!(a_first <= b_last && b_first <= a_last)) continue;
+
+    // If the excluded instruction is the only in-range point of overlap, this block does not count
+    if (exclude) {
+      int overlap_start = std::max(a_first, b_first);
+      int overlap_end = std::min(a_last, b_last);
+      bool has_other_overlap = false;
+      const auto &insts = block->GetInstructions();
+      for (int ii = overlap_start; ii <= overlap_end; ++ii) {
+        if (ii >= 0 && ii < (int)insts.size() && &insts[ii] != exclude) {
+          has_other_overlap = true;
+          break;
+        }
+      }
+      if (!has_other_overlap) continue; // overlap only at the excluded instruction -> no interference here
+    }
+
+    return true;
+  }
+  return false;
+}
+
+bool LiveIntervals::InterfereExceptBlock(int a, int b, MachineBasicBlock *exclude_block) const {  // overlap check that ignores one block
+  if (a == b) return false;
+
+  auto it_a = block_def_use_.find(a);
+  auto it_b = block_def_use_.find(b);
+  if (it_a == block_def_use_.end() || it_b == block_def_use_.end()) return false;
+
+  for (const auto &[block, bdu_a] : it_a->second) {
+    if (block == exclude_block) continue; // skip the excluded block
+
+    auto bit = it_b->second.find(block);
+    if (bit == it_b->second.end()) continue;
+
+    const auto &bdu_b = bit->second;
+    if (!bdu_a.has_ref || !bdu_b.has_ref) continue;
+
+    int a_first = bdu_a.first_def >= 0 ? bdu_a.first_def : 0;  // negative positions are clamped to 0
+    int a_last  = bdu_a.last_use >= 0  ? bdu_a.last_use  : 0;
+    int b_first = bdu_b.first_def >= 0 ? bdu_b.first_def : 0;
+    int b_last  = bdu_b.last_use >= 0  ? bdu_b.last_use  : 0;
+
+    // Overlap test on the inclusive ranges [first, last]
+    if (a_first <= b_last && b_first <= a_last) return true;
+  }
+  return false;
+}
+
+int LiveIntervals::GetLastUseInBlock(int vreg, int block_idx) const {
+  const auto entry = block_def_use_.find(vreg);
+  if (entry == block_def_use_.end()) return -1;  // vreg has no recorded references
+  for (const auto &kv : entry->second) {  // look for the record whose block maps to block_idx
+    const auto idx = block_to_idx_.find(kv.first);
+    if (idx == block_to_idx_.end() || idx->second != block_idx) continue;
+    return kv.second.last_use;
+  }
+  return -1;
+}
+
+int LiveIntervals::GetFirstDefInBlock(int vreg, int block_idx) const {
+  const auto entry = block_def_use_.find(vreg);
+  if (entry == block_def_use_.end()) return -1;  // vreg has no recorded references
+  for (const auto &kv : entry->second) {  // look for the record whose block maps to block_idx
+    const auto idx = block_to_idx_.find(kv.first);
+    if (idx == block_to_idx_.end() || idx->second != block_idx) continue;
+    return kv.second.first_def;
+  }
+  return -1;
+}
+
+// ============================================================================
+// Global segment construction + segment-based interference checks
+// ============================================================================
+
+void LiveIntervals::BuildGlobalSegments(MachineFunction &mf) {  // rebuilds segments_ for every vreg from block_def_use_
+  segments_.clear();
+  auto &blocks = mf.GetBlocks();
+  size_t num_blocks = blocks.size();
+
+  for (int vreg = 0; vreg < num_vregs_; ++vreg) {
+    auto bdu_it = block_def_use_.find(vreg);
+    if (bdu_it == block_def_use_.end()) continue;
+
+    // Collected per block, then sorted by block index so segments come out in global slot order
+    struct BlockSeg {
+      int block_idx;
+      int global_first;
+      int global_last;
+    };
+    std::vector<BlockSeg> block_segs;
+
+    for (auto &[block, bdu] : bdu_it->second) {
+      if (!bdu.has_ref) continue;
+      auto bit = block_to_idx_.find(block);
+      if (bit == block_to_idx_.end()) continue;
+      int bi = bit->second;
+
+      int first = bdu.first_def >= 0 ? bdu.first_def : 0;  // negative first_def falls back to the block start
+      int last  = bdu.last_use >= 0  ? bdu.last_use
+                  : (block_end_slots_[bi] - block_start_slots_[bi] - 1);  // negative last_use falls back to the block's last instruction
+
+      int gf = block_start_slots_[bi] + first;
+      int gl = block_start_slots_[bi] + last + 1;  // exclusive end: the global range is [gf, gl)
+
+      if (gf < gl && gl <= total_slots_)
+        block_segs.push_back({bi, gf, gl});
+    }
+
+    // Sort by block_idx (i.e. global slot order)
+    std::sort(block_segs.begin(), block_segs.end(),
+              [](const BlockSeg &a, const BlockSeg &b) { return a.block_idx < b.block_idx; });
+
+    // Build the segments, merging overlapping or adjacent ones
+    std::vector<LiveSegment> merged;
+    for (auto &bs : block_segs) {
+      if (!merged.empty() && merged.back().end >= bs.global_first)
+        merged.back().end = std::max(merged.back().end, bs.global_last);
+      else
+        merged.push_back({bs.global_first, bs.global_last});
+    }
+
+    if (!merged.empty())
+      segments_[vreg] = std::move(merged);
+  }
+}
+
+bool LiveIntervals::InterfereSegments(int a, int b) const {
+  if (a == b) return false;  // a vreg never interferes with itself
+  const auto found_a = segments_.find(a);
+  const auto found_b = segments_.find(b);
+  if (found_a == segments_.end() || found_b == segments_.end()) return false;
+  // Walk both segment lists in lockstep; the lists are built sorted by block index.
+  auto pa = found_a->second.begin();
+  auto pb = found_b->second.begin();
+  const auto end_a = found_a->second.end();
+  const auto end_b = found_b->second.end();
+  while (pa != end_a && pb != end_b) {
+    if (pa->Overlaps(*pb)) return true;
+    // No overlap: advance a when its segment ends before b's starts, otherwise advance b.
+    if (pa->end <= pb->start) ++pa;
+    else ++pb;
+  }
+  return false;
+}
+
+bool LiveIntervals::InterfereSegmentsExcept(int a, int b, SlotIndex exclude_slot) const {
+  if (a == b) return false;
+  if (!exclude_slot.IsValid()) return InterfereSegments(a, b);  // nothing to exclude
+
+  const auto found_a = segments_.find(a);
+  const auto found_b = segments_.find(b);
+  if (found_a == segments_.end() || found_b == segments_.end()) return false;
+
+  const auto &segs_a = found_a->second;
+  const auto &segs_b = found_b->second;
+  const int excluded = exclude_slot.index;
+
+  // Two-cursor sweep over both segment lists.
+  size_t ia = 0;
+  size_t ib = 0;
+  while (ia < segs_a.size() && ib < segs_b.size()) {
+    const auto &seg_a = segs_a[ia];
+    const auto &seg_b = segs_b[ib];
+    if (!seg_a.Overlaps(seg_b)) {
+      // Disjoint: advance a when it ends before b starts, otherwise advance b.
+      if (seg_a.end <= seg_b.start) ++ia;
+      else ++ib;
+      continue;
+    }
+    // The overlap is the half-open range [from, to).
+    const int from = std::max(seg_a.start, seg_b.start);
+    const int to = std::min(seg_a.end, seg_b.end);
+    // Anything other than exactly the single excluded slot counts as interference.
+    if (from != excluded || to != excluded + 1) return true;
+    // Overlap is only the excluded slot: step past whichever segment ends first and keep looking.
+    if (seg_a.end <= seg_b.end) ++ia;
+    else ++ib;
+  }
+  return false;
+}
+
+bool LiveIntervals::InterfereSegmentsExceptBlock(int a, int b,
+    MachineBasicBlock *exclude_block) const {  // segment overlap check that ignores overlaps inside one block
+  if (a == b) return false;
+  auto it_a = segments_.find(a);
+  auto it_b = segments_.find(b);
+  if (it_a == segments_.end() || it_b == segments_.end()) return false;
+
+  auto bit = block_to_idx_.find(exclude_block);
+  if (bit == block_to_idx_.end()) return InterfereSegments(a, b);  // unknown block: nothing to exclude
+  int blk = bit->second;
+  int blk_start = block_start_slots_[blk];
+  int blk_end = block_end_slots_[blk];
+
+  const auto &sa = it_a->second;
+  const auto &sb = it_b->second;
+
+  size_t i = 0, j = 0;  // two-cursor sweep over both segment lists
+  while (i < sa.size() && j < sb.size()) {
+    if (sa[i].Overlaps(sb[j])) {
+      int ov_start = std::max(sa[i].start, sb[j].start);
+      int ov_end   = std::min(sa[i].end, sb[j].end);
+      // Overlap lies entirely inside the excluded block's slot range -> skip it
+      if (ov_start >= blk_start && ov_end <= blk_end) {
+        if (sa[i].end <= sb[j].end) i++;
+        else j++;
+        continue;
+      }
+      return true; // overlap reaches outside the excluded block -> interference
+    }
+    if (sa[i].end <= sb[j].start) i++;
+    else j++;
+  }
+  return false;
+}
+
+SlotIndex LiveIntervals::GetInstSlot(const MachineInstr *inst) const {  // SlotIndex{-1} when the instruction has no slot
+  if (const auto found = inst_to_slot_.find(inst); found != inst_to_slot_.end()) return found->second;
+  return SlotIndex{-1};
+}
+
+SlotIndex LiveIntervals::GetSlot(MachineBasicBlock *block, int inst_idx) const {  // block-local index -> global slot
+  const auto bit = block_to_idx_.find(block);
+  if (bit == block_to_idx_.end()) return SlotIndex{-1};  // block has no entry in block_to_idx_
+  const int bi = bit->second;
+  // Reject indices on either side of the block so a neighbouring block's slot is never handed out.
+  if (inst_idx < 0 || inst_idx >= block_end_slots_[bi] - block_start_slots_[bi]) return SlotIndex{-1};
+  return SlotIndex{block_start_slots_[bi] + inst_idx};
+}
+
+const MachineInstr *LiveIntervals::GetInstAtSlot(SlotIndex slot) const {
+  // Only a valid slot that falls inside slot_to_inst_ maps to an instruction.
+  const bool in_table = slot.IsValid() && slot.index < static_cast<int>(slot_to_inst_.size());
+  return in_table ? slot_to_inst_[slot.index] : nullptr;
+}
+
+// ---- Incremental update ----
+
+void LiveIntervals::RecomputeVReg(int vreg, MachineFunction &mf) {  // rebuilds the data of a single vreg; slot numbering is not recomputed
+  if (vreg < 0 || vreg >= num_vregs_) return;
+
+  // Drop the old data for this vreg
+  block_def_use_.erase(vreg);
+  intervals_.erase(vreg);
+  live_blocks_.erase(vreg);
+  segments_.erase(vreg);
+  // Remove the vreg from every block's live_in / live_out / def / use sets
+  for (auto &bl : block_live_) {
+    bl.live_in.erase(vreg);
+    bl.live_out.erase(vreg);
+    bl.def.erase(vreg);
+    bl.use.erase(vreg);
+  }
+
+  auto &blocks = mf.GetBlocks();
+  const size_t num_blocks = blocks.size();
+
+  // Step 1: scan the instructions and collect this vreg's defs and uses
+  for (size_t bi = 0; bi < num_blocks; ++bi) {
+    auto &bl = block_live_[bi];  // NOTE(review): assumes block_live_ has at least num_blocks entries - confirm
+    int ii = 0;
+    for (const auto &inst : blocks[bi]->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int u : du.uses) {
+        if (u == vreg) {
+          auto &bdu = block_def_use_[vreg][blocks[bi].get()];
+          if (!bdu.has_ref) { bdu.first_def = ii; bdu.has_ref = true; }
+          bdu.last_use = ii;
+          bl.use.insert(vreg);
+        }
+      }
+      for (int d : du.defs) {
+        if (d == vreg) {
+          auto &bdu = block_def_use_[vreg][blocks[bi].get()];
+          if (!bdu.has_ref || ii < bdu.first_def) bdu.first_def = ii;
+          bdu.last_use = ii;
+          bdu.has_ref = true;
+          bl.def.insert(vreg);
+        }
+      }
+      ii++;
+    }
+  }
+
+  // Step 2: block arguments (implicit defs at the start of the block)
+  for (size_t bi = 0; bi < num_blocks; ++bi) {
+    for (int ba : blocks[bi]->GetBlockArgs()) {
+      if (ba == vreg) {
+        auto &bdu = block_def_use_[vreg][blocks[bi].get()];
+        bdu.has_ref = true;
+        if (bdu.first_def < 0 || bdu.first_def > 0) bdu.first_def = 0;
+        block_live_[bi].def.insert(vreg);
+      }
+    }
+  }
+
+  // Step 3: successor arguments (implicit uses at the end of the block)
+  for (size_t bi = 0; bi < num_blocks; ++bi) {
+    for (auto &succ : blocks[bi]->GetSuccessors()) {
+      for (int sa : succ.args) {
+        if (sa == vreg) {
+          auto &bdu = block_def_use_[vreg][blocks[bi].get()];
+          bdu.has_ref = true;
+          int num_insts = static_cast<int>(blocks[bi]->GetInstructions().size());
+          if (bdu.last_use < 0 || bdu.last_use < num_insts - 1)
+            bdu.last_use = num_insts - 1;
+          block_live_[bi].use.insert(vreg);
+        }
+      }
+    }
+  }
+
+  // Step 4: iterative dataflow - update this vreg's live_in / live_out in every block
+  // Simplification: block-level liveness is rebuilt from the def/use information gathered above
+  // Build the CFG from Br / CondBr label operands (CondBr also gets a fall-through edge to block i + 1)
+  std::vector<std::vector<size_t>> succs(num_blocks);
+  std::unordered_map<int, size_t> l2b;
+  for (size_t i = 0; i < num_blocks; ++i) l2b[blocks[i]->GetLabelId()] = i;
+  for (size_t i = 0; i < num_blocks; ++i) {
+    for (auto &inst : blocks[i]->GetInstructions()) {
+      if (inst.GetOpcode() == Opcode::Br && inst.GetOperands().size() >= 1 &&
+          inst.GetOperands()[0].GetKind() == Operand::Kind::Label) {
+        auto it = l2b.find(inst.GetOperands()[0].GetLabel());
+        if (it != l2b.end()) succs[i].push_back(it->second);
+      }
+      if (inst.GetOpcode() == Opcode::CondBr && inst.GetOperands().size() >= 2 &&
+          inst.GetOperands()[1].GetKind() == Operand::Kind::Label) {
+        auto it = l2b.find(inst.GetOperands()[1].GetLabel());
+        if (it != l2b.end()) succs[i].push_back(it->second);
+        if (i + 1 < num_blocks) succs[i].push_back(i + 1); // fall-through
+      }
+    }
+  }
+
+  // Iterate until nothing changes
+  bool changed = true;
+  while (changed) {
+    changed = false;
+    for (int i = (int)num_blocks - 1; i >= 0; --i) {
+      bool in_live_out = false;  // live-out if any successor has the vreg live-in
+      for (auto s : succs[i])
+        if (block_live_[s].live_in.count(vreg)) { in_live_out = true; break; }
+      if (in_live_out && !block_live_[i].live_out.count(vreg))
+        { block_live_[i].live_out.insert(vreg); changed = true; }
+      else if (!in_live_out && block_live_[i].live_out.count(vreg))
+        { block_live_[i].live_out.erase(vreg); changed = true; }
+
+      bool in_live_in = block_live_[i].use.count(vreg) ||
+                        (block_live_[i].live_out.count(vreg) && !block_live_[i].def.count(vreg));
+      if (in_live_in && !block_live_[i].live_in.count(vreg))
+        { block_live_[i].live_in.insert(vreg); changed = true; }
+      else if (!in_live_in && block_live_[i].live_in.count(vreg))
+        { block_live_[i].live_in.erase(vreg); changed = true; }
+    }
+  }
+
+  // Step 5: extend the extents for live-in / live-out blocks (covers live-through ranges)
+  for (size_t bi = 0; bi < num_blocks; ++bi) {
+    auto &b = blocks[bi];
+    auto &bl = block_live_[bi];
+    if (bl.live_in.count(vreg)) {
+      auto &bdu = block_def_use_[vreg][b.get()];
+      bdu.has_ref = true;
+      if (bdu.first_def < 0 || bdu.first_def > 0) bdu.first_def = 0;
+    }
+    if (bl.live_out.count(vreg)) {
+      auto &bdu = block_def_use_[vreg][b.get()];
+      bdu.has_ref = true;
+      int ni = static_cast<int>(b->GetInstructions().size());
+      if (bdu.last_use < 0 || bdu.last_use < ni - 1) bdu.last_use = ni - 1;
+      if (bdu.first_def < 0) bdu.first_def = 0;
+    }
+  }
+
+  // Step 6: rebuild this vreg's global segments
+  // Inlined single-vreg version of BuildGlobalSegments
+  auto bdu_it = block_def_use_.find(vreg);
+  if (bdu_it != block_def_use_.end()) {
+    struct BlockSeg { int block_idx; int global_first; int global_last; };
+    std::vector<BlockSeg> block_segs;
+    for (auto &[block, bdu] : bdu_it->second) {
+      if (!bdu.has_ref) continue;
+      auto bit = block_to_idx_.find(block);
+      if (bit == block_to_idx_.end()) continue;
+      int bi = bit->second;
+      int first = bdu.first_def >= 0 ? bdu.first_def : 0;
+      int last = bdu.last_use >= 0 ? bdu.last_use
+                : (block_end_slots_[bi] - block_start_slots_[bi] - 1);
+      int gf = block_start_slots_[bi] + first;
+      int gl = block_start_slots_[bi] + last + 1;
+      if (gf < gl && gl <= total_slots_)
+        block_segs.push_back({bi, gf, gl});
+    }
+    std::sort(block_segs.begin(), block_segs.end(),
+              [](const BlockSeg &a, const BlockSeg &b) { return a.block_idx < b.block_idx; });
+    std::vector<LiveSegment> merged;
+    for (auto &bs : block_segs) {
+      if (!merged.empty() && merged.back().end >= bs.global_first)
+        merged.back().end = std::max(merged.back().end, bs.global_last);
+      else
+        merged.push_back({bs.global_first, bs.global_last});
+    }
+    if (!merged.empty()) segments_[vreg] = std::move(merged);
+  }
+
+  // Update live_blocks_ (note: intervals_ is erased above and not rebuilt by this function)
+  for (size_t bi = 0; bi < num_blocks; ++bi)
+    if (block_live_[bi].live_in.count(vreg) || block_live_[bi].live_out.count(vreg) ||
+        block_live_[bi].def.count(vreg) || block_live_[bi].use.count(vreg))
+      live_blocks_[vreg].insert(blocks[bi].get());
+}
+
+void LiveIntervals::RemoveVReg(int vreg) {
+  // Forget the vreg in every per-vreg table ...
+  segments_.erase(vreg);
+  live_blocks_.erase(vreg);
+  intervals_.erase(vreg);
+  block_def_use_.erase(vreg);
+  // ... and in each block's liveness sets.
+  for (auto &sets : block_live_) {
+    sets.use.erase(vreg); sets.def.erase(vreg);
+    sets.live_out.erase(vreg); sets.live_in.erase(vreg);
+  }
+}
+
+} // namespace mir
diff --git a/src/mir/MIRVerifier.cpp b/src/mir/MIRVerifier.cpp
new file mode 100644
index 00000000..7c9fc5de
--- /dev/null
+++ b/src/mir/MIRVerifier.cpp
@@ -0,0 +1,337 @@
+// MIR 验证器 —— 每个 MIR pass 后的安全网
+//
+// 检查清单（参照 LLVM MachineVerifier）：
+// 1. VReg 单一定义（按块）：同块内不重复定义
+// 2. VReg use-def 一致性：每个 use 必须有至少一个 def
+// 3. VReg 索引边界：vreg_id 在 [0, num_vregs) 范围内
+// 4. 基本块终结指令：最后块必须终结
+// 5. 后继一致性：successor label 必须有效
+// 6. CFG 边一致性：successor count 匹配
+// 7. 操作数类型合理性：Int vreg 不在 Float 指令中使用
+//
+// 验证规则：
+// - MIR 是放宽的 SSA：同一 vreg 可在多块中定义（PhiElimination 产生跨块 MovReg）
+// - vreg 跨块定义视为多个独立 def，各自覆盖自己的 use
+
+#include "mir/MIR.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "utils/Log.h"
+
+namespace mir {
+
+namespace {
+
+// ============================================================================
+// Helper functions
+// ============================================================================
+
+bool HasVRegDef(Opcode opcode) {  // false for opcodes whose operand 0 is not a result
+  const bool is_store = opcode == Opcode::StoreStack || opcode == Opcode::StoreGlobal ||
+                        opcode == Opcode::StoreMem || opcode == Opcode::StrQ;
+  const bool is_compare = opcode == Opcode::CmpRR || opcode == Opcode::CmpImm ||
+                          opcode == Opcode::FCmpRR;
+  const bool is_control = opcode == Opcode::Br || opcode == Opcode::CondBr ||
+                          opcode == Opcode::Ret || opcode == Opcode::Call;
+  const bool is_frame = opcode == Opcode::Prologue || opcode == Opcode::Epilogue;
+  // Every opcode outside these four groups is treated as writing operand 0.
+  return !(is_store || is_compare || is_control || is_frame);
+}
+
+bool IsTerminator(Opcode opcode) {  // NOTE(review): Call counts as a terminator here — confirm intent
+  const bool branches = opcode == Opcode::Br || opcode == Opcode::CondBr;
+  return branches || opcode == Opcode::Ret || opcode == Opcode::Call;
+}
+
+std::string VRegStr(int id, MachineFunction &f) {  // renders "%<id>(<class>)", class i/p/f/v or ?
+  const auto vc = f.GetVRegClass(id);
+  const char *cls = "?";
+  if (vc == VRegClass::Int) cls = "i";
+  else if (vc == VRegClass::Ptr) cls = "p";
+  else if (vc == VRegClass::Float) cls = "f";
+  else if (vc == VRegClass::Vec) cls = "v";
+  std::ostringstream text;  // <sstream> is included by this file; snprintf's <cstdio> is not
+  text << '%' << id << '(' << cls << ')';
+  return text.str();
+}
+
+#define VERIFY_FAIL(msg) do { /* build the text, log it, then abort */ \
+  std::ostringstream _oss; _oss << "MIR verifier FAIL: " << msg;  \
+  LogError(_oss.str(), std::cerr); std::abort();                   \
+} while(0)  // `msg` is streamed, so callers may pass `a << b << c`
+
+// ============================================================================
+// 1. Single definition of a vreg (per block)
+// ============================================================================
+void CheckSingleDefPerBlock(MachineFunction &f) {
+  for (auto &block : f.GetBlocks()) {
+    if (!block) continue;
+    std::unordered_set<int> seen;  // vregs already defined in this block
+
+    // Block args are implicit definitions at block entry; a repeated one is fatal.
+    for (int arg : block->GetBlockArgs()) {
+      const bool fresh = seen.insert(arg).second;
+      if (!fresh)
+        VERIFY_FAIL(f.GetName() << " block " << block->GetName()
+                    << ": vreg " << VRegStr(arg, f) << " defined by multiple block_args");
+    }
+
+    for (auto &inst : block->GetInstructions()) {
+      auto &ops = inst.GetOperands();
+      const bool writes_vreg = !ops.empty() && ops[0].GetKind() == Operand::Kind::VReg &&
+                               HasVRegDef(inst.GetOpcode());
+      if (!writes_vreg) continue;
+      const int v = ops[0].GetVRegId();
+      if (!seen.insert(v).second) {  // a repeat inside the block body only warns
+        std::cerr << "[verifier] WARNING: " << f.GetName() << " block "
+                  << block->GetName() << ": vreg " << VRegStr(v, f)
+                  << " defined twice (known pre-existing issue)" << std::endl;
+      }
+    }
+  }
+}
+
+// ============================================================================
+// 2+3. VReg use-def consistency + index bounds
+// ============================================================================
+void CheckVRegDefUse(MachineFunction &f) {
+  int nv = f.GetNumVRegs();
+  std::vector<bool> has_def(nv, false);  // indexed by vreg id
+  std::vector<bool> has_use(nv, false);  // indexed by vreg id
+
+  // Collect uses/defs from instruction operands
+  for (auto &block : f.GetBlocks()) {
+    if (!block) continue;
+    for (auto &inst : block->GetInstructions()) {
+      auto &ops = inst.GetOperands();
+      bool has_vreg_def = !ops.empty() &&
+                          ops[0].GetKind() == Operand::Kind::VReg &&
+                          HasVRegDef(inst.GetOpcode());
+
+      for (size_t k = 0; k < ops.size(); ++k) {
+        if (ops[k].GetKind() != Operand::Kind::VReg) continue;
+        int v = ops[k].GetVRegId();
+        if (v < 0 || v >= nv)  // range check comes first: v indexes the vectors below
+          VERIFY_FAIL(f.GetName() << ": vreg " << v << " out of range [0," << nv << ")");
+        if (k == 0 && has_vreg_def) has_def[v] = true;  // operand 0 of a defining opcode
+        else has_use[v] = true;                         // every other vreg operand is a use
+      }
+    }
+  }
+
+  // Collect defs from block_args (a block arg defines its vreg at block entry)
+  for (auto &block : f.GetBlocks()) {
+    if (!block) continue;
+    for (int v : block->GetBlockArgs()) {
+      if (v < 0 || v >= nv)
+        VERIFY_FAIL(f.GetName() << ": block_arg vreg " << v << " out of range [0," << nv << ")");
+      has_def[v] = true;
+    }
+  }
+
+  // Collect uses from successor args (a predecessor passes the vreg to its successor)
+  for (auto &block : f.GetBlocks()) {
+    if (!block) continue;
+    for (auto &succ : block->GetSuccessors())
+      for (int v : succ.args) {
+        if (v < 0 || v >= nv)
+          VERIFY_FAIL(f.GetName() << ": successor arg vreg " << v << " out of range [0," << nv << ")");
+        has_use[v] = true;
+      }
+  }
+
+  // Check: every vreg that has a use must also have a def somewhere in the function
+  for (int v = 0; v < nv; ++v) {
+    if (has_use[v] && !has_def[v])
+      VERIFY_FAIL(f.GetName() << ": vreg " << VRegStr(v, f)
+                  << " used but never defined");
+  }
+
+  // Dead vregs (defined, never used) are not reported: too noisy, and common
+}
+
+// ============================================================================
+// 4. Basic-block terminators
+// ============================================================================
+void CheckBlockTerminators(MachineFunction &f) {
+  auto &blocks = f.GetBlocks();
+  for (size_t i = 0, count = blocks.size(); i < count; ++i) {
+    auto &block = blocks[i];
+    if (!block) continue;
+    auto &insts = block->GetInstructions();
+    if (insts.empty()) {  // empty blocks are only reported, and only when not the entry
+      if (block.get() != f.GetEntryPtr())
+        std::cerr << "[verifier] " << f.GetName() << ": non-entry block "
+                  << block->GetName() << " is empty" << std::endl;
+      continue;
+    }
+    const auto last_op = insts.back().GetOpcode();
+    const bool frame_marker = last_op == Opcode::Prologue || last_op == Opcode::Epilogue;
+    if (!frame_marker && i + 1 == count && !IsTerminator(last_op))  // last block only
+      VERIFY_FAIL(f.GetName() << ": last block " << block->GetName()
+                  << " not terminated (last op=" << (int)last_op << ")");
+  }
+}
+
+// ============================================================================
+// 5+6. Successor consistency
+// ============================================================================
+void CheckSuccessors(MachineFunction &f) {
+  // Fatal: unknown successor label, arg-count mismatch. Warning: CondBr without 2 successors.
+  std::unordered_map<int, MachineBasicBlock *> label_map;
+  for (auto &block : f.GetBlocks())
+    if (block) label_map[block->GetLabelId()] = block.get();
+
+  // A function with no successor edges at all is taken to be post-PhiElimination:
+  // block_args were lowered to explicit MovReg, so successor metadata is not needed.
+  bool has_any_successor = false;
+  for (auto &block : f.GetBlocks())
+    if (block && block->HasSuccessors()) { has_any_successor = true; break; }
+  if (!has_any_successor) return;
+
+  for (auto &block : f.GetBlocks()) {
+    if (!block) continue;
+    auto &succs = block->GetSuccessors();
+    auto &insts = block->GetInstructions();
+
+    // Empty blocks are skipped; a dead block without successors is normal, no warning.
+    if (insts.empty()) continue;
+    auto last_op = insts.back().GetOpcode();
+
+    // A block ending in CondBr is expected to list exactly 2 successors (warning only).
+    if (last_op == Opcode::CondBr && succs.size() != 2) {
+      std::cerr << "[verifier] " << f.GetName() << " block " << block->GetName()
+                << ": CondBr has " << succs.size() << " successors (expected 2)" << std::endl;
+    }
+
+    // Every successor label must name a block of this function.
+    for (auto &s : succs) {
+      auto target_it = label_map.find(s.label);  // one lookup serves both checks
+      if (target_it == label_map.end())
+        VERIFY_FAIL(f.GetName() << " block " << block->GetName()
+                    << ": successor label .L" << s.label << " not found");
+      // The number of args passed along the edge must match the target's block args.
+      auto *target = target_it->second;
+      if (s.args.size() != target->GetBlockArgs().size())
+        VERIFY_FAIL(f.GetName() << " block " << block->GetName()
+                    << ": " << s.args.size() << " args to .L" << s.label
+                    << " but target expects " << target->GetBlockArgs().size());
+    }
+  }
+}
+
+// ============================================================================
+// 7. Operand class sanity (heuristic)
+// ============================================================================
+bool IsFloatOp(Opcode op) {  // FAddRR, FSubRR, FMulRR, FDivRR or FCmpRR
+  const bool mul_div = op == Opcode::FMulRR || op == Opcode::FDivRR;
+  return mul_div || op == Opcode::FAddRR || op == Opcode::FSubRR || op == Opcode::FCmpRR;
+}
+
+bool IsFloatConversionOp(Opcode op) {  // Scvtf or FCvtzs
+  return !(op != Opcode::Scvtf && op != Opcode::FCvtzs);
+}
+
+bool IsVecOp(Opcode op) {  // LdrQ or StrQ
+  return !(op != Opcode::LdrQ && op != Opcode::StrQ);
+}
+
+void CheckOperandTypes(MachineFunction &f) {
+  // Heuristic register-class check for float arithmetic and for LdrQ/StrQ operand 0.
+  for (auto &block : f.GetBlocks()) {
+    if (!block) continue;
+    for (auto &inst : block->GetInstructions()) {
+      auto op = inst.GetOpcode();
+      auto &ops = inst.GetOperands();
+      const bool is_float = IsFloatOp(op);
+      const bool is_vec = IsVecOp(op);
+      const bool is_fcvt = IsFloatConversionOp(op);  // converts between Int and Float
+
+      // Conversions mix classes by design (fcvtzs: def=Int, use=Float; scvtf: def=Float,
+      // use=Int), so only pure Float arithmetic/compare opcodes are class-checked.
+      if (is_float && !is_fcvt) {
+        for (const auto &operand : ops) {
+          // Immediates, labels, frame indices and physical registers have no vreg class.
+          if (operand.GetKind() != Operand::Kind::VReg) continue;
+          const int v = operand.GetVRegId();
+          if (f.GetVRegClass(v) != VRegClass::Float)
+            VERIFY_FAIL(f.GetName() << ": Float op uses non-Float vreg "
+                        << VRegStr(v, f));
+        }
+      }
+
+      if (is_vec) {
+        // Vec ops: operand 0 is the Vec vreg (the def of a load, the source of a store);
+        // the remaining operands form the address and are not class-checked here.
+        if (!ops.empty() && ops[0].GetKind() == Operand::Kind::VReg) {
+          auto vc = f.GetVRegClass(ops[0].GetVRegId());
+          if (vc != VRegClass::Vec)
+            VERIFY_FAIL(f.GetName() << ": Vec op uses non-Vec vreg "
+                        << VRegStr(ops[0].GetVRegId(), f) << " as first operand");
+        }
+      }
+    }
+  }
+}
+
+// ============================================================================
+// 8. Argument-register clobber check after Call instructions (post-RA)
+// ============================================================================
+void CheckCallClobber(MachineFunction &f) {
+  // Only meaningful for post-RA MIR (every vreg already replaced by a PhysReg).
+  // Intended to catch caller-saved registers wrongly assumed valid after a Call.
+  // Skipped on pre-RA MIR (a vreg is not clobbered by a call).
+  bool has_vregs = false;
+  for (auto &block : f.GetBlocks()) {
+    if (!block) continue;
+    for (auto &inst : block->GetInstructions()) {
+      for (auto &o : inst.GetOperands()) {
+        if (o.GetKind() == Operand::Kind::VReg) { has_vregs = true; break; }
+      }
+      if (has_vregs) break;
+    }
+    if (has_vregs) break;
+  }
+  // pre-RA: vregs are still present, so skip the PhysReg check
+  if (has_vregs) return;
+
+  // post-RA: look at caller-saved register use across calls.
+  // This is a heuristic check and never aborts.
+  for (auto &block : f.GetBlocks()) {
+    if (!block) continue;
+    auto &insts = block->GetInstructions();
+    for (size_t i = 0; i < insts.size(); ++i) {
+      if (insts[i].GetOpcode() != Opcode::Call) continue;
+
+      // Intended check: a caller-saved value from before the call must not be used after it.
+      // (Not implemented yet: for a Call this loop body currently checks and reports nothing.)
+    }
+  }
+}
+
+} // anonymous namespace
+
+// ============================================================================
+// Public interface
+// ============================================================================
+
+void VerifyMIR(MachineFunction &f) {  // runs each check on `f` in this fixed order
+  CheckSingleDefPerBlock(f);
+  CheckVRegDefUse(f);  // also range-checks vreg ids before later checks query their class
+  CheckBlockTerminators(f);
+  CheckSuccessors(f);
+  CheckOperandTypes(f);
+  CheckCallClobber(f);
+}
+
+void VerifyMIR(MachineModule &module) {  // verifies every non-null function of the module
+  for (auto &func : module.GetFunctions())
+    if (func) VerifyMIR(*func);
+}
+
+} // namespace mir
diff --git a/src/mir/MachineRegisterInfo.cpp b/src/mir/MachineRegisterInfo.cpp
new file mode 100644
index 00000000..8780e883
--- /dev/null
+++ b/src/mir/MachineRegisterInfo.cpp
@@ -0,0 +1,270 @@
+#include "mir/MachineRegisterInfo.h"
+
+#include <algorithm>
+
+namespace mir {
+
+void MachineRegisterInfo::Compute(MachineFunction &mf) {
+  int num_vregs = mf.GetNumVRegs();
+  defs_.assign(num_vregs, nullptr);
+  uses_.assign(num_vregs, {});
+
+  for (auto &block : mf.GetBlocks()) {
+    if (!block) continue;  // tolerate empty block slots, as the MIR verifier does
+    for (auto &inst : block->GetInstructions()) {
+      auto du = GetInstDefUse(inst);
+
+      for (int d : du.defs) {
+        if (d >= 0 && d < num_vregs) {
+          // A vreg may be defined more than once (the MIR is only loosely SSA);
+          // keep the first defining instruction encountered in block order.
+          if (!defs_[d]) defs_[d] = &inst;
+        }
+      }
+
+      for (int u : du.uses) {
+        if (u >= 0 && u < num_vregs) {
+          uses_[u].push_back(&inst);  // one entry per using instruction (uses are de-duplicated)
+        }
+      }
+    }
+  }
+}
+
+MachineRegisterInfo::InstDefUse MachineRegisterInfo::GetInstDefUse(const MachineInstr &inst) {
+  InstDefUse result;  // defs/uses hold vreg ids only; operands of any other kind are ignored
+  const auto opcode = inst.GetOpcode();
+  const auto &ops = inst.GetOperands();
+
+  switch (opcode) {
+  case Opcode::Prologue:
+  case Opcode::Epilogue:
+  case Opcode::Br:
+    break;
+
+  case Opcode::MovImm:
+  case Opcode::CSet:
+    if (ops.size() >= 1 && ops[0].GetKind() == Operand::Kind::VReg)
+      result.defs.push_back(ops[0].GetVRegId());
+    break;
+
+  case Opcode::LoadStack:
+  case Opcode::LoadGlobal:
+  case Opcode::LoadGlobalAddr:
+  case Opcode::LoadStackAddr:
+  case Opcode::LoadAddr:
+  case Opcode::LdrQ:  // NEON load: operand 0 is the result, later vregs are address uses
+    if (ops.size() >= 1 && ops[0].GetKind() == Operand::Kind::VReg)
+      result.defs.push_back(ops[0].GetVRegId());
+    for (size_t i = 1; i < ops.size(); ++i)
+      if (ops[i].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[i].GetVRegId());
+    break;
+
+  case Opcode::StoreStack:
+  case Opcode::StoreGlobal:
+  case Opcode::StrQ:
+    // Operand 0 is the stored value (a use); later operands may be the base address
+    for (const auto &op : ops)
+      if (op.GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(op.GetVRegId());
+    break;
+
+  case Opcode::LoadMem:
+    if (ops.size() >= 2) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.defs.push_back(ops[0].GetVRegId());
+      for (size_t i = 1; i < ops.size(); ++i)
+        if (ops[i].GetKind() == Operand::Kind::VReg)
+          result.uses.push_back(ops[i].GetVRegId());
+    }
+    break;
+
+  case Opcode::StoreMem:
+    for (const auto &op : ops)
+      if (op.GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(op.GetVRegId());
+    break;
+
+  case Opcode::MovReg:
+  case Opcode::Uxtw:
+  case Opcode::Sxtw:
+  case Opcode::Scvtf:
+  case Opcode::FCvtzs:
+  case Opcode::FMovWS:
+  case Opcode::NegRR:
+  case Opcode::AddImm:
+  case Opcode::SubImm:
+    // dst, src[, imm]: AddImm/SubImm on vregs define operand 0 and read operand 1.
+    if (ops.size() >= 2) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.defs.push_back(ops[0].GetVRegId());
+      if (ops[1].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[1].GetVRegId());
+    }
+    break;
+
+  case Opcode::AddRR:
+  case Opcode::SubRR:
+  case Opcode::AddShiftRR:
+  case Opcode::SubShiftRR:
+  case Opcode::MulRR:
+  case Opcode::DivRR:
+  case Opcode::ModRR:
+  case Opcode::AndRR:
+  case Opcode::OrRR:
+  case Opcode::XorRR:
+  case Opcode::FAddRR:
+  case Opcode::FSubRR:
+  case Opcode::FMulRR:
+  case Opcode::FDivRR:
+    if (ops.size() >= 3) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.defs.push_back(ops[0].GetVRegId());
+      if (ops[1].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[1].GetVRegId());
+      if (ops[2].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[2].GetVRegId());
+    }
+    break;
+
+  case Opcode::ShlRR:
+  case Opcode::ShrRR:
+  case Opcode::AsrRR:
+  case Opcode::Lsr64RR:
+  case Opcode::Asr64RR:
+    // The shift amount (operand 2) is a use whenever it is a vreg rather than an immediate.
+    if (ops.size() >= 3) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.defs.push_back(ops[0].GetVRegId());
+      if (ops[1].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[1].GetVRegId());
+      if (ops[2].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[2].GetVRegId());
+    }
+    break;
+
+  case Opcode::Msub:
+  case Opcode::Madd:
+    if (ops.size() >= 4) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.defs.push_back(ops[0].GetVRegId());
+      for (size_t i = 1; i < ops.size(); ++i)
+        if (ops[i].GetKind() == Operand::Kind::VReg)
+          result.uses.push_back(ops[i].GetVRegId());
+    }
+    break;
+
+  case Opcode::Smull:
+  case Opcode::Umull:
+    if (ops.size() >= 3) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.defs.push_back(ops[0].GetVRegId());
+      if (ops[1].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[1].GetVRegId());
+      if (ops[2].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[2].GetVRegId());
+    }
+    break;
+
+  case Opcode::Csel:
+  case Opcode::Csneg:
+    if (ops.size() >= 3) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.defs.push_back(ops[0].GetVRegId());
+      if (ops[1].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[1].GetVRegId());
+      if (ops[2].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[2].GetVRegId());
+    }
+    break;
+
+  case Opcode::CmpRR:
+  case Opcode::FCmpRR:
+    if (ops.size() >= 2) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[0].GetVRegId());
+      if (ops[1].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[1].GetVRegId());
+    }
+    break;
+
+  case Opcode::CmpImm:
+    if (ops.size() >= 1 && ops[0].GetKind() == Operand::Kind::VReg)
+      result.uses.push_back(ops[0].GetVRegId());
+    break;
+
+  case Opcode::CondBr:
+    if (ops.size() >= 1 && ops[0].GetKind() == Operand::Kind::VReg)
+      result.uses.push_back(ops[0].GetVRegId());
+    break;
+
+  case Opcode::Call:
+    result.is_call = true;
+    for (const auto &op : ops)
+      if (op.GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(op.GetVRegId());
+    break;
+
+  case Opcode::Ret:
+    for (const auto &op : ops)
+      if (op.GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(op.GetVRegId());
+    break;
+
+  // NEON
+  case Opcode::AddV4s:
+  case Opcode::SubV4s:
+  case Opcode::MulV4s:
+    if (ops.size() >= 3) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.defs.push_back(ops[0].GetVRegId());
+      if (ops[1].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[1].GetVRegId());
+      if (ops[2].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[2].GetVRegId());
+    }
+    break;
+
+  case Opcode::DupV4s:
+  case Opcode::MovVS:
+  case Opcode::MovSV:
+    if (ops.size() >= 2) {
+      if (ops[0].GetKind() == Operand::Kind::VReg)
+        result.defs.push_back(ops[0].GetVRegId());
+      if (ops[1].GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(ops[1].GetVRegId());
+    }
+    break;
+
+  default:  // unknown opcode: conservatively treat every vreg operand as a use
+    for (const auto &op : ops)
+      if (op.GetKind() == Operand::Kind::VReg)
+        result.uses.push_back(op.GetVRegId());
+    break;
+  }
+
+  // De-duplicate; as a side effect both lists end up sorted by vreg id
+  auto dedup = [](std::vector<int> &v) {
+    std::sort(v.begin(), v.end());
+    v.erase(std::unique(v.begin(), v.end()), v.end());
+  };
+  dedup(result.defs);
+  dedup(result.uses);
+
+  return result;
+}
+
+void MachineRegisterInfo::ReplaceAllVRegRefs(MachineFunction &mf, int old_vreg, int new_vreg) {
+  // Rewrites instruction operands only; successor args and block args are not visited.
+  for (auto &block : mf.GetBlocks()) {
+    for (auto &inst : block->GetInstructions()) {
+      for (auto &operand : inst.GetOperands()) {
+        const bool hit = operand.GetKind() == Operand::Kind::VReg && operand.GetVRegId() == old_vreg;
+        if (hit) operand = Operand::VReg(new_vreg, mf.GetVRegClass(new_vreg));
+      }
+    }
+  }
+}
+
+} // namespace mir
diff --git a/src/mir/passes/CopyPropagation.cpp b/src/mir/passes/CopyPropagation.cpp
new file mode 100644
index 00000000..8f8ce663
--- /dev/null
+++ b/src/mir/passes/CopyPropagation.cpp
@@ -0,0 +1,295 @@
+// MIR Copy Propagation —— 安全的虚拟寄存器级副本优化
+// 运行在寄存器分配之前，仅执行可证明安全的变换：
+//
+// 1. 死副本消除：MovReg %v, %x 若 %v 从未被使用 → 删除
+// 2. 自复制消除：MovReg %v, %v → 删除
+// 3. StoreStack+LoadStack 折叠：同一 slot 无中间 store → MovReg
+// 4. 副本链折叠：MovReg %v1, %v2 → MovReg %v3, %v1 → MovReg %v3, %v2
+// 5. 基于 LiveIntervals 的安全前向传播 + 后向传播
+//
+// 注意：前向传播和后向传播不依赖块内向后扫描，而是使用 LiveIntervals
+// 的精确指令级活跃查询。这是与先前失败版本的根本区别。
+
+#include "mir/LiveIntervals.h"
+#include "mir/MIR.h"
+#include "mir/MachineRegisterInfo.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace mir {
+
+namespace {
+
+static int GetDefVReg(const MachineInstr &inst) {  // first reported def vreg, or -1 if none
+  const auto defs = MachineRegisterInfo::GetInstDefUse(inst).defs;
+  return defs.empty() ? -1 : defs.front();
+}
+
+// ---- Pass 1: dead-copy and self-copy elimination ----
+
+static bool EliminateDeadAndSelfCopies(MachineFunction &function) {
+  bool changed = false;
+  // Uses come from instruction operands and from successor args (values passed along edges).
+  std::unordered_map<int, int> use_counts;
+  for (auto &block : function.GetBlocks()) {
+    for (auto &inst : block->GetInstructions())
+      for (int u : MachineRegisterInfo::GetInstDefUse(inst).uses) use_counts[u]++;
+    for (auto &succ : block->GetSuccessors())
+      for (int v : succ.args) use_counts[v]++;
+  }
+
+  for (auto &block : function.GetBlocks()) {
+    auto &insts = block->GetInstructions();
+    std::vector<MachineInstr> new_insts;
+    for (auto &inst : insts) {
+      if (inst.GetOpcode() == Opcode::MovReg) {
+        const auto &ops = inst.GetOperands();
+        if (ops.size() >= 2 &&
+            ops[0].GetKind() == Operand::Kind::VReg &&
+            ops[1].GetKind() == Operand::Kind::VReg) {
+          int dst = ops[0].GetVRegId();
+          int src = ops[1].GetVRegId();
+          if (use_counts[dst] == 0 || dst == src) { changed = true; continue; }  // dead or self copy
+        }
+      }
+      new_insts.push_back(std::move(inst));
+    }
+    insts = std::move(new_insts);  // `changed` was already set for every dropped copy
+  }
+  return changed;
+}
+
+// ---- Pass 2: forward propagation + backward propagation ----
+// Key point: src liveness at the rewrite point is queried through LiveIntervals::IsLiveAfter.
+
+static bool RunCopyPropagationPass(MachineFunction &function) {
+  bool changed = false;
+
+  LiveIntervals li;
+  li.Compute(function);
+
+  // Function-wide def/use counts gate backward propagation (src: one def, one use).
+  // Successor args are uses too: a value passed along a CFG edge must stay defined.
+  std::unordered_map<int, int> def_counts, use_counts;
+  for (auto &block : function.GetBlocks()) {
+    for (auto &inst : block->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int d : du.defs) def_counts[d]++;
+      for (int u : du.uses) use_counts[u]++;
+    }
+    for (auto &succ : block->GetSuccessors())
+      for (int v : succ.args) use_counts[v]++;
+  }
+
+  for (auto &block : function.GetBlocks()) {
+    auto &insts = block->GetInstructions();
+    if (insts.size() < 2) continue;
+
+    // Copies known to hold at the current point of the block: dst -> src.
+    std::unordered_map<int, int> copies;
+    // Drops every recorded copy that reads or writes `v`; needed whenever `v` is redefined.
+    auto invalidate = [&copies](int v) {
+      for (auto it = copies.begin(); it != copies.end();) {
+        if (it->first == v || it->second == v) it = copies.erase(it);
+        else ++it;
+      }
+    };
+    std::vector<bool> to_delete(insts.size(), false);
+
+    for (size_t i = 0; i < insts.size(); ++i) {
+      if (to_delete[i]) continue;
+      auto &inst = const_cast<MachineInstr &>(insts[i]);
+      Opcode op = inst.GetOpcode();
+
+      if (op == Opcode::Call) { copies.clear(); continue; }
+
+      // ---- Forward propagation ----
+      if (op != Opcode::MovReg && op != Opcode::StoreStack &&
+          op != Opcode::StoreGlobal && op != Opcode::StoreMem &&
+          op != Opcode::StrQ) {
+        auto inst_du = MachineRegisterInfo::GetInstDefUse(inst);
+        std::unordered_set<int> inst_defs(inst_du.defs.begin(), inst_du.defs.end());
+        std::unordered_set<int> inst_uses(inst_du.uses.begin(), inst_du.uses.end());
+
+        for (auto &op_ref : inst.GetOperands()) {
+          if (op_ref.GetKind() == Operand::Kind::VReg) {
+            int use = op_ref.GetVRegId();
+            // Rewrite pure reads only: an operand this instruction writes (or that is
+            // not reported as a use at all) must keep its register.
+            if (!inst_uses.count(use) || inst_defs.count(use)) continue;
+            auto it = copies.find(use);
+            if (it == copies.end()) continue;
+
+            int src = it->second;
+            // src must not be clobbered by this instruction and must be live at the use,
+            // i.e. before instruction i: IsLiveAfter(v, i-1) is the point right after i-1.
+            if (!inst_defs.count(src) &&
+                li.IsLiveAfter(src, block.get(), static_cast<int>(i) - 1)) {
+              op_ref = Operand::VReg(src, function.GetVRegClass(src));
+              changed = true;
+            }
+          }
+        }
+      }
+
+      // ---- MovReg handling ----
+      if (op == Opcode::MovReg) {
+        const auto &ops = inst.GetOperands();
+        if (ops.size() < 2 || ops[0].GetKind() != Operand::Kind::VReg ||
+            ops[1].GetKind() != Operand::Kind::VReg) {
+          invalidate(GetDefVReg(inst));  // not vreg-to-vreg, but still writes its dst vreg
+          continue;
+        }
+
+        int dst = ops[0].GetVRegId();
+        int src = ops[1].GetVRegId();
+
+        // Chain folding: follow src through the recorded copies to its oldest source
+        {
+          std::unordered_set<int> visited;
+          int folded = src;
+          while (copies.count(folded) && !visited.count(folded)) {
+            visited.insert(folded);
+            folded = copies[folded];
+          }
+          if (folded != src) {
+            inst.GetOperands()[1] = Operand::VReg(folded, function.GetVRegClass(folded));
+            src = folded;
+            changed = true;
+          }
+        }
+
+        if (dst == src) { to_delete[i] = true; changed = true; continue; }
+
+        // Dead copy: dst is not live after this instruction
+        if (!li.IsLiveAfter(dst, block.get(), static_cast<int>(i))) {
+          to_delete[i] = true;
+          changed = true;
+          continue;
+        }
+
+        invalidate(dst);  // dst gets a new value from here on; older facts about it are stale
+
+        // Backward propagation: src has 1 use + 1 def and the previous instruction defines it
+        if (i > 0 && !to_delete[i - 1] &&
+            use_counts[src] == 1 && def_counts[src] == 1) {
+          auto &prev = insts[i - 1];
+          int prev_def = GetDefVReg(prev);
+          if (prev_def == src &&
+              prev.GetOpcode() != Opcode::MovReg &&
+              prev.GetOpcode() != Opcode::Call &&
+              prev.GetOpcode() != Opcode::Br &&
+              prev.GetOpcode() != Opcode::CondBr) {
+            for (auto &op_ref : const_cast<MachineInstr &>(prev).GetOperands()) {
+              if (op_ref.GetKind() == Operand::Kind::VReg &&
+                  op_ref.GetVRegId() == src) {
+                op_ref = Operand::VReg(dst, function.GetVRegClass(dst));
+                break;
+              }
+            }
+            to_delete[i] = true;
+            changed = true;
+            continue;
+          }
+        }
+
+        copies[dst] = src;
+        continue;
+      }
+
+      // ---- Invalidate copies clobbered by this (non-MovReg) instruction ----
+      if (int def = GetDefVReg(inst); def >= 0) invalidate(def);
+    }
+
+    std::vector<MachineInstr> new_insts;
+    for (size_t i = 0; i < insts.size(); ++i) {
+      if (to_delete[i]) continue;
+      new_insts.push_back(std::move(const_cast<MachineInstr &>(insts[i])));
+    }
+    insts = std::move(new_insts);  // `changed` was set wherever to_delete was set
+  }
+
+  return changed;
+}
+
+// ---- Pass 3: StoreStack+LoadStack folding ----
+
+static bool FoldStoreLoadPairs(MachineFunction &function) {
+  bool changed = false;
+
+  for (auto &block : function.GetBlocks()) {
+    auto &insts = block->GetInstructions();
+
+    for (size_t i = 0; i + 1 < insts.size(); ++i) {
+      auto &si = insts[i];
+      if (si.GetOpcode() != Opcode::StoreStack) continue;
+      const auto &sops = si.GetOperands();
+      if (sops.size() < 2) continue;
+      if (sops[1].GetKind() != Operand::Kind::FrameIndex) continue;
+      int slot = sops[1].GetFrameIndex();
+      // Stored vreg, or -1: the forward is only valid while it still holds that value.
+      int stored = sops[0].GetKind() == Operand::Kind::VReg ? sops[0].GetVRegId() : -1;
+
+      for (size_t j = i + 1; j < insts.size(); ++j) {
+        auto &lj = insts[j];
+        const auto &lops = lj.GetOperands();
+        const bool same_slot = lops.size() >= 2 &&
+                               lops[1].GetKind() == Operand::Kind::FrameIndex &&
+                               lops[1].GetFrameIndex() == slot;
+        // A later store to the slot supersedes this one.
+        if (lj.GetOpcode() == Opcode::StoreStack && same_slot) break;
+        if (lj.GetOpcode() != Opcode::LoadStack || !same_slot) {
+          // Source vreg rewritten before any reload: slot and vreg no longer agree.
+          if (stored >= 0 && GetDefVReg(lj) == stored) break;
+          continue;
+        }
+
+        if (lops[0].GetKind() == Operand::Kind::VReg &&
+            sops[0].GetKind() == Operand::Kind::VReg) {
+          int load_dst = lops[0].GetVRegId();
+          int store_src = sops[0].GetVRegId();
+          if (function.GetVRegClass(load_dst) == function.GetVRegClass(store_src)) {
+            const_cast<MachineInstr &>(lj) = MachineInstr(
+                Opcode::MovReg,
+                {Operand::VReg(load_dst, function.GetVRegClass(load_dst)),
+                 Operand::VReg(store_src, function.GetVRegClass(store_src))});
+            changed = true;
+          }
+        } else if (lops[0].GetKind() == Operand::Kind::Reg &&
+                   sops[0].GetKind() == Operand::Kind::Reg) {
+          const_cast<MachineInstr &>(lj) = MachineInstr(
+              Opcode::MovReg,
+              {Operand::Reg(lops[0].GetReg()),
+               Operand::Reg(sops[0].GetReg())});
+          changed = true;
+        }
+        break;  // only the first reload of the slot is considered for this store
+      }
+    }
+  }
+
+  return changed;
+}
+
+} // namespace
+
+void RunCopyPropagation(MachineFunction &function) {
+  // Re-run the three sub-passes until a round reports no change, for at most 5 rounds.
+  constexpr int kMaxRounds = 5;
+  for (int round = 0; round < kMaxRounds; ++round) {
+    const bool dead = EliminateDeadAndSelfCopies(function);
+    const bool folded = FoldStoreLoadPairs(function);
+    const bool propagated = RunCopyPropagationPass(function);
+    const bool any_change = dead || folded || propagated;
+    if (!any_change) break;
+  }
+}
+
+void RunCopyPropagation(MachineModule &module) {  // runs the pass on each function of the module
+  for (auto &func : module.GetFunctions())
+    if (func) RunCopyPropagation(*func);  // skip empty slots, like RunFoldImm and VerifyMIR
+}
+
+} // namespace mir
diff --git a/src/mir/passes/FoldImm.cpp b/src/mir/passes/FoldImm.cpp
new file mode 100644
index 00000000..9a2761de
--- /dev/null
+++ b/src/mir/passes/FoldImm.cpp
@@ -0,0 +1,112 @@
+// FoldImm —— MIR 立即数折叠 pass
+// 将 MovImm + 算术指令折叠为立即数变体（当前禁用，待修复正确性 bug）
+
+#include "mir/MIR.h"
+#include "mir/MachineRegisterInfo.h"
+
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+namespace mir {
+
+static bool FitAddSubImm(int imm) { return 0 <= imm && imm < 4096; }  // accepts 0..4095
+
+static void FoldImmOneFunc(MachineFunction &mf) {
+  int nv = mf.GetNumVRegs();
+  if (nv == 0) return;
+
+  std::vector<int> use_counts(nv, 0), def_counts(nv, 0);
+  struct UseLoc { MachineBasicBlock *block; int idx; };
+  std::unordered_map<int, UseLoc> use_loc;  // last instruction seen reading each vreg
+
+  for (auto &block : mf.GetBlocks()) {
+    auto &insts = block->GetInstructions();
+    for (int i = 0; i < (int)insts.size(); ++i) {
+      auto du = MachineRegisterInfo::GetInstDefUse(insts[i]);
+      for (int d : du.defs) if (d >= 0 && d < nv) def_counts[d]++;
+      for (int u : du.uses) if (u >= 0 && u < nv) { use_counts[u]++; use_loc[u] = {block.get(), i}; }
+    }
+    // Successor args (phi passing) count as uses too, but have no instruction location
+    for (auto &succ : block->GetSuccessors())
+      for (int v : succ.args)
+        if (v >= 0 && v < nv) use_counts[v]++;
+  }
+
+  struct BlockFolds { std::map<int, int> mods; std::set<int> removals; };
+  std::unordered_map<MachineBasicBlock *, BlockFolds> block_folds;
+
+  for (auto &block : mf.GetBlocks()) {
+    auto &insts = block->GetInstructions();
+    for (int i = 0; i < (int)insts.size(); ++i) {
+      auto &inst = insts[i];
+      if (inst.GetOpcode() != Opcode::MovImm) continue;
+      auto &ops = inst.GetOperands();
+      if (ops.size() < 2) continue;
+      if (ops[0].GetKind() != Operand::Kind::VReg) continue;
+      if (ops[1].GetKind() != Operand::Kind::Imm) continue;
+      int vreg = ops[0].GetVRegId();
+      int imm = ops[1].GetImm();
+      if (vreg < 0 || vreg >= nv) continue;
+      if (use_counts[vreg] != 1 || def_counts[vreg] != 1) continue;  // sole def, single use
+      if (!FitAddSubImm(imm)) continue;
+      auto loc = use_loc.find(vreg);
+      if (loc == use_loc.end()) continue;  // the single use is a successor arg, not an instruction
+      auto &ul = loc->second;
+      auto &use_inst = ul.block->GetInstructions()[ul.idx];
+      auto u_op = use_inst.GetOpcode();
+      auto &u_ops = use_inst.GetOperands();
+      auto reads = [vreg](const Operand &o) {
+        return o.GetKind() == Operand::Kind::VReg && o.GetVRegId() == vreg;
+      };
+
+      bool valid = false;  // only the last source folds; the other source must not be `vreg`
+      if ((u_op == Opcode::AddRR || u_op == Opcode::SubRR) && u_ops.size() >= 3)
+        valid = u_ops[0].GetKind() == Operand::Kind::VReg && !reads(u_ops[1]) && reads(u_ops[2]);
+      else if (u_op == Opcode::CmpRR && u_ops.size() >= 2)
+        valid = !reads(u_ops[0]) && reads(u_ops[1]);
+
+      if (valid) {
+        block_folds[ul.block].mods[ul.idx] = imm;
+        block_folds[block.get()].removals.insert(i);
+      }
+    }
+  }
+
+  for (auto &[block, bf] : block_folds) {  // indices in `bf` refer to the unmodified block
+    auto &insts = block->GetInstructions();
+    std::vector<MachineInstr> ni;
+    for (int i = 0; i < (int)insts.size(); ++i) {
+      if (bf.removals.count(i)) continue;
+      auto mit = bf.mods.find(i);
+      if (mit != bf.mods.end()) {
+        auto &inst = insts[i];
+        auto &u_ops = inst.GetOperands();
+        Opcode old_op = inst.GetOpcode();
+        int imm = mit->second;
+        if (old_op == Opcode::AddRR) {
+          std::vector<Operand> nops = {u_ops[0], u_ops[1], Operand::Imm(imm)};
+          ni.push_back(MachineInstr(Opcode::AddImm, std::move(nops)));
+        } else if (old_op == Opcode::SubRR) {
+          std::vector<Operand> nops = {u_ops[0], u_ops[1], Operand::Imm(imm)};
+          ni.push_back(MachineInstr(Opcode::SubImm, std::move(nops)));
+        } else if (old_op == Opcode::CmpRR) {
+          std::vector<Operand> nops = {u_ops[0], Operand::Imm(imm)};
+          ni.push_back(MachineInstr(Opcode::CmpImm, std::move(nops)));
+        }
+      } else {
+        ni.push_back(insts[i]);
+      }
+    }
+    insts = std::move(ni);
+  }
+}
+
+void RunFoldImm(MachineFunction &mf) { FoldImmOneFunc(mf); }  // single-function entry point
+void RunFoldImm(MachineModule &module) {  // folds every non-null function of the module
+  for (auto &func : module.GetFunctions())
+    if (func) FoldImmOneFunc(*func);
+}
+
+} // namespace mir
diff --git a/src/mir/passes/LiveRangeSplit.cpp b/src/mir/passes/LiveRangeSplit.cpp
new file mode 100644
index 00000000..b49f0449
--- /dev/null
+++ b/src/mir/passes/LiveRangeSplit.cpp
@@ -0,0 +1,192 @@
+// LiveRangeSplit —— 基本块级活范围分裂
+// 在寄存器分配之前运行，减少干涉图密度：
+//
+// 策略：对在多个后继块中使用的 vreg，在定义块出口为每个后继创建
+// 独立副本。每个副本的活范围限于单个后继块，大幅减少跨块干涉。
+// 后续 MovReg 由 RegisterCoalescer 尽可能合并消除。
+
+#include "mir/MIR.h"
+#include "mir/MachineRegisterInfo.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace mir {
+
+namespace {
+
+// Simple CFG successor analysis: result[i] lists the indices of the blocks that block i can reach directly.
+static std::vector<std::vector<int>> BuildSuccessors(MachineFunction &function) {
+  auto &blocks = function.GetBlocks();
+  int n = static_cast<int>(blocks.size());
+  std::vector<std::vector<int>> succs(n);
+
+  std::unordered_map<int, int> label_to_idx; // block label id -> index in `blocks`
+  for (int i = 0; i < n; ++i)
+    label_to_idx[blocks[i]->GetLabelId()] = i;
+
+  for (int i = 0; i < n; ++i) {
+    bool has_br = false, has_condbr = false, has_ret = false;
+    for (auto &inst : blocks[i]->GetInstructions()) { // every instruction is inspected, not only the last one
+      if (inst.GetOpcode() == Opcode::Br && inst.GetOperands().size() >= 1 &&
+          inst.GetOperands()[0].GetKind() == Operand::Kind::Label) { // Br: target label is operand 0
+        has_br = true;
+        auto it = label_to_idx.find(inst.GetOperands()[0].GetLabel());
+        if (it != label_to_idx.end()) succs[i].push_back(it->second); // labels that match no block are ignored
+      }
+      if (inst.GetOpcode() == Opcode::CondBr && inst.GetOperands().size() >= 2 &&
+          inst.GetOperands()[1].GetKind() == Operand::Kind::Label) { // CondBr: target label is operand 1
+        has_condbr = true;
+        auto it = label_to_idx.find(inst.GetOperands()[1].GetLabel());
+        if (it != label_to_idx.end()) succs[i].push_back(it->second);
+      }
+      if (inst.GetOpcode() == Opcode::Ret) has_ret = true;
+    }
+    if ((has_condbr || (!has_br && !has_ret)) && i + 1 < n) // fall-through edge to the next block in layout order
+      succs[i].push_back(i + 1); // added for any CondBr, even when a Br follows it (may over-approximate)
+  }
+  return succs;
+}
+
+// Block-level liveness: the four vreg-id sets kept for one basic block
+struct BlockLive {
+  std::unordered_set<int> live_in, live_out, def, use; // `use` holds vregs read before any def in the block
+};
+
+static std::vector<BlockLive> ComputeBlockLiveness( // Returns one BlockLive per block, indexed like function.GetBlocks().
+    MachineFunction &function, const std::vector<std::vector<int>> &succs) { // succs[i]: successor indices of block i
+  auto &blocks = function.GetBlocks();
+  int n = static_cast<int>(blocks.size());
+  std::vector<BlockLive> bl(n);
+
+  for (int i = 0; i < n; ++i) { // local pass: fill def / use from the instructions in program order
+    for (auto &inst : blocks[i]->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int u : du.uses)
+        if (bl[i].def.find(u) == bl[i].def.end()) // only reads that precede every def in this block
+          bl[i].use.insert(u);
+      for (int d : du.defs)
+        bl[i].def.insert(d);
+    }
+  }
+
+  bool changed = true;
+  while (changed) { // iterate until no live_in / live_out set changes
+    changed = false;
+    for (int i = n - 1; i >= 0; --i) { // blocks are visited in reverse layout order
+      std::unordered_set<int> new_out; // live_out = union of the successors' live_in
+      for (int s : succs[i])
+        for (int v : bl[s].live_in)
+          new_out.insert(v);
+      if (new_out != bl[i].live_out) { bl[i].live_out = std::move(new_out); changed = true; }
+
+      std::unordered_set<int> new_in = bl[i].use; // live_in = use + (live_out - def)
+      for (int v : bl[i].live_out)
+        if (bl[i].def.find(v) == bl[i].def.end())
+          new_in.insert(v);
+      if (new_in != bl[i].live_in) { bl[i].live_in = std::move(new_in); changed = true; }
+    }
+  }
+  return bl;
+}
+
+static bool RunSplitOnFunction(MachineFunction &function) { // One splitting round; returns true if any vreg was split.
+  auto succs = BuildSuccessors(function);
+  auto bl = ComputeBlockLiveness(function, succs);
+  auto &blocks = function.GetBlocks();
+  int n = static_cast<int>(blocks.size());
+  if (n < 2) return false;
+
+  // Function-wide def count per vreg (the MIR is not SSA here: a vreg may be defined several times)
+  std::unordered_map<int, int> def_counts;
+  for (auto &block : blocks)
+    for (auto &inst : block->GetInstructions())
+      for (int d : MachineRegisterInfo::GetInstDefUse(inst).defs)
+        def_counts[d]++;
+
+  bool changed = false;
+
+  for (int bi = 0; bi < n; ++bi) {
+    auto &live_out = bl[bi].live_out;
+    if (live_out.empty()) continue;
+
+    // For every live-out vreg, check whether it is used in several successors
+    for (int vreg : live_out) {
+      if (vreg < 0 || vreg >= function.GetNumVRegs()) continue;
+      if (def_counts[vreg] > 1) continue;  // only vregs with at most one definition are considered safe
+      VRegClass vc = function.GetVRegClass(vreg);
+
+      // Collect the successors in which this vreg is live-in or used
+      std::vector<int> using_succs;
+      for (int s : succs[bi]) {
+        if (bl[s].live_in.count(vreg) || bl[s].use.count(vreg))
+          using_succs.push_back(s);
+      }
+
+      // Splitting is only attempted with 2+ using successors and 2+ successors overall
+      if (using_succs.size() < 2) continue;
+      if (succs[bi].size() < 2) continue;
+
+      // Split only when none of the using successors redefines the vreg.
+      // NOTE: `bl` is the liveness computed at function entry; it is not refreshed after a split.
+      bool safe = true;
+      for (int s : using_succs) {
+        if (bl[s].def.count(vreg)) { safe = false; break; }
+      }
+      if (!safe) continue;
+
+      // Create a fresh copy at the entry of every using successor
+      for (int s : using_succs) {
+        int new_vreg = function.CreateVReg(vc);
+
+        auto &s_insts = blocks[s]->GetInstructions();
+        // Insert the copy as the first instruction of the block
+        s_insts.insert(s_insts.begin(),
+          MachineInstr(Opcode::MovReg, {
+            Operand::VReg(new_vreg, vc),
+            Operand::VReg(vreg, vc)
+          }));
+
+        // Rewrite every VReg operand of this block that names vreg to new_vreg
+        for (auto &inst : blocks[s]->GetInstructions()) {
+          // Skip the MovReg that was just inserted (it is the first instruction)
+          if (inst.GetOpcode() == Opcode::MovReg &&
+              inst.GetOperands().size() >= 2 &&
+              inst.GetOperands()[0].GetKind() == Operand::Kind::VReg &&
+              inst.GetOperands()[0].GetVRegId() == new_vreg) continue;
+
+          for (auto &op : inst.GetOperands()) {
+            if (op.GetKind() == Operand::Kind::VReg && op.GetVRegId() == vreg) {
+              op = Operand::VReg(new_vreg, vc);
+            }
+          }
+        }
+      }
+
+      // After the rewrite, vreg is only read by the inserted MovReg inside each using successor;
+      // blocks further down that still name vreg are not touched by this round.
+      changed = true;
+    }
+  }
+
+  return changed;
+}
+
+} // namespace
+
+void RunLiveRangeSplit(MachineFunction &function) {
+  // Runs the block-level live-range split on `function` exactly once.
+  // A split leaves the original vreg live-in to every successor it touched (the inserted
+  // MovReg reads it) and does not change its def count, so a further round would select the
+  // same vregs again and only stack another MovReg in front of the previous one.
+  // Repeating the round therefore finds no new work; it just adds redundant copies.
+  RunSplitOnFunction(function);
+}
+
+void RunLiveRangeSplit(MachineModule &module) { // Splits live ranges in every function of the module.
+  for (auto &func : module.GetFunctions())
+    if (func) RunLiveRangeSplit(*func); // an empty function slot is skipped instead of dereferenced
+}
+
+} // namespace mir
diff --git a/src/mir/passes/MIRCleanup.cpp b/src/mir/passes/MIRCleanup.cpp
new file mode 100644
index 00000000..7bbca2a5
--- /dev/null
+++ b/src/mir/passes/MIRCleanup.cpp
@@ -0,0 +1,100 @@
+#include "mir/MIR.h"
+
+#include "utils/Log.h"
+
+namespace mir
+{
+namespace
+{
+
+// MovImm forwarding: mov v1, #N; mov v2, v1 (v1 referenced nowhere else) -> mov v2, #N
+static void ForwardMovImm(MachineFunction &function)
+{
+  for (auto &block : function.GetBlocks())
+  {
+    if (!block)
+      continue;
+    auto &insts = block->GetInstructions();
+
+    for (auto it = insts.begin(); it != insts.end(); ++it)
+    {
+      if (it->GetOpcode() != Opcode::MovImm)
+        continue;
+      const auto &ops = it->GetOperands();
+      if (ops.size() < 2 || ops[0].GetKind() != Operand::Kind::VReg)
+        continue;
+
+      int src_vreg = ops[0].GetVRegId(); // vreg written by the MovImm
+      int imm_val = ops[1].GetImm();
+
+      auto next = std::next(it); // only the directly following instruction is considered
+      if (next == insts.end())
+        continue;
+      if (next->GetOpcode() != Opcode::MovReg)
+        continue;
+
+      const auto &n_ops = next->GetOperands();
+      if (n_ops.size() < 2)
+        continue;
+      if (n_ops[1].GetKind() != Operand::Kind::VReg ||
+          n_ops[1].GetVRegId() != src_vreg)
+        continue;
+      if (n_ops[0].GetKind() != Operand::Kind::VReg)
+        continue;
+
+      int dst_vreg = n_ops[0].GetVRegId();
+
+      // Check whether src_vreg is referenced by any other instruction of the function
+      bool other_use = false;
+      for (auto &b2 : function.GetBlocks())
+      {
+        if (!b2)
+          continue;
+        for (auto &inst2 : b2->GetInstructions())
+        {
+          for (const auto &op : inst2.GetOperands())
+          {
+            if (op.GetKind() == Operand::Kind::VReg &&
+                op.GetVRegId() == src_vreg)
+            {
+              // Ignore the MovImm itself (the def) and the MovReg (the use); any other mention counts
+              MachineInstr *mi_ptr = const_cast<MachineInstr *>(&inst2);
+              if (mi_ptr != &(*it) && mi_ptr != &(*next))
+              {
+                other_use = true;
+                goto done_check;
+              }
+            }
+          }
+        }
+      }
+    done_check:
+
+      if (!other_use)
+      {
+        *next = MachineInstr(Opcode::MovImm, // the MovReg becomes the MovImm into dst_vreg
+                             {Operand::VReg(dst_vreg, function.GetVRegClass(dst_vreg)),
+                              Operand::Imm(imm_val)});
+        it = insts.erase(it); // `it` now names the rewritten instruction; the loop's ++it steps past it
+      }
+    }
+  }
+}
+
+} // namespace
+
+void RunMIRCleanup(MachineFunction &function)
+{ // Cleanup entry point for one function; the only step it runs is ForwardMovImm.
+  ForwardMovImm(function);
+}
+
+void RunMIRCleanup(MachineModule &module)
+{
+  for (auto &fn : module.GetFunctions())
+  {
+    if (!fn) continue; // empty slot: nothing to clean up
+    RunMIRCleanup(*fn); // per-function cleanup
+  }
+}
+
+} // namespace mir
diff --git a/src/mir/passes/PhiElimination.cpp b/src/mir/passes/PhiElimination.cpp
new file mode 100644
index 00000000..fd2e770f
--- /dev/null
+++ b/src/mir/passes/PhiElimination.cpp
@@ -0,0 +1,114 @@
+// PhiElimination —— MIR SSA 销毁 pass
+//
+// 将 SSA 块参数（block_args / successor args）降级为前驱块中的显式 MovReg 指令。
+// 在寄存器分配之前运行，使分配器在标准非 SSA MIR 上工作。
+//
+// 同时收集 phi 元数据（phi_pairs、phi_block_arg_block）并存储在 MachineFunction 上，
+// 供 GreedyAlloc 的 coalescing 和 copy hint 使用。
+//
+// 步骤：
+// 1. 构建前驱映射
+// 2. 对每个有 block_args 的块，在每条前驱边中插入 MovReg(dst=block_arg, src=succ_arg)
+// 3. 收集 phi 连接信息 → 写入 MachineFunction
+// 4. 清除所有 block_args 和 successors（MIR 不再是 SSA 形式）
+//
+// 参照 LLVM PHIElimination.cpp —— 同样的核心思想：PHI → 显式 COPY
+
+#include "mir/MIR.h"
+
+#include <unordered_map>
+#include <vector>
+
+namespace mir
+{
+
+void RunPhiElimination(MachineFunction &function)
+{
+  // ---- 1. Build the predecessor map: successor label id -> blocks that list it ----
+  std::unordered_map<int, std::vector<MachineBasicBlock *>> preds;
+  for (auto &block : function.GetBlocks())
+    for (const auto &succ : block->GetSuccessors())
+      preds[succ.label].push_back(block.get());
+
+  // ---- 2. Collect phi metadata ----
+  // phi_pairs: vreg -> the vregs it is phi-connected to (consumed as copy hints)
+  std::unordered_map<int, std::vector<int>> phi_pairs;
+  // phi_block_arg_block: block-arg vreg -> the block declaring it (consumed for interference exclusion)
+  std::unordered_map<int, MachineBasicBlock *> phi_block_arg_block;
+  for (auto &block : function.GetBlocks())
+    for (int v : block->GetBlockArgs())
+      phi_block_arg_block[v] = block.get();
+
+  // ---- 3. Lower block_args to MovReg copies placed in each predecessor ----
+  for (auto &block : function.GetBlocks())
+  {
+    const auto &block_args = block->GetBlockArgs();
+    if (block_args.empty()) continue;
+    auto pit = preds.find(block->GetLabelId());
+    if (pit == preds.end()) continue;
+    for (auto *pred : pit->second)
+    {
+      const auto &succs = pred->GetSuccessors();
+      int succ_idx = -1; // first successor entry of `pred` that targets `block`
+      for (size_t si = 0; si < succs.size(); ++si)
+        if (succs[si].label == block->GetLabelId()) { succ_idx = static_cast<int>(si); break; }
+      if (succ_idx < 0) continue;
+      const auto &succ_args = succs[succ_idx].args;
+      if (succ_args.empty()) continue;
+
+      auto &pred_insts = pred->GetInstructions();
+      // Insert in front of the last Br/CondBr/Ret of the predecessor (at the end if there is none)
+      int insert_pos = static_cast<int>(pred_insts.size());
+      for (int i = static_cast<int>(pred_insts.size()) - 1; i >= 0; --i)
+      {
+        auto op = pred_insts[i].GetOpcode();
+        if (op == Opcode::Br || op == Opcode::CondBr || op == Opcode::Ret) { insert_pos = i; break; }
+      }
+      // The copies of one edge form a parallel copy but are emitted one after another. A source
+      // that an earlier copy of this edge overwrites (e.g. the swap a,b <- b,a) is saved into a
+      // fresh temporary first, so the later copy still reads the value from before the edge.
+      std::unordered_map<int, int> saved; // overwritten source vreg -> temporary holding its old value
+      for (size_t ai = 0; ai < block_args.size() && ai < succ_args.size(); ++ai)
+      {
+        int src_vreg = succ_args[ai];
+        bool clobbered = false; // does an earlier, non-trivial copy of this edge write src_vreg?
+        for (size_t k = 0; k < ai && !clobbered; ++k)
+          clobbered = block_args[k] == src_vreg && succ_args[k] != src_vreg;
+        if (!clobbered || block_args[ai] == src_vreg || saved.count(src_vreg)) continue;
+        VRegClass tc = function.GetVRegClass(src_vreg);
+        int tmp_vreg = function.CreateVReg(tc);
+        pred_insts.insert(pred_insts.begin() + insert_pos,
+            MachineInstr(Opcode::MovReg, {Operand::VReg(tmp_vreg, tc), Operand::VReg(src_vreg, tc)}));
+        insert_pos++;
+        saved[src_vreg] = tmp_vreg;
+      }
+      for (size_t ai = 0; ai < block_args.size() && ai < succ_args.size(); ++ai)
+      {
+        int dst_vreg = block_args[ai];
+        int src_vreg = succ_args[ai];
+        if (dst_vreg == src_vreg) continue;
+        int from_vreg = saved.count(src_vreg) ? saved[src_vreg] : src_vreg; // read the saved value if any
+        VRegClass vc = function.GetVRegClass(dst_vreg);
+        pred_insts.insert(pred_insts.begin() + insert_pos,
+            MachineInstr(Opcode::MovReg, {Operand::VReg(dst_vreg, vc), Operand::VReg(from_vreg, vc)}));
+        insert_pos++;
+        phi_pairs[dst_vreg].push_back(src_vreg); // record the phi connection in both directions
+        phi_pairs[src_vreg].push_back(dst_vreg);
+      }
+    }
+  }
+
+  // ---- 4. Store the phi metadata on the MachineFunction ----
+  function.SetPhiPairs(std::move(phi_pairs));
+  function.SetPhiBlockArgBlock(std::move(phi_block_arg_block));
+  // ---- 5. Clear successor info and block args (everything has been lowered to MovReg) ----
+  for (auto &block : function.GetBlocks())
+  { block->ClearSuccessors(); block->ClearBlockArgs(); }
+}
+
+void RunPhiElimination(MachineModule &module)
+{ // Lowers block arguments in every function of the module.
+  for (auto &func : module.GetFunctions())
+    if (func) RunPhiElimination(*func); // an empty function slot is skipped instead of dereferenced
+}
+
+} // namespace mir
diff --git a/src/mir/passes/PhysRegCopyProp.cpp b/src/mir/passes/PhysRegCopyProp.cpp
new file mode 100644
index 00000000..469abd69
--- /dev/null
+++ b/src/mir/passes/PhysRegCopyProp.cpp
@@ -0,0 +1,296 @@
+// Post-RA Physical Register Copy Propagation
+// 使用标准 PhysReg 数据流分析 + 块内后向活跃检测。
+
+#include "mir/MIR.h"
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace mir {
+namespace {
+
+static bool HasRegDef(Opcode op) { // false for the opcodes listed below, true for every other opcode
+  switch (op) {
+  case Opcode::StoreStack: case Opcode::StoreGlobal:
+  case Opcode::StoreMem: case Opcode::StrQ:
+  case Opcode::Br: case Opcode::CondBr:
+  case Opcode::Call: case Opcode::Ret:
+  case Opcode::Prologue: case Opcode::Epilogue:
+  case Opcode::CmpRR: case Opcode::CmpImm: case Opcode::FCmpRR: // flags only
+    return false;
+  default: return true; // callers then treat a Reg-kind operand 0 as the defined register
+  }
+}
+
+static bool IsCopy(const MachineInstr &inst) { // true for a MovReg whose first two operands are both Reg-kind
+  if (inst.GetOpcode() != Opcode::MovReg) return false;
+  const auto &operands = inst.GetOperands();
+  if (operands.size() < 2) return false;
+  return operands[0].GetKind() == Operand::Kind::Reg && operands[1].GetKind() == Operand::Kind::Reg;
+}
+
+static std::vector<PhysReg> GetDefs(const MachineInstr &inst) { // Physical registers that `inst` is treated as writing.
+  std::vector<PhysReg> d;
+  if (HasRegDef(inst.GetOpcode()) && !inst.GetOperands().empty() &&
+      inst.GetOperands()[0].GetKind() == Operand::Kind::Reg)
+    d.push_back(inst.GetOperands()[0].GetReg()); // explicit def: operand 0
+  if (inst.GetOpcode() == Opcode::Call) {
+    for (int i=0;i<=18;++i) d.push_back((PhysReg)((int)PhysReg::W0+i)); // GP caller-saved
+    for (int i=0;i<=7;++i)  d.push_back((PhysReg)((int)PhysReg::S0+i)); // float caller-saved
+    // s8-s31 are treated as callee-saved here. NOTE(review): AAPCS64 preserves only v8-v15; confirm how the allocator handles v16-v31.
+  }
+  return d;
+}
+
+static std::vector<PhysReg> GetUses(const MachineInstr &inst) { // Physical registers that `inst` is treated as reading.
+  std::vector<PhysReg> u;
+  const auto &ops = inst.GetOperands();
+  Opcode op = inst.GetOpcode();
+  bool is_store = (op == Opcode::StoreStack || op == Opcode::StoreGlobal ||
+                   op == Opcode::StoreMem || op == Opcode::StrQ);
+  int start = (!is_store && HasRegDef(op) && !ops.empty() && // skip operand 0 when it is the def
+               ops[0].GetKind()==Operand::Kind::Reg) ? 1 : 0;
+  for (size_t j=start;j<ops.size();++j)
+    if (ops[j].GetKind()==Operand::Kind::Reg) u.push_back(ops[j].GetReg());
+  if (is_store && !ops.empty() && ops[0].GetKind()==Operand::Kind::Reg)
+    u.push_back(ops[0].GetReg()); // already collected by the loop above (start is 0 for stores); listed twice
+  if (op == Opcode::Call) {
+    for (int i=0;i<=7;++i) u.push_back((PhysReg)((int)PhysReg::W0+i)); // the eight registers starting at W0
+    for (int i=0;i<=7;++i) u.push_back((PhysReg)((int)PhysReg::S0+i)); // float params
+  }
+  if (op == Opcode::Ret || op == Opcode::Epilogue) {
+    u.push_back(PhysReg::W0); u.push_back(PhysReg::X0);
+    u.push_back(PhysReg::S0); // float return value
+  }
+  return u;
+}
+
+// Wn/Xn alias handling: both names map to the same unit number
+static int RegUnit(PhysReg r) {
+  int v = (int)r;
+  if (r >= PhysReg::W0 && r <= PhysReg::W30) return v - (int)PhysReg::W0;
+  if (r >= PhysReg::X0 && r <= PhysReg::X30) return v - (int)PhysReg::X0;
+  return v + 1000; // not a GP register: offset so the value cannot collide with a GP unit
+}
+static PhysReg WReg(int unit) { return (PhysReg)((int)PhysReg::W0 + unit); } // W name of a GP unit
+static PhysReg XReg(int unit) { return (PhysReg)((int)PhysReg::X0 + unit); } // X name of a GP unit
+
+// Appends the concrete names of r to out: both Wn and Xn for a general-purpose unit, otherwise r itself
+static void ExpandAlias(PhysReg r, std::vector<PhysReg> &out) {
+  const int gp_unit = RegUnit(r);
+  if (gp_unit >= 32) { out.push_back(r); return; }
+  for (PhysReg name : {WReg(gp_unit), XReg(gp_unit)}) out.push_back(name);
+}
+
+static bool Run(MachineFunction &function) { // One copy-propagation round; returns true if anything changed.
+  bool changed = false;
+  auto &blocks = function.GetBlocks();
+  size_t n = blocks.size();
+  if (n == 0) return false;
+
+  // ---- PhysReg dataflow: compute live_in / live_out for every block ----
+  std::vector<std::unordered_set<PhysReg>> block_use(n), block_def(n);
+  std::vector<std::unordered_set<PhysReg>> live_in(n), live_out(n);
+  std::unordered_map<const MachineBasicBlock*,size_t> b2i;
+  for (size_t i=0;i<n;++i) b2i[blocks[i].get()]=i;
+
+  auto insertAlias = [](std::unordered_set<PhysReg> &s, PhysReg r) {
+    std::vector<PhysReg> expanded; ExpandAlias(r, expanded);
+    for (auto er : expanded) s.insert(er);
+  };
+
+  for (size_t i=0;i<n;++i) {
+    for (auto &inst : blocks[i]->GetInstructions()) {
+      for (auto r : GetDefs(inst)) insertAlias(block_def[i], r);
+      for (auto r : GetUses(inst)) insertAlias(block_use[i], r);
+    }
+  }
+
+  // CFG successors
+  std::unordered_map<int,size_t> l2i;
+  for (size_t i=0;i<n;++i) l2i[blocks[i]->GetLabelId()]=i;
+  std::vector<std::vector<size_t>> succs(n);
+  for (size_t i=0;i<n;++i) {
+    bool has_br=false, has_condbr=false, has_ret=false;
+    for (auto &inst : blocks[i]->GetInstructions()) {
+      auto get=[&](int lbl){auto it=l2i.find(lbl);if(it!=l2i.end())succs[i].push_back(it->second);};
+      const Opcode op=inst.GetOpcode(); const auto &ops=inst.GetOperands();
+      if (op==Opcode::Br && ops.size()>=1 && ops[0].GetKind()==Operand::Kind::Label)
+        { has_br=true; get(ops[0].GetLabel()); }
+      if (op==Opcode::CondBr && ops.size()>=2 && ops[1].GetKind()==Operand::Kind::Label)
+        { has_condbr=true; get(ops[1].GetLabel()); }
+      if (op==Opcode::Ret) has_ret=true;
+    }
+    // Fall-through to the next block: after a CondBr, and also when the block has no Br/Ret at all
+    if ((has_condbr || (!has_br && !has_ret)) && i+1 < n) succs[i].push_back(i+1);
+  }
+
+  // Iterate the dataflow equations until nothing changes
+  bool df_changed=true;
+  while (df_changed) { df_changed=false;
+    for (int i=(int)n-1;i>=0;--i) {
+      std::unordered_set<PhysReg> new_out;
+      for (auto s : succs[i]) for (auto r : live_in[s]) new_out.insert(r);
+      if (new_out != live_out[i]) { live_out[i]=std::move(new_out); df_changed=true; }
+      std::unordered_set<PhysReg> new_in=block_use[i];
+      for (auto r : live_out[i]) if (!block_def[i].count(r)) new_in.insert(r);
+      if (new_in != live_in[i]) { live_in[i]=std::move(new_in); df_changed=true; }
+    }
+  }
+
+  // ---- Per-block optimization ----
+  for (size_t bi=0;bi<n;++bi) {
+    auto &insts = blocks[bi]->GetInstructions();
+    if (insts.size() < 1) continue;
+    int ni = (int)insts.size();
+
+    // Backward liveness inside the block, seeded with live_out; la[i] is the set live before instruction i
+    std::vector<std::unordered_set<PhysReg>> la(ni+1);
+    la[ni] = live_out[bi];
+    for (int i=ni-1;i>=0;--i) {
+      la[i] = la[i+1];
+      for (auto r : GetDefs(insts[i])) { std::vector<PhysReg> ex; ExpandAlias(r,ex); for(auto er:ex) la[i].erase(er); }
+      for (auto r : GetUses(insts[i])) { std::vector<PhysReg> ex; ExpandAlias(r,ex); for(auto er:ex) la[i].insert(er); }
+      if (insts[i].GetOpcode()==Opcode::Ret || insts[i].GetOpcode()==Opcode::Epilogue) {
+        la[i].insert(PhysReg::W0); la[i].insert(PhysReg::X0); la[i].insert(PhysReg::S0);
+      }
+    }
+
+    // Forward scan: copy tracking + dead-copy removal
+    struct Avail { PhysReg src; size_t idx; };
+    std::unordered_map<PhysReg,Avail> copies;
+    std::vector<bool> to_del(ni,false);
+
+    for (int i=0;i<ni;++i) {
+      auto &inst = const_cast<MachineInstr&>(insts[i]);
+
+      // Forward propagation + consuming the copies that get used
+      // First collect the instruction's def set (aliases included) for the forwarding safety check
+      std::unordered_set<PhysReg> inst_def_set;
+      for (auto d : GetDefs(inst)) {
+        std::vector<PhysReg> dex; ExpandAlias(d, dex);
+        for (auto ed : dex) inst_def_set.insert(ed);
+      }
+      {
+        Opcode op = inst.GetOpcode();
+        auto &ops = inst.GetOperands();
+        bool is_store = (op == Opcode::StoreStack || op == Opcode::StoreGlobal ||
+                         op == Opcode::StoreMem || op == Opcode::StrQ);
+        int use_start = (!is_store && HasRegDef(op) && !ops.empty() &&
+                         ops[0].GetKind() == Operand::Kind::Reg) ? 1 : 0;
+        // Forward propagation: replace an explicit use of a copy's dst with the copy's src
+        for (size_t j = use_start; j < ops.size(); ++j) {
+          if (ops[j].GetKind() != Operand::Kind::Reg) continue;
+          PhysReg use_reg = ops[j].GetReg();
+          auto it = copies.find(use_reg);
+          if (it == copies.end()) continue;
+          PhysReg fwd_src = it->second.src;
+          // Safety check: the instruction must not itself define fwd_src (aliases included)
+          if (!inst_def_set.count(fwd_src)) {
+            ops[j] = Operand::Reg(fwd_src);
+            changed = true;
+          }
+          copies.erase(it);
+        }
+        // Operand 0 of a store is a use
+        if (is_store && !ops.empty() && ops[0].GetKind() == Operand::Kind::Reg) {
+          PhysReg use_reg = ops[0].GetReg();
+          auto it = copies.find(use_reg);
+          if (it != copies.end()) {
+            PhysReg fwd_src = it->second.src;
+            if (!inst_def_set.count(fwd_src)) {
+              ops[0] = Operand::Reg(fwd_src);
+              changed = true;
+            }
+            copies.erase(it);
+          }
+        }
+      }
+      // Alias-aware consumption: a use of either name of a register consumes a tracked copy.
+      // E.g. a use of x0 consumes `w0 = COPY ...` (w0 sets the low 32 bits of x0).
+      // Nothing is forwarded here (forwarding x0 -> x1 would be wrong: the upper 32 bits differ).
+      for (auto u : GetUses(inst)) {
+        std::vector<PhysReg> ex; ExpandAlias(u, ex);
+        for (auto eu : ex) { auto it = copies.find(eu); if (it != copies.end()) copies.erase(it); }
+      }
+
+      // Implicit uses: the argument registers of a Call, the return-value registers of a Ret.
+      // They only consume copies (no forwarding) because they are not explicit operands.
+      {
+        Opcode op = inst.GetOpcode();
+        if (op == Opcode::Call) {
+          for (int k = 0; k <= 7; ++k) {
+            copies.erase((PhysReg)((int)PhysReg::W0 + k));
+            copies.erase((PhysReg)((int)PhysReg::S0 + k));
+          }
+        }
+        if (op == Opcode::Ret || op == Opcode::Epilogue) {
+          copies.erase(PhysReg::W0); copies.erase(PhysReg::X0);
+          copies.erase(PhysReg::S0);
+        }
+      }
+
+      // A def invalidates tracked copies (aliases included)
+      for (auto d : GetDefs(inst)) {
+        std::vector<PhysReg> ex; ExpandAlias(d,ex);
+        for (auto ed : ex) {
+          auto cit=copies.find(ed);
+          if (cit!=copies.end()) {
+            bool any_live = false;
+            std::vector<PhysReg> dex; ExpandAlias(ed,dex);
+            for (auto ed2 : dex) if (la[i+1].count(ed2)) { any_live=true; break; }
+            if (!any_live) { to_del[cit->second.idx]=true; changed=true; }
+            copies.erase(cit);
+          }
+          auto it=copies.begin();
+          while (it!=copies.end()) {
+            bool src_match = false;
+            std::vector<PhysReg> sex; ExpandAlias(it->second.src,sex);
+            for (auto es : sex) if (es==ed) { src_match=true; break; }
+            if (src_match) { it=copies.erase(it); } else ++it;
+          }
+        }
+      }
+
+      // Handle a COPY
+      if (IsCopy(inst)) {
+        const auto &mo=inst.GetOperands();
+        PhysReg dst=mo[0].GetReg(), src=mo[1].GetReg();
+        if (dst==src) { to_del[i]=true; changed=true; continue; }
+        // Redundant copy removal: inverse pair (A=B while B=A is tracked) or duplicate (A=B while A=B is tracked)
+        auto inv_it = copies.find(src);
+        if (inv_it != copies.end() && inv_it->second.src == dst) {
+          to_del[i] = true; changed = true; continue;
+        }
+        auto dup_it = copies.find(dst);
+        if (dup_it != copies.end() && dup_it->second.src == src) {
+          to_del[i] = true; changed = true; continue;
+        }
+        copies[dst]={src,(size_t)i};
+      }
+    }
+
+    // Dead copies at the block end: a still-tracked copy whose dst is not in live_out can be deleted
+    for (auto &[dst_reg, info] : copies) {
+      bool live = false;
+      std::vector<PhysReg> ex; ExpandAlias(dst_reg, ex);
+      for (auto er : ex) if (live_out[bi].count(er)) { live = true; break; }
+      if (!live) { to_del[info.idx] = true; changed = true; }
+    }
+
+    // Apply the deletions
+    bool any=false;
+    for (bool d:to_del) if(d){any=true;break;}
+    if (any) {
+      std::vector<MachineInstr> newi;
+      for (int i=0;i<ni;++i) if(!to_del[i]) newi.push_back(std::move(const_cast<MachineInstr&>(insts[i])));
+      insts=std::move(newi);
+    }
+  }
+  return changed;
+}
+
+} // namespace
+void RunPhysRegCopyProp(MachineFunction &f) { for(int i=0;i<3;++i) if(!Run(f)) break; } // at most 3 rounds; stops once a round changes nothing
+void RunPhysRegCopyProp(MachineModule &m) { for(auto &f:m.GetFunctions()) if(f) RunPhysRegCopyProp(*f); } // empty function slots are skipped
+} // namespace mir
diff --git a/src/mir/passes/RegisterCoalescer.cpp b/src/mir/passes/RegisterCoalescer.cpp
new file mode 100644
index 00000000..556e94d4
--- /dev/null
+++ b/src/mir/passes/RegisterCoalescer.cpp
@@ -0,0 +1,171 @@
+// RegisterCoalescer —— 在寄存器分配之前合并 copy-connected 虚拟寄存器
+//
+// 基于 MachineRegisterInfo + LiveIntervals，安全地消除 MovReg 指令：
+// 1. 收集所有以 MovReg 定义的 vreg，找出全部定义都来自同一 src 的 vreg
+// 2. 检查 src 和 dst 在所有点（除 MovReg 自身外）是否干涉
+// 3. 若不干涉 → 将所有 dst 引用替换为 src，删除所有相关 MovReg
+// 4. 迭代至不动点
+//
+// 支持多定义：若 dst 被多个 MovReg 定义（如在多个基本块中），只要全部
+// 来自同一 src 且不干涉，即可安全合并。
+
+#include "mir/LiveIntervals.h"
+#include "mir/MIR.h"
+#include "mir/MachineRegisterInfo.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace mir {
+
+namespace {
+
+struct CoalesceCandidate { // one dst/src vreg pair proposed for merging
+  int dst; // vreg whose definitions are all MovReg instructions
+  int src; // the single vreg that all of those MovRegs copy from
+  std::vector<MachineInstr *> movs; // every MovReg instruction that defines dst
+};
+
+// Collects the mergeable copies: a vreg qualifies when all of its definitions are MovRegs from one and the same src
+static std::vector<CoalesceCandidate> CollectCandidates(
+    MachineFunction &function, MachineRegisterInfo &mri) { // `mri` is not read inside this function
+
+  // For every vreg: gather all instructions that define it
+  std::unordered_map<int, std::vector<MachineInstr *>> all_defs;
+  for (auto &block : function.GetBlocks()) {
+    for (auto &inst : block->GetInstructions()) {
+      auto du = MachineRegisterInfo::GetInstDefUse(inst);
+      for (int d : du.defs)
+        all_defs[d].push_back(&inst);
+    }
+  }
+
+  std::vector<CoalesceCandidate> candidates;
+  std::unordered_set<int> processed;
+
+  for (auto &[dst, defs] : all_defs) { // order follows the unordered_map's iteration order
+    if (dst < 0 || dst >= function.GetNumVRegs()) continue;
+    if (defs.empty()) continue;
+    if (processed.count(dst)) continue;
+
+    // Every definition must be a MovReg, and dst and src must be of the same class
+    int common_src = -1;
+    bool all_movreg = true;
+    std::vector<MachineInstr *> movs;
+
+    for (auto *def_inst : defs) {
+      if (def_inst->GetOpcode() != Opcode::MovReg) { all_movreg = false; break; }
+      const auto &ops = def_inst->GetOperands();
+      if (ops.size() < 2 ||
+          ops[0].GetKind() != Operand::Kind::VReg ||
+          ops[1].GetKind() != Operand::Kind::VReg) {
+        all_movreg = false; break;
+      }
+
+      int src = ops[1].GetVRegId();
+      if (ops[0].GetVRegId() != dst) { all_movreg = false; break; }
+
+      if (common_src == -1) common_src = src; // first definition fixes the expected source
+      else if (src != common_src) { all_movreg = false; break; }
+
+      movs.push_back(def_inst);
+    }
+
+    if (!all_movreg || common_src < 0) continue;
+    if (dst == common_src) continue; // self-copy: nothing to merge
+
+    VRegClass dc = function.GetVRegClass(dst);
+    VRegClass sc = function.GetVRegClass(common_src);
+    if (dc != sc) continue;
+
+    processed.insert(dst);
+    candidates.push_back({dst, common_src, std::move(movs)});
+  }
+
+  return candidates;
+}
+
+static bool RunCoalescerOnFunction(MachineFunction &function) {
+  // One coalescing round; returns true when at least one vreg was merged into its copy source.
+  MachineRegisterInfo mri;
+  mri.Compute(function);
+  LiveIntervals li; // computed once up front and not refreshed by the merges below
+  li.Compute(function);
+
+  auto candidates = CollectCandidates(function, mri);
+  if (candidates.empty()) return false;
+
+  bool changed = false;
+  std::unordered_set<int> stale; // vregs whose interval in `li` no longer matches the code
+  for (auto &cand : candidates) {
+    int dst = cand.dst;
+    int src = cand.src;
+    // A merge folds dst's range into src's, but `li` keeps the pre-merge intervals. Testing a
+    // later candidate against them could miss an interference, so it waits for the next round.
+    if (stale.count(dst) || stale.count(src)) continue;
+    // Re-validate: every recorded MovReg must still be `dst = src` (earlier merges rewrite operands)
+    bool still_valid = true;
+    for (auto *mov : cand.movs) {
+      const auto &ops = mov->GetOperands();
+      if (ops.size() < 2 ||
+          ops[0].GetKind() != Operand::Kind::VReg ||
+          ops[0].GetVRegId() != dst ||
+          ops[1].GetKind() != Operand::Kind::VReg ||
+          ops[1].GetVRegId() != src) {
+        still_valid = false;
+        break;
+      }
+    }
+    if (!still_valid) continue;
+
+    // Interference check, ignoring the MovReg instructions that define dst
+    std::unordered_set<const MachineInstr *> excludes;
+    for (auto *mov : cand.movs) excludes.insert(mov);
+    if (li.InterfereExcept(dst, src, excludes)) continue;
+
+    // Safe to merge: rewrite every reference to dst into src
+    MachineRegisterInfo::ReplaceAllVRegRefs(function, dst, src);
+    stale.insert(dst); stale.insert(src);
+    changed = true;
+  }
+
+  // Drop the self-copies (MovReg %src, %src) that the merges leave behind
+  if (changed) {
+    for (auto &block : function.GetBlocks()) {
+      auto &insts = block->GetInstructions();
+      std::vector<MachineInstr> new_insts;
+      for (auto &inst : insts) {
+        if (inst.GetOpcode() == Opcode::MovReg) {
+          const auto &ops = inst.GetOperands();
+          if (ops.size() >= 2 &&
+              ops[0].GetKind() == Operand::Kind::VReg &&
+              ops[1].GetKind() == Operand::Kind::VReg &&
+              ops[0].GetVRegId() == ops[1].GetVRegId()) continue;
+        }
+        new_insts.push_back(std::move(const_cast<MachineInstr &>(inst)));
+      }
+      insts = std::move(new_insts);
+    }
+  }
+
+  return changed;
+}
+
+} // namespace
+
+void RunRegisterCoalescer(MachineFunction &function) {
+  // Re-run the coalescing round until one merges nothing, with a cap of five rounds.
+  constexpr int kMaxRounds = 5;
+  for (int round = 0; round < kMaxRounds; ++round) {
+    const bool merged_any = RunCoalescerOnFunction(function);
+    if (!merged_any) break;
+  }
+}
+
+void RunRegisterCoalescer(MachineModule &module) { // Coalesces copies in every function of the module.
+  for (auto &func : module.GetFunctions())
+    if (func) RunRegisterCoalescer(*func); // an empty function slot is skipped instead of dereferenced
+}
+
+} // namespace mir
diff --git a/src/mir/passes/TailCallOpt.cpp b/src/mir/passes/TailCallOpt.cpp
new file mode 100644
index 00000000..781188fa
--- /dev/null
+++ b/src/mir/passes/TailCallOpt.cpp
@@ -0,0 +1,64 @@
+// Tail Call Optimization —— Call + Ret → Branch
+//
+// 对齐 LLVM/GCC tail call optimization (sibcall)
+//
+// 叶函数优化：函数无 callee-saved 寄存器时，末尾的 Call+Ret 可直接替换为 Br。
+// 被调用者的 Ret 将直接返回到本函数的调用者。
+//
+// 非叶函数：需要先恢复 callee-saved/lr 再 branch，暂跳过（需帧布局安全分析）。
+
+#include "mir/MIR.h"
+
+namespace mir {
+namespace {
+
+static bool RunTailCallOnFunction(MachineFunction& function) {
+  // Rewrites every block ending in `Call <label>; Ret` into `Br <label>` and returns
+  // true if at least one block was rewritten.
+  // NOTE(review): the Br reuses the Call's label operand as-is; confirm the emitter
+  // resolves that label id to the callee rather than to a basic block.
+  const auto& callee_saved = function.GetCalleeSavedRegs();
+  // Only functions that use no callee-saved register qualify
+  if (!callee_saved.empty()) return false;
+
+  // Bail out when the function has a Prologue (a frame would have to be torn down first)
+  for (auto& block : function.GetBlocks()) {
+    for (auto& inst : block->GetInstructions()) {
+      if (inst.GetOpcode() == Opcode::Prologue)
+        return false;
+    }
+  }
+
+  bool changed = false;
+  for (auto& block : function.GetBlocks()) {
+    auto& insts = block->GetInstructions();
+    if (insts.size() < 2) continue;
+
+    // Match the last two instructions: Call + Ret
+    const auto& slast = insts[insts.size() - 2];
+    const auto& last = insts[insts.size() - 1];
+    if (slast.GetOpcode() != Opcode::Call) continue;
+    if (last.GetOpcode() != Opcode::Ret) continue;
+
+    // Only calls whose operand 0 is a label are handled
+    const auto& call_ops = slast.GetOperands();
+    if (call_ops.empty() || call_ops[0].GetKind() != Operand::Kind::Label) continue;
+    int callee_label = call_ops[0].GetLabel();
+
+    // Call + Ret -> Br: drop the Ret, then overwrite the Call in place
+    insts.pop_back();
+    insts.back() = MachineInstr(Opcode::Br, {Operand::Label(callee_label)});
+    changed = true; // keep scanning: other blocks may end in a tail call too
+  }
+
+  return changed;
+}
+
+} // namespace
+
+void RunTailCallOpt(MachineFunction& function) { RunTailCallOnFunction(function); } // the changed flag is discarded
+void RunTailCallOpt(MachineModule& module) { // Applies the tail-call rewrite to every function of the module.
+  for (auto& func : module.GetFunctions()) if (func) RunTailCallOpt(*func); // empty function slots are skipped
+}
+
+} // namespace mir
diff --git a/src/mir/passes/TwoAddress.cpp b/src/mir/passes/TwoAddress.cpp
new file mode 100644
index 00000000..837b4ebc
--- /dev/null
+++ b/src/mir/passes/TwoAddress.cpp
@@ -0,0 +1,84 @@
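+// TwoAddressInstructionPass -- two-address instruction optimization
+//
+// AArch64 arithmetic instructions are three-address (dst, src1, src2), but in
+// some cases dst has to coincide with a source operand to avoid an extra MovReg.
+// This pass runs after lowering and removes such copies by commuting operands.
+//
+// Optimizations:
+// 1. Swap src1/src2 of commutative operations so that dst matches src1 (no extra MovReg).
+// 2. If dst == src2 and the operation is commutative, swap src1 and src2.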
+
+#include "mir/MIR.h"
+#include "mir/MachineRegisterInfo.h"
+
+namespace mir {
+
+namespace {
+
+// True iff swapping the two register sources of `op` leaves its result unchanged.
+static bool IsCommutative(Opcode op) {
+  switch (op) {
+  case Opcode::AddRR:
+  case Opcode::MulRR:
+  case Opcode::AndRR:
+  case Opcode::OrRR:
+  case Opcode::XorRR:
+  case Opcode::FAddRR:
+  case Opcode::FMulRR:
+  case Opcode::AddV4s:
+  case Opcode::MulV4s:
+    return true;
+  default:  // incl. AddShiftRR: only one source is shifted, so its operands are ordered
+    return false;
+  }
+}
+
+// Commutes the sources of commutative three-register instructions so that a
+// destination aliasing the second source aliases the first one instead.
+// Purely local to each instruction; returns true if anything was rewritten.
+static bool RunTwoAddressOnFunction(MachineFunction &function) {
+  bool changed = false;
+
+  for (auto &block : function.GetBlocks()) {
+    for (auto &inst : block->GetInstructions()) {
+      if (!IsCommutative(inst.GetOpcode())) continue;
+
+      // `inst` may be const-qualified here; the cast yields mutable operands.
+      auto &ops = const_cast<MachineInstr &>(inst).GetOperands();
+      if (ops.size() < 3) continue;
+
+      // Only handle dst/src1/src2 that are all virtual registers.
+      const bool all_vregs = ops[0].GetKind() == Operand::Kind::VReg &&
+                             ops[1].GetKind() == Operand::Kind::VReg &&
+                             ops[2].GetKind() == Operand::Kind::VReg;
+      if (!all_vregs) continue;
+
+      const int dst = ops[0].GetVRegId();
+      // dst == src2 but dst != src1: swap so that dst lines up with src1.
+      if (dst == ops[2].GetVRegId() && dst != ops[1].GetVRegId()) {
+        std::swap(ops[1], ops[2]);
+        changed = true;
+      }
+    }
+  }
+
+  return changed;
+}
+
+} // namespace
+
+// Runs the per-function pass until it reports no change, capped at three
+// rounds so the driver always terminates.
+void RunTwoAddressOpt(MachineFunction &function) {
+  constexpr int kMaxRounds = 3;
+  for (int round = 0; round < kMaxRounds; ++round) {
+    if (!RunTwoAddressOnFunction(function)) break;
+  }
+}
+
+// Applies the two-address commuting pass to every function of `module`.
+void RunTwoAddressOpt(MachineModule &module) {
+  for (auto &fn : module.GetFunctions()) RunTwoAddressOpt(*fn);
+}
+
+} // namespace mir