并行优化分支

并行分支
65 changed files with 13504 additions and 740 deletions
--- a/Reference/黄越-如何快速获取性能分
+++ b/Reference/黄越-如何快速获取性能分
--- a/include/ir/PassManager.h
+++ b/include/ir/PassManager.h
@ -23,7 +23,10 @@ bool RunLoopStrengthReduction(Module& module);
 bool RunLoopUnroll(Module& module);
 bool RunLoopFission(Module& module);
 bool RunLoopRepeatReduction(Module& module);
+bool RunLoopParallelize(Module& module);
+bool RunLoopVectorize(Module& module);
 bool RunIfConversion(Module& module);
+void VerifyIR(const Module& module);
 void RunIRPassPipeline(Module& module);

 }  // namespace ir
--- a/include/mir/CodeGen.h
+++ b/include/mir/CodeGen.h
@ -0,0 +1,19 @@
+#pragma once
+
+#include <iosfwd>
+#include <memory>
+
+#include "mir/MIR.h"
+
+namespace ir {
+class Module;
+}
+
+namespace mir {
+
+std::unique_ptr<MachineModule> LowerToMIR(const ir::Module& module);
+void RunRegAlloc(MachineModule& module);
+void RunFrameLowering(MachineModule& module);
+void PrintAsm(const MachineModule& module, std::ostream& os);
+
+}  // namespace mir
--- a/include/mir/MIR.h
+++ b/include/mir/MIR.h
@ -20,7 +20,7 @@ class MIRContext {

 MIRContext& DefaultContext();

-enum class ValueType { Void, I1, I32, F32, Ptr };
+enum class ValueType { Void, I1, I32, F32, Ptr, I32x4, F32x4 };

 enum class RegClass { GPR, FPR };

@ -42,11 +42,15 @@ struct PhysReg {
  }
 };

-bool IsGPR(ValueType type);
-bool IsFPR(ValueType type);
-int GetValueSize(ValueType type);
-int GetValueAlign(ValueType type);
-const char* GetPhysRegName(PhysReg reg, ValueType type);
+bool IsGPR(ValueType type);
+bool IsFPR(ValueType type);
+bool IsVector(ValueType type);
+bool IsNEON(ValueType type);
+int GetVectorLaneCount(ValueType type);
+ValueType GetVectorElementType(ValueType type);
+int GetValueSize(ValueType type);
+int GetValueAlign(ValueType type);
+const char* GetPhysRegName(PhysReg reg, ValueType type);

 class MachineOperand {
 public:
@ -108,9 +112,11 @@ class MachineInstr {
    Load,
    Store,
    Lea,
-    Add,
-    Sub,
+    Add,
+    Sub,
    Mul,
+    MAdd,
+    MSub,
    Div,
    Rem,
    ModMul,
@ -130,8 +136,9 @@ class MachineInstr {
    FSqrt,
    FNeg,
    ICmp,
-    FCmp,
-    ZExt,
+    FCmp,
+    CSelect,
+    ZExt,
    ItoF,
    FtoI,
    Br,
@ -282,18 +289,7 @@ class MachineModule {
  std::vector<std::unique_ptr<MachineFunction>> functions_;
 };

-std::unique_ptr<MachineModule> LowerToMIR(const ir::Module& module);
-bool RunPeephole(MachineModule& module);
-bool RunSpillReduction(MachineModule& module);
-bool RunCFGCleanup(MachineModule& module);
-void RunMIRPreRegAllocPassPipeline(MachineModule& module);
-void RunMIRPostRegAllocPassPipeline(MachineModule& module);
-void RunAddressHoisting(MachineModule& module);
-void RunRegAlloc(MachineModule& module);
-void RunFrameLowering(MachineModule& module);
-void PrintAsm(const MachineModule& module, std::ostream& os);
-
-}  // namespace mir
+}  // namespace mir



--- a/include/mir/Passes.h
+++ b/include/mir/Passes.h
@ -0,0 +1,15 @@
+#pragma once
+
+#include "mir/MIR.h"
+
+namespace mir {
+
+bool RunPeephole(MachineModule& module);
+bool RunSpillReduction(MachineModule& module);
+bool RunCFGCleanup(MachineModule& module);
+void RunAddressHoisting(MachineModule& module);
+void VerifyMIR(const MachineModule& module);
+void RunMIRPreRegAllocPassPipeline(MachineModule& module);
+void RunMIRPostRegAllocPassPipeline(MachineModule& module);
+
+}  // namespace mir
--- a/include/utils/OptConfig.h
+++ b/include/utils/OptConfig.h
@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstdlib>
+
+namespace utils {
+
+// Reports whether the environment variable `name` holds a "truthy" value:
+// it must be present, non-empty, and must not start with the character '0'.
+inline bool IsEnvFlagSet(const char* name) {
+  const char* raw = std::getenv(name);
+  if (raw == nullptr) {
+    return false;
+  }
+  const char first = raw[0];
+  return !(first == '\0' || first == '0');
+}
+
+// Returns true unless IsEnvFlagSet(disable_flag_name) reports the flag as set.
+inline bool IsEnabledUnlessEnvFlag(const char* disable_flag_name) {
+  const bool disabled = IsEnvFlagSet(disable_flag_name);
+  return !disabled;
+}
+
+}  // namespace utils
--- a/scripts/analyze_case.sh
+++ b/scripts/analyze_case.sh
@ -144,7 +144,7 @@ else
  exit 1
 fi

-if aarch64-linux-gnu-gcc "$OUR_ASM" "$REPO_ROOT/sylib/sylib.c" -O2 \
+if aarch64-linux-gnu-gcc -pthread "$OUR_ASM" "$REPO_ROOT/sylib/sylib.c" -O2 \
    -I "$REPO_ROOT/sylib" -lm -o "$OUR_ELF" 2>"$OUT_DIR/$STEM.link.err"; then
  rpt_color "$GREEN" "Linked: $OUR_ELF"
 else
--- a/scripts/run_baseline.sh
+++ b/scripts/run_baseline.sh
@ -1,9 +1,9 @@
 #!/usr/bin/env bash
-# run_baseline.sh — 批量编译 GCC -O2 基线并保存汇编、输出与运行时间
+# run_baseline.sh — 批量编译 GCC -O3 基线并保存汇编、输出与运行时间
 #
 # 数据统一保存在 output/baseline/：
 #   gcc_timing.tsv         — stem<TAB>gcc_elapsed_s  （所有脚本的共享数据源）
-#   <stem>.gcc.s           — GCC -O2 AArch64 汇编（供 analyze_case.sh 对比）
+#   <stem>.gcc.s           — GCC -O3 AArch64 汇编（供 analyze_case.sh 对比）
 #   <stem>.gcc.out         — GCC 程序实际输出 stdout+exit_code（供 analyze_case.sh 对比）
 #
 # 用法：
@ -175,7 +175,7 @@ PYEOF
  # -x c：允许 delete/new/class 等作为标识符
  # -include sylib.h：强制注入 SysY 运行时声明（.sy 无 #include）
  # 无名称修饰，直接链接同为 C 编译的 sylib.o
-  if ! aarch64-linux-gnu-gcc -O2 \
+  if ! aarch64-linux-gnu-gcc -pthread -O3 \
        -x c -include "$REPO_ROOT/sylib/sylib.h" \
        -I "$REPO_ROOT/sylib" \
        "$tmp_sy" -x none "$SYLIB_OBJ" \
@ -188,7 +188,7 @@ PYEOF
  fi

  # 步骤2：生成汇编（单独 -S，仅针对 .sy 文件本身）
-  aarch64-linux-gnu-gcc -O2 \
+  aarch64-linux-gnu-gcc -O3 \
    -x c -include "$REPO_ROOT/sylib/sylib.h" \
    -I "$REPO_ROOT/sylib" \
    "$tmp_sy" -S -o "$gcc_asm" 2>/dev/null || true
@ -283,7 +283,7 @@ fi
 # ---------- 预编译 sylib.o（C 模式，仅一次）----------

 SYLIB_OBJ="$BASELINE_DIR/sylib.o"
-if ! aarch64-linux-gnu-gcc -O2 -c -x c "$REPO_ROOT/sylib/sylib.c" \
+if ! aarch64-linux-gnu-gcc -pthread -O3 -c -x c "$REPO_ROOT/sylib/sylib.c" \
      -I "$REPO_ROOT/sylib" -o "$SYLIB_OBJ" 2>/dev/null; then
  printf '%bERROR: failed to compile sylib.c%b\n' "$RED" "$NC" >&2
  exit 1
--- a/scripts/verify_asm.sh
+++ b/scripts/verify_asm.sh
@ -70,7 +70,7 @@ _compile_start_ns=$(now_ns)
 "$compiler" --emit-asm "$input" > "$asm_file"
 echo "asm generated: $asm_file"

-aarch64-linux-gnu-gcc "$asm_file" "$REPO_ROOT/sylib/sylib.c" -O2 -o "$exe"
+aarch64-linux-gnu-gcc -pthread "$asm_file" "$REPO_ROOT/sylib/sylib.c" -O2 -o "$exe"
 echo "executable generated: $exe"
 _compile_ns=$(($(now_ns) - _compile_start_ns))

--- a/scripts/verify_asm_all.sh
+++ b/scripts/verify_asm_all.sh
@ -46,7 +46,7 @@ fi
 mkdir -p "$out_dir"

 sylib_obj="$out_dir/sylib.o"
-aarch64-linux-gnu-gcc -c "$sylib_c" -I sylib -o "$sylib_obj"
+aarch64-linux-gnu-gcc -pthread -c "$sylib_c" -I sylib -o "$sylib_obj"

 mapfile -t inputs < <(find "$test_dir" -type f -name '*.sy' | sort)
 if [[ ${#inputs[@]} -eq 0 ]]; then
@ -92,7 +92,7 @@ run_case() {
    return 1
  fi

-  if ! aarch64-linux-gnu-gcc "$asm_file" "$sylib_obj" -o "$exe" 2>"$case_out_dir/$stem.link.err"; then
+  if ! aarch64-linux-gnu-gcc -pthread "$asm_file" "$sylib_obj" -o "$exe" 2>"$case_out_dir/$stem.link.err"; then
    echo "$stem: 链接失败"
    cat "$case_out_dir/$stem.link.err" >&2
    return 1
--- a/scripts/verify_asm_all_time.sh
+++ b/scripts/verify_asm_all_time.sh
@ -46,7 +46,7 @@ fi
 mkdir -p "$out_dir"

 sylib_obj="$out_dir/sylib.o"
-aarch64-linux-gnu-gcc -c "$sylib_c" -I sylib -o "$sylib_obj"
+aarch64-linux-gnu-gcc -pthread -c "$sylib_c" -I sylib -o "$sylib_obj"

 mapfile -t inputs < <(find "$test_dir" -type f -name '*.sy' | sort)
 if [[ ${#inputs[@]} -eq 0 ]]; then
@ -91,7 +91,7 @@ run_case() {
    return 1
  fi

-  if ! aarch64-linux-gnu-gcc "$asm_file" "$sylib_obj" -o "$exe" 2>"$case_out_dir/$stem.link.err"; then
+  if ! aarch64-linux-gnu-gcc -pthread "$asm_file" "$sylib_obj" -o "$exe" 2>"$case_out_dir/$stem.link.err"; then
    echo "$stem: 链接失败"
    cat "$case_out_dir/$stem.link.err" >&2
    return 1
--- a/src/ir/passes/ArithmeticSimplify.cpp
+++ b/src/ir/passes/ArithmeticSimplify.cpp
@ -5,7 +5,10 @@

 #include <algorithm>
 #include <cstdint>
+#include <cstdlib>
+#include <limits>
 #include <memory>
+#include <unordered_map>
 #include <vector>

 namespace ir {
@ -38,6 +41,267 @@ bool IsZero(Value* value) {
  return false;
 }

+// Accumulator for a flattened i32 add/sub tree, i.e. an expression of the form
+//   constant + sum(coeffs[v] * v)   over the non-constant leaves v.
+struct LinearExpr {
+  // Sum of every ConstantInt leaf multiplied by the coefficient it was
+  // visited with.
+  std::int64_t constant{0};
+  // Number of ConstantInt leaves folded into `constant`.
+  int constant_terms{0};
+  // Number of non-constant leaf visits (AddLinearTerm bounds this).
+  int term_visits{0};
+  // Number of Add/Sub instructions that were expanded while collecting.
+  int decomposed_nodes{0};
+  // Set once an Add/Sub is expanded below the root (depth > 0).
+  bool saw_nested{false};
+  // Non-constant leaves in first-seen order.
+  std::vector<Value*> order;
+  // Net signed coefficient of every leaf recorded in `order`.
+  std::unordered_map<Value*, std::int64_t> coeffs;
+};
+
+// True when `value` is non-null, has a type, and that type reports IsInt32().
+bool IsInt32Value(Value* value) {
+  if (value == nullptr) {
+    return false;
+  }
+  const auto& type = value->GetType();
+  if (!type) {
+    return false;
+  }
+  return type->IsInt32();
+}
+
+// Adds `coeff * value` to `expr`.
+//
+// A null value or a zero coefficient is a no-op. ConstantInt leaves are folded
+// into expr.constant; every other leaf has its coefficient accumulated in
+// expr.coeffs (and is appended to expr.order the first time it is seen).
+// Returns false once more than kMaxTerms non-constant leaf visits have been
+// made, which tells the caller to give up on this expression.
+bool AddLinearTerm(LinearExpr& expr, Value* value, std::int64_t coeff) {
+  constexpr int kMaxTerms = 16;
+
+  if (coeff == 0 || value == nullptr) {
+    return true;
+  }
+
+  if (auto* literal = dyncast<ConstantInt>(value)) {
+    const auto magnitude = static_cast<std::int64_t>(literal->GetValue());
+    expr.constant += coeff * magnitude;
+    expr.constant_terms += 1;
+    return true;
+  }
+
+  expr.term_visits += 1;
+  if (expr.term_visits > kMaxTerms) {
+    return false;
+  }
+
+  const auto slot = expr.coeffs.emplace(value, 0);
+  if (slot.second) {
+    // First occurrence: remember the leaf so output order is deterministic.
+    expr.order.push_back(value);
+  }
+  slot.first->second += coeff;
+  return true;
+}
+
+// Recursively flattens the i32 Add/Sub tree rooted at `value` into `expr`.
+//
+// `sign` is the coefficient (+1 or -1 at the call sites in this file) carried
+// down to this subtree; the right operand of a Sub gets the negated sign.
+// Add/Sub nodes are expanded only while `depth` is below kMaxDepth; anything
+// else is recorded as a leaf via AddLinearTerm. Returns false when `value` is
+// not an i32 value or when AddLinearTerm refuses a leaf.
+bool CollectLinearExpr(Value* value, std::int64_t sign, int depth,
+                       LinearExpr& expr) {
+  constexpr int kMaxDepth = 8;
+  if (!IsInt32Value(value)) {
+    return false;
+  }
+
+  auto* node = dyncast<BinaryInst>(value);
+  bool expandable = false;
+  bool is_sub = false;
+  if (node != nullptr && depth < kMaxDepth) {
+    const auto op = node->GetOpcode();
+    is_sub = op == Opcode::Sub;
+    expandable = (op == Opcode::Add || is_sub) && IsInt32Value(node);
+  }
+  if (!expandable) {
+    return AddLinearTerm(expr, value, sign);
+  }
+
+  if (depth > 0) {
+    expr.saw_nested = true;
+  }
+  expr.decomposed_nodes += 1;
+
+  if (!CollectLinearExpr(node->GetLhs(), sign, depth + 1, expr)) {
+    return false;
+  }
+  const std::int64_t rhs_sign = is_sub ? -sign : sign;
+  return CollectLinearExpr(node->GetRhs(), rhs_sign, depth + 1, expr);
+}
+
+// Truncates a 64-bit value to its low 32 bits and reinterprets them as a
+// signed 32-bit integer.
+int WrappedI32(std::int64_t value) {
+  const auto low_bits = static_cast<std::uint32_t>(value);
+  const auto as_signed = static_cast<std::int32_t>(low_bits);
+  return static_cast<int>(as_signed);
+}
+
+// Estimates how many instructions MaterializeLinearExpr would emit for `expr`.
+//
+// The estimate is the sum of:
+//   * one multiply per leaf whose coefficient is neither +1 nor -1;
+//   * one add per positive item (positive leaves plus a non-zero constant)
+//     after the first one, which merely seeds the running result;
+//   * one sub per negative leaf (a leading negative leaf still costs one
+//     instruction because it is materialized as "0 - x").
+// Leaves whose net coefficient is zero are ignored.
+int EstimateLinearMaterializeCost(const LinearExpr& expr) {
+  int scaled_terms = 0;
+  int added_items = expr.constant != 0 ? 1 : 0;
+  int subtracted_terms = 0;
+
+  for (auto* leaf : expr.order) {
+    const auto found = expr.coeffs.find(leaf);
+    if (found == expr.coeffs.end()) {
+      continue;
+    }
+    const std::int64_t coeff = found->second;
+    if (coeff == 0) {
+      continue;
+    }
+    if (coeff > 0) {
+      ++added_items;
+    } else {
+      ++subtracted_terms;
+    }
+    if (coeff != 1 && coeff != -1) {
+      ++scaled_terms;
+    }
+  }
+
+  // The first positive item is free; every later one needs an add.
+  const int joins = added_items > 0 ? added_items - 1 : 0;
+  return scaled_terms + joins + subtracted_terms;
+}
+
+// Produces a value equal to `abs_coeff * value`.
+//
+// A coefficient of 1 returns `value` unchanged. Otherwise a Mul instruction is
+// inserted into `block` at `insert_index` (which is then advanced). Returns
+// nullptr when the coefficient does not fit in an int.
+Value* MaterializeCoeffTerm(Function& function, BasicBlock* block,
+                            std::size_t& insert_index, Value* value,
+                            std::int64_t abs_coeff) {
+  if (abs_coeff == 1) {
+    return value;
+  }
+
+  const bool fits_in_int = !(abs_coeff > std::numeric_limits<int>::max());
+  if (!fits_in_int) {
+    return nullptr;
+  }
+
+  auto* factor = looputils::ConstInt(static_cast<int>(abs_coeff));
+  return block->Insert<BinaryInst>(
+      insert_index++, Opcode::Mul, Type::GetInt32Type(), value, factor, nullptr,
+      looputils::NextSyntheticName(function, "lin.mul."));
+}
+
+// Emits instructions computing `expr` into `block`, starting at
+// `insert_index`, and returns the resulting value.
+//
+// Emission order: positive leaves (in first-seen order), then the wrapped
+// constant if non-zero, then negative leaves. The first positive item seeds
+// the running result without an instruction; a leading negative leaf is
+// emitted as "0 - x". Returns a zero constant when nothing contributes, and
+// nullptr when a coefficient term cannot be materialized.
+Value* MaterializeLinearExpr(Function& function, BasicBlock* block,
+                             std::size_t insert_index,
+                             const LinearExpr& expr) {
+  Value* acc = nullptr;
+
+  // Inserts one i32 binary instruction at the cursor and advances the cursor.
+  auto emit = [&](Opcode op, Value* lhs, Value* rhs,
+                  const char* prefix) -> Value* {
+    return block->Insert<BinaryInst>(
+        insert_index++, op, Type::GetInt32Type(), lhs, rhs, nullptr,
+        looputils::NextSyntheticName(function, prefix));
+  };
+
+  // Pass 1: leaves with a positive net coefficient.
+  for (auto* leaf : expr.order) {
+    const auto found = expr.coeffs.find(leaf);
+    if (found == expr.coeffs.end() || found->second <= 0) {
+      continue;
+    }
+    Value* term = MaterializeCoeffTerm(function, block, insert_index, leaf,
+                                       found->second);
+    if (term == nullptr) {
+      return nullptr;
+    }
+    if (acc == nullptr) {
+      acc = term;
+    } else {
+      acc = emit(Opcode::Add, acc, term, "lin.add.");
+    }
+  }
+
+  // The folded constant, wrapped back into 32 bits.
+  if (expr.constant != 0) {
+    Value* literal = looputils::ConstInt(WrappedI32(expr.constant));
+    if (literal == nullptr) {
+      return nullptr;
+    }
+    if (acc == nullptr) {
+      acc = literal;
+    } else {
+      acc = emit(Opcode::Add, acc, literal, "lin.add.");
+    }
+  }
+
+  // Pass 2: leaves with a negative net coefficient.
+  for (auto* leaf : expr.order) {
+    const auto found = expr.coeffs.find(leaf);
+    if (found == expr.coeffs.end() || found->second >= 0) {
+      continue;
+    }
+    Value* term = MaterializeCoeffTerm(function, block, insert_index, leaf,
+                                       -found->second);
+    if (term == nullptr) {
+      return nullptr;
+    }
+    if (acc == nullptr) {
+      // Nothing to subtract from yet: negate via "0 - term".
+      acc = emit(Opcode::Sub, looputils::ConstInt(0), term, "lin.neg.");
+    } else {
+      acc = emit(Opcode::Sub, acc, term, "lin.sub.");
+    }
+  }
+
+  if (acc != nullptr) {
+    return acc;
+  }
+  return static_cast<Value*>(looputils::ConstInt(0));
+}
+
+// Re-associates chains of i32 Add/Sub instructions in `function`.
+//
+// Every i32 Add/Sub that still has uses is flattened into a LinearExpr. The
+// chain is rebuilt (inserted just before the original root) only when it is
+// nested or folds more than one constant, and only when the estimated cost is
+// strictly lower than the number of Add/Sub nodes that were expanded.
+// Returns true if any root was replaced.
+bool SimplifyLinearAddSub(Function& function) {
+  // Snapshot the candidates first; the rewrite below mutates the blocks.
+  std::vector<BinaryInst*> roots;
+  for (const auto& owned_block : function.GetBlocks()) {
+    if (owned_block.get() == nullptr) {
+      continue;
+    }
+    for (const auto& owned_inst : owned_block->GetInstructions()) {
+      auto* candidate = dyncast<BinaryInst>(owned_inst.get());
+      if (candidate == nullptr) {
+        continue;
+      }
+      const auto op = candidate->GetOpcode();
+      if (op != Opcode::Add && op != Opcode::Sub) {
+        continue;
+      }
+      if (IsInt32Value(candidate)) {
+        roots.push_back(candidate);
+      }
+    }
+  }
+
+  bool rewrote_any = false;
+  for (auto* root : roots) {
+    if (root == nullptr) {
+      continue;
+    }
+    auto* home = root->GetParent();
+    if (home == nullptr || root->GetUses().empty()) {
+      continue;
+    }
+
+    LinearExpr expr;
+    if (!CollectLinearExpr(root, 1, 0, expr)) {
+      continue;
+    }
+    const bool nothing_to_gain = !expr.saw_nested && expr.constant_terms <= 1;
+    if (nothing_to_gain) {
+      continue;
+    }
+    if (EstimateLinearMaterializeCost(expr) >= expr.decomposed_nodes) {
+      continue;
+    }
+
+    auto* replacement = MaterializeLinearExpr(
+        function, home, FindInstructionIndex(home, root), expr);
+    if (replacement == nullptr || replacement == root) {
+      continue;
+    }
+
+    root->ReplaceAllUsesWith(replacement);
+    if (root->GetUses().empty()) {
+      if (auto* owner = root->GetParent()) {
+        owner->EraseInstruction(root);
+      }
+    }
+    rewrote_any = true;
+  }
+
+  return rewrote_any;
+}
+
 Value* OtherCompareOperand(BinaryInst* cmp, Value* value) {
  if (!cmp || cmp->GetNumOperands() != 2) {
    return nullptr;
@ -129,6 +393,7 @@ bool RunArithmeticSimplify(Module& module) {
    if (!function || function->IsExternal()) {
      continue;
    }
+    changed |= SimplifyLinearAddSub(*function);
    changed |= SimplifyPowerOfTwoRemTests(*function);
  }
  return changed;
--- a/src/ir/passes/CMakeLists.txt
+++ b/src/ir/passes/CMakeLists.txt
@ -19,7 +19,10 @@ add_library(ir_passes STATIC
  LoopUnroll.cpp
  LoopFission.cpp
  LoopRepeatReduction.cpp
+  LoopParallelize.cpp
+  LoopVectorize.cpp
  IfConversion.cpp
+  IRVerifier.cpp
 )

 target_link_libraries(ir_passes PUBLIC
--- a/src/ir/passes/DCE.cpp
+++ b/src/ir/passes/DCE.cpp
@ -3,11 +3,68 @@
 #include "ir/IR.h"
 #include "PassUtils.h"

+#include <unordered_set>
 #include <vector>

 namespace ir {
 namespace {

+// Mark-and-sweep dead code elimination over one function.
+//
+// Roots are all terminators and every instruction for which
+// passutils::IsSideEffectingInstruction returns true. Liveness is propagated
+// through instruction operands; every non-terminator that was never reached is
+// erased. Returns true if at least one instruction was erased.
+bool RunAggressiveDCEOnFunction(Function& function) {
+  std::unordered_set<Instruction*> reachable;
+  std::vector<Instruction*> pending;
+
+  const auto enqueue = [&reachable, &pending](Instruction* inst) {
+    if (inst == nullptr) {
+      return;
+    }
+    if (reachable.insert(inst).second) {
+      pending.push_back(inst);
+    }
+  };
+
+  // Seed with the roots.
+  for (const auto& owned_block : function.GetBlocks()) {
+    for (const auto& owned_inst : owned_block->GetInstructions()) {
+      Instruction* inst = owned_inst.get();
+      const bool is_root =
+          inst->IsTerminator() || passutils::IsSideEffectingInstruction(inst);
+      if (is_root) {
+        enqueue(inst);
+      }
+    }
+  }
+
+  // Propagate liveness to everything a live instruction reads.
+  while (!pending.empty()) {
+    Instruction* current = pending.back();
+    pending.pop_back();
+    const std::size_t operand_count = current->GetNumOperands();
+    for (std::size_t idx = 0; idx < operand_count; ++idx) {
+      enqueue(dyncast<Instruction>(current->GetOperand(idx)));
+    }
+  }
+
+  // Sweep: gather before mutating so block iteration stays valid.
+  std::vector<Instruction*> doomed;
+  for (const auto& owned_block : function.GetBlocks()) {
+    for (const auto& owned_inst : owned_block->GetInstructions()) {
+      Instruction* inst = owned_inst.get();
+      if (inst->IsTerminator()) {
+        continue;
+      }
+      if (reachable.count(inst) == 0) {
+        doomed.push_back(inst);
+      }
+    }
+  }
+  if (doomed.empty()) {
+    return false;
+  }
+
+  // Dead instructions may reference each other (e.g. phi/computation cycles).
+  // Drop every operand up front so that erasing one dead node never leaves a
+  // dangling use record inside another dead node.
+  for (Instruction* inst : doomed) {
+    inst->ClearAllOperands();
+  }
+
+  bool erased_any = false;
+  for (Instruction* inst : doomed) {
+    auto* owner = inst->GetParent();
+    if (owner == nullptr) {
+      continue;
+    }
+    owner->EraseInstruction(inst);
+    erased_any = true;
+  }
+  return erased_any;
+}
+
 bool RunDCEOnFunction(Function& function) {
  if (function.IsExternal()) {
    return false;
@ -17,6 +74,11 @@ bool RunDCEOnFunction(Function& function) {
  bool local_changed = true;
  while (local_changed) {
    local_changed = false;
+    if (RunAggressiveDCEOnFunction(function)) {
+      local_changed = true;
+      changed = true;
+      continue;
+    }
    for (const auto& block_ptr : function.GetBlocks()) {
      std::vector<Instruction*> to_remove;
      for (const auto& inst_ptr : block_ptr->GetInstructions()) {
--- a/src/ir/passes/IRVerifier.cpp
+++ b/src/ir/passes/IRVerifier.cpp
@ -0,0 +1,214 @@
+#include "ir/PassManager.h"
+
+#include "ir/IR.h"
+
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace ir {
+namespace {
+
+// Aborts verification by throwing std::runtime_error. The message is prefixed
+// with "[ir-verify]" plus the function and block names when they are given.
+[[noreturn]] void Fail(const Function* function, const BasicBlock* block,
+                       const std::string& message) {
+  std::string text = "[ir-verify]";
+  if (function != nullptr) {
+    text += " function ";
+    text += function->GetName();
+  }
+  if (block != nullptr) {
+    text += " block ";
+    text += block->GetName();
+  }
+  text += ": ";
+  text += message;
+  throw std::runtime_error(text);
+}
+
+// Linear membership test: is `needle` one of the pointers in `blocks`?
+bool Contains(const std::vector<BasicBlock*>& blocks, const BasicBlock* needle) {
+  for (const BasicBlock* candidate : blocks) {
+    if (candidate == needle) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Structural type comparison used by the verifier.
+//
+// Identical pointers (including two nulls) match. Otherwise both must be
+// non-null and share the same kind. Pointer types are then accepted without
+// looking at the pointee; array types additionally need equal element counts
+// and recursively matching element types; every other kind matches on kind
+// alone.
+bool SameType(const std::shared_ptr<Type>& lhs, const std::shared_ptr<Type>& rhs) {
+  if (lhs == rhs) {
+    return true;
+  }
+  if (!lhs || !rhs) {
+    return false;
+  }
+  if (lhs->GetKind() != rhs->GetKind()) {
+    return false;
+  }
+  if (!lhs->IsPointer() && lhs->IsArray()) {
+    if (lhs->GetNumElements() != rhs->GetNumElements()) {
+      return false;
+    }
+    return SameType(lhs->GetElementType(), rhs->GetElementType());
+  }
+  return true;
+}
+
+// Verifies operand `operand_index` of `inst`: it must be non-null and its
+// use-list must contain an entry for exactly this (user, operand index) pair.
+// Fails (throws) otherwise.
+void CheckValueUse(const Function& function, const BasicBlock& block,
+                   const Instruction& inst, std::size_t operand_index) {
+  auto* operand = inst.GetOperand(operand_index);
+  if (operand == nullptr) {
+    Fail(&function, &block, "null operand");
+  }
+
+  for (const Use& use : operand->GetUses()) {
+    if (use.GetUser() == &inst && use.GetOperandIndex() == operand_index) {
+      return;
+    }
+  }
+  Fail(&function, &block, "operand use-list is inconsistent");
+}
+
+// Verifies the terminator of `block` and its CFG bookkeeping.
+//
+// Checks, in order: the block is non-empty and ends in a terminator; a
+// conditional branch has a boolean condition; every branch target belongs to
+// the function (`blocks`) and is recorded as a successor; every recorded
+// successor belongs to the function and lists `block` as a predecessor.
+void CheckTerminatorTargets(const Function& function, const BasicBlock& block,
+                            const std::unordered_set<const BasicBlock*>& blocks) {
+  const auto& body = block.GetInstructions();
+  if (body.empty()) {
+    Fail(&function, &block, "empty block has no terminator");
+  }
+  const auto* last = body.back().get();
+  if (!last->IsTerminator()) {
+    Fail(&function, &block, "block has no terminator");
+  }
+
+  const auto in_function = [&blocks](const BasicBlock* candidate) {
+    return candidate != nullptr && blocks.count(candidate) != 0;
+  };
+
+  // Targets named by the branch instruction itself (none for other
+  // terminators).
+  std::vector<BasicBlock*> targets;
+  if (auto* jump = dyncast<UncondBrInst>(last)) {
+    targets.push_back(jump->GetDest());
+  } else if (auto* cond_jump = dyncast<CondBrInst>(last)) {
+    auto* condition = cond_jump->GetCondition();
+    if (condition == nullptr || !condition->IsBool()) {
+      Fail(&function, &block, "conditional branch condition must be i1");
+    }
+    targets.push_back(cond_jump->GetThenBlock());
+    targets.push_back(cond_jump->GetElseBlock());
+  }
+
+  for (auto* target : targets) {
+    if (!in_function(target)) {
+      Fail(&function, &block, "terminator targets a block outside the function");
+    }
+    if (!Contains(block.GetSuccessors(), target)) {
+      Fail(&function, &block, "terminator target is missing from successor list");
+    }
+  }
+
+  for (auto* succ : block.GetSuccessors()) {
+    if (!in_function(succ)) {
+      Fail(&function, &block, "successor list contains a block outside the function");
+    }
+    if (!Contains(succ->GetPredecessors(), &block)) {
+      Fail(&function, &block, "successor/predecessor lists are inconsistent");
+    }
+  }
+}
+
+// Verifies one instruction: every operand passes CheckValueUse, a return
+// agrees with the function's return type, and a call has a callee plus
+// arguments matching the callee's parameter count and types.
+void CheckInstructionTypes(const Function& function, const BasicBlock& block,
+                           const Instruction& inst) {
+  const std::size_t operand_count = inst.GetNumOperands();
+  for (std::size_t idx = 0; idx < operand_count; ++idx) {
+    CheckValueUse(function, block, inst, idx);
+  }
+
+  if (auto* ret = dyncast<ReturnInst>(&inst)) {
+    const bool returns_void = function.GetReturnType()->IsVoid();
+    if (returns_void) {
+      if (ret->HasReturnValue()) {
+        Fail(&function, &block, "void function returns a value");
+      }
+    } else {
+      const bool matches =
+          ret->HasReturnValue() &&
+          SameType(function.GetReturnType(), ret->GetReturnValue()->GetType());
+      if (!matches) {
+        Fail(&function, &block, "return value type does not match function type");
+      }
+    }
+    return;
+  }
+
+  auto* call = dyncast<CallInst>(&inst);
+  if (call == nullptr) {
+    return;
+  }
+  auto* callee = call->GetCallee();
+  if (callee == nullptr) {
+    Fail(&function, &block, "call has no callee");
+  }
+  const auto args = call->GetArguments();
+  if (args.size() != callee->GetParamTypes().size()) {
+    Fail(&function, &block, "call argument count mismatch");
+  }
+  for (std::size_t idx = 0; idx < args.size(); ++idx) {
+    if (!SameType(args[idx]->GetType(), callee->GetParamTypes()[idx])) {
+      Fail(&function, &block, "call argument type mismatch");
+    }
+  }
+}
+
+// Verifies a single function; throws via Fail on the first violation.
+//
+// External functions must have no blocks. For defined functions this checks
+// block ownership, the entry block, per-instruction invariants (non-null,
+// correct parent, terminator only in last position, operand/type checks),
+// terminator/successor consistency, predecessor consistency, and that each
+// leading phi only names predecessors with matching value types.
+void CheckFunction(const Function& function) {
+  const auto& owned_blocks = function.GetBlocks();
+
+  if (function.IsExternal()) {
+    if (!owned_blocks.empty()) {
+      Fail(&function, nullptr, "external function must not have blocks");
+    }
+    return;
+  }
+
+  // Pass 1: collect the set of blocks owned by this function.
+  std::unordered_set<const BasicBlock*> members;
+  for (const auto& owned : owned_blocks) {
+    if (!owned) {
+      Fail(&function, nullptr, "null block");
+    }
+    auto* current = owned.get();
+    if (current->GetParent() != &function) {
+      Fail(&function, current, "block parent is inconsistent");
+    }
+    members.insert(current);
+  }
+
+  const BasicBlock* entry = function.GetEntryBlock();
+  if (entry == nullptr || members.count(entry) == 0) {
+    Fail(&function, nullptr, "entry block is missing or outside the function");
+  }
+
+  // Pass 2: per-block checks.
+  for (const auto& owned : owned_blocks) {
+    const auto& block = *owned;
+    const auto& body = block.GetInstructions();
+    const std::size_t count = body.size();
+
+    for (std::size_t pos = 0; pos < count; ++pos) {
+      auto* inst = body[pos].get();
+      if (inst == nullptr) {
+        Fail(&function, &block, "null instruction");
+      }
+      if (inst->GetParent() != &block) {
+        Fail(&function, &block, "instruction parent is inconsistent");
+      }
+      const bool is_last = pos + 1 == count;
+      if (!is_last && inst->IsTerminator()) {
+        Fail(&function, &block, "terminator is not the last instruction");
+      }
+      CheckInstructionTypes(function, block, *inst);
+    }
+    CheckTerminatorTargets(function, block, members);
+
+    for (auto* pred : block.GetPredecessors()) {
+      if (pred == nullptr || members.count(pred) == 0) {
+        Fail(&function, &block, "predecessor list contains a block outside the function");
+      }
+      if (!Contains(pred->GetSuccessors(), &block)) {
+        Fail(&function, &block, "predecessor/successor lists are inconsistent");
+      }
+    }
+
+    // Only the leading run of phis is inspected; the scan stops at the first
+    // non-phi instruction.
+    for (const auto& owned_inst : body) {
+      auto* phi = dyncast<PhiInst>(owned_inst.get());
+      if (phi == nullptr) {
+        break;
+      }
+      for (int slot = 0; slot < phi->GetNumIncomings(); ++slot) {
+        if (!Contains(block.GetPredecessors(), phi->GetIncomingBlock(slot))) {
+          Fail(&function, &block, "phi incoming block is not a predecessor");
+        }
+        if (!SameType(phi->GetType(), phi->GetIncomingValue(slot)->GetType())) {
+          Fail(&function, &block, "phi incoming value type mismatch");
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+
+// Runs CheckFunction on every non-null function of `module`. Violations are
+// reported by exception (see Fail).
+void VerifyIR(const Module& module) {
+  for (const auto& owned : module.GetFunctions()) {
+    if (!owned) {
+      continue;
+    }
+    CheckFunction(*owned);
+  }
+}
+
+}  // namespace ir
--- a/src/ir/passes/IfConversion.cpp
+++ b/src/ir/passes/IfConversion.cpp
@ -2,7 +2,9 @@

 #include "ir/IR.h"
 #include "PassUtils.h"
+#include "utils/OptConfig.h"

+#include <algorithm>
 #include <cstddef>
 #include <vector>

@ -67,6 +69,19 @@ bool HasOnlyOneNonTerminator(BasicBlock* block, Instruction** out) {
  return candidate != nullptr;
 }

+// Returns true when the first instruction of `block` is a phi.
+//
+// Only the first instruction is inspected, so this answers "has any phi" only
+// if phis are grouped at the start of a block (NOTE(review): presumably an IR
+// invariant here - confirm). Null or empty blocks yield false.
+bool HasAnyPhi(BasicBlock* block) {
+  if (block == nullptr) {
+    return false;
+  }
+  const auto& body = block->GetInstructions();
+  const auto first = body.begin();
+  if (first == body.end()) {
+    return false;
+  }
+  return dyncast<PhiInst>(first->get()) != nullptr;
+}
+
 int IncomingIndexFor(PhiInst* phi, BasicBlock* block) {
  if (phi == nullptr || block == nullptr) {
    return -1;
@ -91,6 +106,193 @@ bool IsUsedOnlyBy(Value* value, User* expected_user) {
  return true;
 }

+// True when `value` is a ConstantInt whose value equals `expected`.
+bool IsConstI32(Value* value, int expected) {
+  if (auto* literal = dyncast<ConstantInt>(value)) {
+    return literal->GetValue() == expected;
+  }
+  return false;
+}
+
+// True when both pointers are identical or passutils::AreEquivalentValues
+// considers the two values equivalent.
+bool SameValue(Value* lhs, Value* rhs) {
+  if (lhs == rhs) {
+    return true;
+  }
+  return passutils::AreEquivalentValues(lhs, rhs);
+}
+
+// True when `block` is non-null, has exactly preds.size() predecessors, and
+// every entry of `preds` appears in its predecessor list.
+bool HasOnlyPredecessors(BasicBlock* block,
+                         const std::vector<BasicBlock*>& preds) {
+  if (block == nullptr) {
+    return false;
+  }
+  const auto& actual = block->GetPredecessors();
+  if (actual.size() != preds.size()) {
+    return false;
+  }
+  return std::all_of(preds.begin(), preds.end(), [&actual](BasicBlock* wanted) {
+    return std::find(actual.begin(), actual.end(), wanted) != actual.end();
+  });
+}
+
+// Matches a block of the form "<i1 BinaryInst>; condbr <that inst>, ...".
+//
+// The block must end in a conditional branch and contain exactly one other
+// instruction, which must be the i1-typed BinaryInst used as the branch
+// condition. On success the instruction and branch are written to
+// `*condition` and `*branch`; on failure the outputs are left untouched.
+bool MatchCompareBranchBlock(BasicBlock* block, BinaryInst** condition,
+                             CondBrInst** branch) {
+  const bool args_ok =
+      block != nullptr && condition != nullptr && branch != nullptr;
+  if (!args_ok) {
+    return false;
+  }
+
+  auto* cond_br = dyncast<CondBrInst>(GetTerminator(block));
+  if (cond_br == nullptr) {
+    return false;
+  }
+
+  Instruction* sole = nullptr;
+  if (!HasOnlyOneNonTerminator(block, &sole)) {
+    return false;
+  }
+
+  auto* compare = dyncast<BinaryInst>(sole);
+  if (compare == nullptr) {
+    return false;
+  }
+  if (compare != cond_br->GetCondition()) {
+    return false;
+  }
+  if (!compare->GetType()->IsInt1()) {
+    return false;
+  }
+
+  *condition = compare;
+  *branch = cond_br;
+  return true;
+}
+
+// Matches a block of the form "store <expected_value>, ptr; br dest".
+//
+// The block must end in an unconditional branch and contain exactly one other
+// instruction, a StoreInst whose stored value is the integer constant
+// `expected_value`. On success the store address and branch destination are
+// written to `*store_ptr` and `*dest`; on failure they are left untouched.
+bool MatchConstStoreBlock(BasicBlock* block, int expected_value,
+                          Value** store_ptr, BasicBlock** dest) {
+  if (!(block != nullptr && store_ptr != nullptr && dest != nullptr)) {
+    return false;
+  }
+
+  auto* jump = dyncast<UncondBrInst>(GetTerminator(block));
+  if (jump == nullptr) {
+    return false;
+  }
+
+  Instruction* sole = nullptr;
+  if (!HasOnlyOneNonTerminator(block, &sole)) {
+    return false;
+  }
+
+  auto* store = dyncast<StoreInst>(sole);
+  if (store == nullptr) {
+    return false;
+  }
+  if (!IsConstI32(store->GetValue(), expected_value)) {
+    return false;
+  }
+
+  *store_ptr = store->GetPtr();
+  *dest = jump->GetDest();
+  return true;
+}
+
+// Makes `dest` the sole successor of `block`: unlinks every existing
+// successor edge (both directions), links block <-> dest, and replaces the
+// terminator with an unconditional branch to `dest`.
+void RedirectBlockTo(BasicBlock* block, BasicBlock* dest) {
+  // Iterate over a copy: the successor list is mutated inside the loop.
+  const auto stale_edges = block->GetSuccessors();
+  for (auto* old_succ : stale_edges) {
+    block->RemoveSuccessor(old_succ);
+    if (old_succ == nullptr) {
+      continue;
+    }
+    old_succ->RemovePredecessor(block);
+  }
+
+  block->AddSuccessor(dest);
+  if (dest != nullptr) {
+    dest->AddPredecessor(block);
+  }
+  passutils::ReplaceTerminatorWithBr(block, dest);
+}
+
+// Collapses a nested boolean-store diamond rooted at `pred` into straight-line
+// code. Opt-in: only runs when the environment flag
+// NUDTC_ENABLE_NESTED_STORE_IFCONV is set.
+//
+// Shape that is matched (p is the common store address):
+//
+//   pred:                 condbr outer, rhs_block, else_block
+//   rhs_block:            rhs_cmp;  condbr rhs_cmp, store_one_from_and, else_block
+//   else_block:           else_cmp; condbr else_cmp, store_one_from_else, store_zero
+//   store_one_from_and:   store 1, p; br join
+//   store_one_from_else:  store 1, p; br after_else
+//   store_zero:           store 0, p; br after_else
+//   after_else:           br join
+//
+// It is replaced by, in `pred`:
+//
+//   store ((zext outer & zext rhs_cmp') | zext else_cmp'), p;  br join
+//
+// where rhs_cmp' / else_cmp' are clones of the two compares. Every
+// intermediate block must have exactly the predecessors shown, and neither
+// `after_else` nor `join` may start with a phi. Returns true when the
+// rewrite was performed.
+bool TryConvertNestedBooleanStore(Function& function, BasicBlock* pred) {
+  if (!utils::IsEnvFlagSet("NUDTC_ENABLE_NESTED_STORE_IFCONV")) {
+    return false;
+  }
+
+  auto* outer_branch = dyncast<CondBrInst>(GetTerminator(pred));
+  if (outer_branch == nullptr || outer_branch->GetCondition() == nullptr ||
+      !outer_branch->GetCondition()->GetType()->IsInt1()) {
+    return false;
+  }
+
+  auto* rhs_block = outer_branch->GetThenBlock();
+  auto* else_block = outer_branch->GetElseBlock();
+  if (rhs_block == nullptr || else_block == nullptr || rhs_block == else_block ||
+      !HasOnlyPredecessors(rhs_block, {pred})) {
+    return false;
+  }
+
+  BinaryInst* rhs_cmp = nullptr;
+  CondBrInst* rhs_branch = nullptr;
+  if (!MatchCompareBranchBlock(rhs_block, &rhs_cmp, &rhs_branch) ||
+      rhs_branch->GetElseBlock() != else_block ||
+      !HasOnlyPredecessors(else_block, {pred, rhs_block})) {
+    return false;
+  }
+
+  BinaryInst* else_cmp = nullptr;
+  CondBrInst* else_branch = nullptr;
+  if (!MatchCompareBranchBlock(else_block, &else_cmp, &else_branch)) {
+    return false;
+  }
+
+  auto* store_one_from_and = rhs_branch->GetThenBlock();
+  auto* store_one_from_else = else_branch->GetThenBlock();
+  auto* store_zero = else_branch->GetElseBlock();
+  if (store_one_from_and == nullptr || store_one_from_else == nullptr ||
+      store_zero == nullptr || store_one_from_and == store_one_from_else ||
+      store_one_from_and == store_zero || store_one_from_else == store_zero ||
+      !HasOnlyPredecessors(store_one_from_and, {rhs_block}) ||
+      !HasOnlyPredecessors(store_one_from_else, {else_block}) ||
+      !HasOnlyPredecessors(store_zero, {else_block})) {
+    return false;
+  }
+
+  Value* ptr_a = nullptr;
+  Value* ptr_b = nullptr;
+  Value* ptr_zero = nullptr;
+  BasicBlock* join_a = nullptr;
+  BasicBlock* join_b = nullptr;
+  BasicBlock* join_zero = nullptr;
+  if (!MatchConstStoreBlock(store_one_from_and, 1, &ptr_a, &join_a) ||
+      !MatchConstStoreBlock(store_one_from_else, 1, &ptr_b, &join_b) ||
+      !MatchConstStoreBlock(store_zero, 0, &ptr_zero, &join_zero) ||
+      !SameValue(ptr_a, ptr_b) || !SameValue(ptr_a, ptr_zero)) {
+    return false;
+  }
+
+  // Only the shape where both else-side stores meet in a forwarding block
+  // (`after_else`) that then jumps to the join is handled.
+  if (join_b != join_zero || join_b == nullptr) {
+    return false;
+  }
+  auto* after_else = join_b;
+  auto* after_term = dyncast<UncondBrInst>(GetTerminator(after_else));
+  if (after_term == nullptr ||
+      !HasOnlyPredecessors(after_else, {store_one_from_else, store_zero}) ||
+      HasAnyPhi(after_else)) {
+    return false;
+  }
+  // `pred` is redirected straight to `join` below, which makes `after_else`
+  // unreachable and gets it removed. It must therefore be a pure forwarding
+  // block: any other instruction in it (a store, a call, ...) would be
+  // silently dropped by the rewrite.
+  if (after_else->GetInstructions().size() != 1) {
+    return false;
+  }
+  BasicBlock* join = after_term->GetDest();
+
+  if (join == nullptr || join_a != join || HasAnyPhi(join)) {
+    return false;
+  }
+
+  // Build the straight-line replacement just before pred's terminator.
+  std::size_t pos = GetTerminatorIndex(pred);
+  auto* rhs_cmp_clone = pred->Insert<BinaryInst>(
+      pos++, rhs_cmp->GetOpcode(), rhs_cmp->GetType(), rhs_cmp->GetLhs(),
+      rhs_cmp->GetRhs(), nullptr, "%ifconv.rhs");
+  auto* else_cmp_clone = pred->Insert<BinaryInst>(
+      pos++, else_cmp->GetOpcode(), else_cmp->GetType(), else_cmp->GetLhs(),
+      else_cmp->GetRhs(), nullptr, "%ifconv.else");
+  auto* outer_i32 = pred->Insert<ZextInst>(
+      pos++, outer_branch->GetCondition(), Type::GetInt32Type(), nullptr,
+      "%ifconv.outer");
+  auto* rhs_i32 = pred->Insert<ZextInst>(
+      pos++, rhs_cmp_clone, Type::GetInt32Type(), nullptr, "%ifconv.rhs.i32");
+  auto* else_i32 = pred->Insert<ZextInst>(
+      pos++, else_cmp_clone, Type::GetInt32Type(), nullptr,
+      "%ifconv.else.i32");
+  // (outer && rhs) || else, computed on 0/1 integers.
+  auto* both = pred->Insert<BinaryInst>(
+      pos++, Opcode::And, Type::GetInt32Type(), outer_i32, rhs_i32, nullptr,
+      "%ifconv.and");
+  auto* result = pred->Insert<BinaryInst>(
+      pos++, Opcode::Or, Type::GetInt32Type(), both, else_i32, nullptr,
+      "%ifconv.store");
+  pred->Insert<StoreInst>(pos++, result, ptr_a, nullptr);
+
+  RedirectBlockTo(pred, join);
+  passutils::RemoveUnreachableBlocks(function);
+  return true;
+}
+
 struct ConditionalAccumulation {
  Value* base = nullptr;
  Value* delta = nullptr;
@ -214,7 +416,8 @@ bool RunIfConversionOnFunction(Function& function) {
    local_changed = false;
    auto blocks = passutils::CollectReachableBlocks(function);
    for (auto* block : blocks) {
-      if (TryConvertConditionalAccumulation(function, block)) {
+      if (TryConvertNestedBooleanStore(function, block) ||
+          TryConvertConditionalAccumulation(function, block)) {
        local_changed = true;
        changed = true;
        break;
--- a/src/ir/passes/Inline.cpp
+++ b/src/ir/passes/Inline.cpp
@ -17,6 +17,7 @@ namespace {
 struct InlineCandidateInfo {
  bool valid = false;
  int cost = 0;
+  int return_count = 0;
  bool has_nested_call = false;
  bool has_control_flow = false;
 };
@ -124,6 +125,7 @@ InlineCandidateInfo AnalyzeInlineCandidate(const Function& function) {
          return {};
        }
        saw_return = true;
+        ++info.return_count;
        continue;
      }
      if ((dyncast<UncondBrInst>(inst) || dyncast<CondBrInst>(inst)) &&
@ -143,6 +145,9 @@ InlineCandidateInfo AnalyzeInlineCandidate(const Function& function) {
  if (!saw_return) {
    return {};
  }
+  if (info.has_control_flow && info.return_count != 1) {
+    return {};
+  }

  info.valid = true;
  return info;
--- a/src/ir/passes/LoadStoreElim.cpp
+++ b/src/ir/passes/LoadStoreElim.cpp
@ -1,9 +1,12 @@
 #include "ir/PassManager.h"

+#include "ir/Analysis.h"
 #include "ir/IR.h"
+#include "LoopMemoryUtils.h"
 #include "MemoryUtils.h"
 #include "PassUtils.h"

+#include <cstdint>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@ -86,6 +89,263 @@ void InvalidateStatesForCall(MemoryState& state, Function* callee) {
  }
 }

+// Reports whether two address components compare equal via operator==.
+bool IsSameComponent(const memutils::AddressComponent& lhs,
+                     const memutils::AddressComponent& rhs) {
+  const bool identical = (lhs == rhs);
+  return identical;
+}
+
+// Matches `value` against the shapes `base`, `base + C`, `C + base` and
+// `base - C`, where C is a ConstantInt. On success the signed constant offset
+// is written to `delta`; on failure `delta` is left untouched.
+bool MatchValuePlusConst(Value* value, Value* base, std::int64_t& delta) {
+  if (value == nullptr || base == nullptr) {
+    return false;
+  }
+  if (value == base) {
+    delta = 0;
+    return true;
+  }
+
+  auto* binary = dyncast<BinaryInst>(value);
+  if (binary == nullptr) {
+    return false;
+  }
+
+  // Pick the operand that has to be the constant, plus the sign it carries.
+  Value* constant_side = nullptr;
+  std::int64_t sign = 0;
+  const auto opcode = binary->GetOpcode();
+  if (opcode == Opcode::Add) {
+    sign = 1;
+    if (binary->GetLhs() == base) {
+      constant_side = binary->GetRhs();
+    } else if (binary->GetRhs() == base) {
+      constant_side = binary->GetLhs();
+    }
+  } else if (opcode == Opcode::Sub && binary->GetLhs() == base) {
+    sign = -1;
+    constant_side = binary->GetRhs();
+  }
+  if (constant_side == nullptr) {
+    return false;
+  }
+
+  auto* literal = dyncast<ConstantInt>(constant_side);
+  if (literal == nullptr) {
+    return false;
+  }
+  delta = sign * static_cast<std::int64_t>(literal->GetValue());
+  return true;
+}
+
+// Proves that two address keys differing in exactly one component cannot name
+// the same element when the varying component is the loop induction variable
+// and the other one is a value the induction variable starts strictly past.
+// Returns false whenever that proof cannot be established.
+bool IsTriangularIVNoAlias(const memutils::AddressKey& invariant_key,
+                           const memutils::AddressKey& varying_key,
+                           const loopmem::SimpleInductionVar& iv) {
+  if (iv.phi == nullptr || iv.stride == 0) {
+    return false;
+  }
+  if (invariant_key.kind != varying_key.kind ||
+      invariant_key.root != varying_key.root ||
+      invariant_key.components.size() != varying_key.components.size()) {
+    return false;
+  }
+
+  // Find the single differing component; `count` doubles as "none found".
+  const std::size_t count = invariant_key.components.size();
+  std::size_t mismatch = count;
+  for (std::size_t idx = 0; idx < count; ++idx) {
+    if (IsSameComponent(invariant_key.components[idx],
+                        varying_key.components[idx])) {
+      continue;
+    }
+    if (mismatch != count) {
+      return false;  // More than one component differs.
+    }
+    mismatch = idx;
+  }
+  if (mismatch == count) {
+    return false;  // The keys are identical component-wise.
+  }
+
+  const auto& fixed = invariant_key.components[mismatch];
+  const auto& moving = varying_key.components[mismatch];
+  if (fixed.is_constant || moving.is_constant || moving.value != iv.phi) {
+    return false;
+  }
+
+  std::int64_t start_delta = 0;
+  if (!MatchValuePlusConst(iv.start, fixed.value, start_delta)) {
+    return false;
+  }
+
+  // If the loop IV starts strictly beyond the invariant index and moves farther
+  // away, accesses like A[i][k] and A[j][k] cannot refer to the same element.
+  if (iv.stride > 0) {
+    return start_delta > 0;
+  }
+  return iv.stride < 0 && start_delta < 0;
+}
+
+// Returns true when a store to `store_key` inside the loop is proven unable to
+// overwrite the location named by `load_key`.
+bool LoopStoreCannotClobber(const memutils::AddressKey& load_key,
+                            const memutils::AddressKey& store_key,
+                            const loopmem::SimpleInductionVar& iv) {
+  // A store to the very same key always clobbers.
+  if (load_key == store_key) {
+    return false;
+  }
+  // Otherwise either the conservative alias query or the triangular
+  // induction-variable argument has to rule the overlap out.
+  return !memutils::MayAliasConservatively(load_key, store_key) ||
+         IsTriangularIVNoAlias(load_key, store_key, iv);
+}
+
+// Scans the leading phi instructions of the loop header and stores the first
+// one accepted by loopmem::MatchSimpleInductionVariable into `iv`.
+// Returns false when the loop has no header/preheader or no phi matches.
+bool FindSimpleLoopIV(const Loop& loop, loopmem::SimpleInductionVar& iv) {
+  if (loop.header == nullptr || loop.preheader == nullptr) {
+    return false;
+  }
+  const auto& header_insts = loop.header->GetInstructions();
+  for (auto it = header_insts.begin(); it != header_insts.end(); ++it) {
+    auto* candidate = dyncast<PhiInst>(it->get());
+    if (candidate == nullptr) {
+      // The scan covers only the phi run at the top of the block.
+      return false;
+    }
+    if (loopmem::MatchSimpleInductionVariable(loop, loop.preheader, candidate,
+                                              iv)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Walks the preheader top to bottom and returns the last store whose exact
+// address key equals `load_key` and that is not followed, within the
+// preheader, by anything that may overwrite that location. Returns nullptr
+// when there is no preheader or no such store survives the walk.
+StoreInst* FindPreheaderStoreForLoad(const memutils::EscapeSummary& escapes,
+                                     BasicBlock* preheader,
+                                     const memutils::AddressKey& load_key) {
+  if (!preheader) {
+    return nullptr;
+  }
+
+  // `candidate` is the most recent matching store that is still known to be
+  // intact; any potentially clobbering instruction resets it.
+  StoreInst* candidate = nullptr;
+  for (const auto& inst_ptr : preheader->GetInstructions()) {
+    auto* inst = inst_ptr.get();
+    if (inst->IsTerminator()) {
+      break;
+    }
+
+    if (auto* store = dyncast<StoreInst>(inst)) {
+      memutils::AddressKey store_key;
+      if (!memutils::BuildExactAddressKey(store->GetPtr(), &escapes, store_key)) {
+        // Unknown destination: assume it may have hit the loaded location.
+        candidate = nullptr;
+        continue;
+      }
+      if (store_key == load_key) {
+        candidate = store;
+        continue;
+      }
+      if (memutils::MayAliasConservatively(store_key, load_key)) {
+        candidate = nullptr;
+      }
+      continue;
+    }
+
+    if (auto* call = dyncast<CallInst>(inst)) {
+      if (memutils::CallMayWriteRoot(call->GetCallee(), load_key.kind)) {
+        candidate = nullptr;
+      }
+      continue;
+    }
+
+    if (auto* memset = dyncast<MemsetInst>(inst)) {
+      memutils::AddressKey memset_key;
+      // A memset with an unknown or possibly overlapping destination
+      // invalidates the candidate as well.
+      if (!memutils::BuildExactAddressKey(memset->GetDest(), &escapes,
+                                          memset_key) ||
+          memutils::MayAliasConservatively(memset_key, load_key)) {
+        candidate = nullptr;
+      }
+      continue;
+    }
+  }
+  return candidate;
+}
+
+// Checks every instruction of the loop and returns true only if none of them
+// may overwrite the location identified by `load_key`: stores must be proven
+// harmless, calls must not write the key's root kind, and memsets must have a
+// known, non-aliasing destination.
+bool CanForwardPreheaderStoreThroughLoop(
+    const memutils::EscapeSummary& escapes, const Loop& loop,
+    const loopmem::SimpleInductionVar& iv,
+    const memutils::AddressKey& load_key) {
+  // Returns true when `inst` might modify the loaded location.
+  const auto may_clobber = [&](Instruction* inst) -> bool {
+    if (auto* store = dyncast<StoreInst>(inst)) {
+      memutils::AddressKey store_key;
+      if (!memutils::BuildExactAddressKey(store->GetPtr(), &escapes,
+                                          store_key)) {
+        return true;
+      }
+      return !LoopStoreCannotClobber(load_key, store_key, iv);
+    }
+    if (auto* call = dyncast<CallInst>(inst)) {
+      return memutils::CallMayWriteRoot(call->GetCallee(), load_key.kind);
+    }
+    if (auto* memset = dyncast<MemsetInst>(inst)) {
+      memutils::AddressKey memset_key;
+      if (!memutils::BuildExactAddressKey(memset->GetDest(), &escapes,
+                                          memset_key)) {
+        return true;
+      }
+      return memutils::MayAliasConservatively(memset_key, load_key);
+    }
+    return false;
+  };
+
+  for (auto* block : loop.block_list) {
+    for (const auto& owned : block->GetInstructions()) {
+      if (may_clobber(owned.get())) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// For each loop with a simple induction variable, replaces loads inside the
+// loop by the value that a store in the preheader wrote to the same exact
+// address, provided nothing in the preheader after that store and nothing in
+// the loop may overwrite the location. Returns true if any load was replaced.
+bool OptimizeLoopPreheaderStoreForwarding(Function& function,
+                                          const memutils::EscapeSummary& escapes) {
+  DominatorTree dom_tree(function);
+  LoopInfo loop_info(function, dom_tree);
+  bool changed = false;
+
+  for (auto* loop : loop_info.GetLoopsInPostOrder()) {
+    loopmem::SimpleInductionVar iv;
+    if (!FindSimpleLoopIV(*loop, iv)) {
+      continue;
+    }
+
+    // Erasing is deferred so the instruction lists are not mutated while they
+    // are being iterated.
+    std::vector<LoadInst*> to_remove;
+    for (auto* block : loop->block_list) {
+      for (const auto& inst_ptr : block->GetInstructions()) {
+        auto* load = dyncast<LoadInst>(inst_ptr.get());
+        if (!load) {
+          continue;
+        }
+
+        memutils::AddressKey load_key;
+        if (!memutils::BuildExactAddressKey(load->GetPtr(), &escapes, load_key)) {
+          continue;
+        }
+        auto* store =
+            FindPreheaderStoreForLoad(escapes, loop->preheader, load_key);
+        // Skip when there is no usable store, when the store would forward the
+        // load to itself, or when the store does not dominate the load.
+        if (!store || store->GetValue() == load ||
+            !dom_tree.Dominates(store, load)) {
+          continue;
+        }
+        if (!CanForwardPreheaderStoreThroughLoop(escapes, *loop, iv, load_key)) {
+          continue;
+        }
+
+        load->ReplaceAllUsesWith(store->GetValue());
+        to_remove.push_back(load);
+        changed = true;
+      }
+    }
+
+    for (auto* load : to_remove) {
+      if (auto* parent = load->GetParent()) {
+        parent->EraseInstruction(load);
+      }
+    }
+  }
+  return changed;
+}
+
 void SimulateInstruction(const memutils::EscapeSummary& escapes, Instruction* inst,
                         MemoryState& state) {
  if (!inst) {
@ -302,6 +562,7 @@ bool RunLoadStoreElimOnFunction(Function& function) {
  }

  bool changed = false;
+  changed |= OptimizeLoopPreheaderStoreForwarding(function, escapes);
  for (auto* block : reachable_blocks) {
    changed |= OptimizeBlock(escapes, block, in_states[block]);
  }
--- a/src/ir/passes/LoopFission.cpp
+++ b/src/ir/passes/LoopFission.cpp
@ -56,6 +56,18 @@ Opcode SwapCompareOpcode(Opcode opcode) {
  }
 }

+// Accepts only the four ordered integer comparisons (<, <=, >, >=).
+bool IsSupportedCompareOpcode(Opcode opcode) {
+  return opcode == Opcode::ICmpLT || opcode == Opcode::ICmpLE ||
+         opcode == Opcode::ICmpGT || opcode == Opcode::ICmpGE;
+}
+
 bool MatchFissionLoop(Loop& loop, FissionLoopInfo& info) {
  if (!loop.preheader || !loop.header || !loop.IsInnermost()) {
    return false;
@ -90,7 +102,9 @@ bool MatchFissionLoop(Loop& loop, FissionLoopInfo& info) {

  auto* branch = dyncast<CondBrInst>(looputils::GetTerminator(loop.header));
  auto* compare = branch ? dyncast<BinaryInst>(branch->GetCondition()) : nullptr;
-  if (!branch || branch->GetThenBlock() != body || !compare) {
+  if (!branch || branch->GetThenBlock() != body || !compare ||
+      !compare->GetType()->IsBool() ||
+      !IsSupportedCompareOpcode(compare->GetOpcode())) {
    return false;
  }

@ -107,6 +121,13 @@ bool MatchFissionLoop(Loop& loop, FissionLoopInfo& info) {
    return false;
  }

+  if ((induction_var.stride > 0 &&
+       !(compare_opcode == Opcode::ICmpLT || compare_opcode == Opcode::ICmpLE)) ||
+      (induction_var.stride < 0 &&
+       !(compare_opcode == Opcode::ICmpGT || compare_opcode == Opcode::ICmpGE))) {
+    return false;
+  }
+
  auto* step_inst = dyncast<BinaryInst>(induction_var.latch_value);
  if (!step_inst || step_inst->GetParent() != body) {
    return false;
--- a/src/ir/passes/LoopMemoryUtils.h
+++ b/src/ir/passes/LoopMemoryUtils.h
@ -6,7 +6,9 @@
 #include <algorithm>
 #include <cstdint>
 #include <cstdlib>
+#include <functional>
 #include <memory>
+#include <utility>
 #include <vector>

 namespace ir::loopmem {
@ -116,17 +118,54 @@ struct AffineExpr {
  Value* var = nullptr;
  std::int64_t coeff = 0;
  std::int64_t constant = 0;
+  std::vector<std::pair<Value*, std::int64_t>> invariant_terms;
 };

 inline AffineExpr MakeConst(std::int64_t value) {
-  return {true, nullptr, 0, value};
+  return {true, nullptr, 0, value, {}};
+}
+
+// Brings a list of (value, coefficient) terms into canonical form: sorted by
+// value pointer, one entry per value with the coefficients summed, and without
+// null values or entries whose coefficient is zero.
+inline void NormalizeInvariantTerms(std::vector<std::pair<Value*, std::int64_t>>& terms) {
+  std::sort(terms.begin(), terms.end(),
+            [](const auto& lhs, const auto& rhs) {
+              return std::less<Value*>{}(lhs.first, rhs.first);
+            });
+
+  // After sorting, equal values are adjacent; fold each run into one entry.
+  std::vector<std::pair<Value*, std::int64_t>> merged;
+  std::size_t cursor = 0;
+  while (cursor < terms.size()) {
+    Value* const key = terms[cursor].first;
+    std::int64_t total = 0;
+    for (; cursor < terms.size() && terms[cursor].first == key; ++cursor) {
+      total += terms[cursor].second;
+    }
+    if (key != nullptr && total != 0) {
+      merged.push_back({key, total});
+    }
+  }
+  terms = std::move(merged);
+}
+
+// Builds a valid affine expression consisting of the single term 1 * value.
+// A null `value` yields a default-constructed AffineExpr.
+inline AffineExpr MakeInvariant(Value* value) {
+  if (value == nullptr) {
+    return AffineExpr{};
+  }
+  AffineExpr result;
+  result.valid = true;
+  result.var = nullptr;
+  result.coeff = 0;
+  result.constant = 0;
+  result.invariant_terms.push_back({value, 1});
+  NormalizeInvariantTerms(result.invariant_terms);
+  return result;
+}

 inline AffineExpr Scale(const AffineExpr& expr, std::int64_t factor) {
  if (!expr.valid) {
    return {};
  }
-  return {true, expr.var, expr.coeff * factor, expr.constant * factor};
+  auto terms = expr.invariant_terms;
+  for (auto& term : terms) {
+    term.second *= factor;
+  }
+  NormalizeInvariantTerms(terms);
+  return {true, expr.var, expr.coeff * factor, expr.constant * factor, std::move(terms)};
 }

 inline AffineExpr Combine(const AffineExpr& lhs, const AffineExpr& rhs, int sign) {
@ -141,9 +180,19 @@ inline AffineExpr Combine(const AffineExpr& lhs, const AffineExpr& rhs, int sign
  out.var = lhs.var ? lhs.var : rhs.var;
  out.coeff = lhs.coeff + sign * rhs.coeff;
  out.constant = lhs.constant + sign * rhs.constant;
+  out.invariant_terms = lhs.invariant_terms;
+  out.invariant_terms.reserve(out.invariant_terms.size() + rhs.invariant_terms.size());
+  for (const auto& term : rhs.invariant_terms) {
+    out.invariant_terms.push_back({term.first, sign * term.second});
+  }
+  NormalizeInvariantTerms(out.invariant_terms);
  return out;
 }

+// Compares the invariant term lists of two affine expressions element-wise.
+inline bool SameInvariantTerms(const AffineExpr& lhs, const AffineExpr& rhs) {
+  const auto& left_terms = lhs.invariant_terms;
+  const auto& right_terms = rhs.invariant_terms;
+  return left_terms == right_terms;
+}
+
 inline AffineExpr AnalyzeAffine(Value* value, PhiInst* iv, const Loop& loop) {
  if (!value) {
    return {};
@ -152,10 +201,10 @@ inline AffineExpr AnalyzeAffine(Value* value, PhiInst* iv, const Loop& loop) {
    return MakeConst(ci->GetValue());
  }
  if (value == iv) {
-    return {true, iv, 1, 0};
+    return {true, iv, 1, 0, {}};
  }
  if (looputils::IsLoopInvariantValue(loop, value)) {
-    return {};
+    return value->IsInt32() ? MakeInvariant(value) : AffineExpr{};
  }

  if (auto* zext = dyncast<ZextInst>(value)) {
@ -236,26 +285,33 @@ inline PointerInfo AnalyzePointer(Value* pointer, PhiInst* iv, const Loop& loop,
      escapes != nullptr && memutils::BuildExactAddressKey(pointer, escapes, info.exact_key);

  info.invariant_address = looputils::IsLoopInvariantValue(loop, pointer);
-  if (!dyncast<GetElementPtrInst>(pointer)) {
+  std::vector<GetElementPtrInst*> gep_chain;
+  for (auto* gep = dyncast<GetElementPtrInst>(pointer); gep != nullptr;
+       gep = dyncast<GetElementPtrInst>(gep->GetPointer())) {
+    gep_chain.push_back(gep);
+  }
+  if (gep_chain.empty()) {
    info.byte_offset = MakeConst(0);
    return info;
  }

-  auto* gep = static_cast<GetElementPtrInst*>(pointer);
-  std::shared_ptr<Type> current = gep->GetSourceType();
+  std::reverse(gep_chain.begin(), gep_chain.end());
  AffineExpr total = MakeConst(0);
-  bool all_indices_loop_invariant = looputils::IsLoopInvariantValue(loop, gep->GetPointer());
-  for (std::size_t i = 0; i < gep->GetNumIndices(); ++i) {
-    auto* index = gep->GetIndex(i);
-    all_indices_loop_invariant &= looputils::IsLoopInvariantValue(loop, index);
-    const std::int64_t stride = current ? current->GetSize() : 0;
-    auto term = AnalyzeAffine(index, iv, loop);
-    if (!term.valid) {
-      total = {};
-    } else if (total.valid) {
-      total = Combine(total, Scale(term, stride), +1);
+  bool all_indices_loop_invariant = looputils::IsLoopInvariantValue(loop, info.base);
+  for (auto* gep : gep_chain) {
+    std::shared_ptr<Type> current = gep->GetSourceType();
+    for (std::size_t i = 0; i < gep->GetNumIndices(); ++i) {
+      auto* index = gep->GetIndex(i);
+      all_indices_loop_invariant &= looputils::IsLoopInvariantValue(loop, index);
+      const std::int64_t stride = current ? current->GetSize() : 0;
+      auto term = AnalyzeAffine(index, iv, loop);
+      if (!term.valid) {
+        total = {};
+      } else if (total.valid) {
+        total = Combine(total, Scale(term, stride), +1);
+      }
+      current = AdvanceGEPType(current);
    }
-    current = AdvanceGEPType(current);
  }
  info.invariant_address = all_indices_loop_invariant;
  info.byte_offset = total;
@ -303,7 +359,8 @@ inline bool SameAffineAddress(const PointerInfo& lhs, const PointerInfo& rhs) {
  return lhs.base == rhs.base && lhs.byte_offset.valid && rhs.byte_offset.valid &&
         lhs.byte_offset.var == rhs.byte_offset.var &&
         lhs.byte_offset.coeff == rhs.byte_offset.coeff &&
-         lhs.byte_offset.constant == rhs.byte_offset.constant;
+         lhs.byte_offset.constant == rhs.byte_offset.constant &&
+         SameInvariantTerms(lhs.byte_offset, rhs.byte_offset);
 }

 inline bool MayAliasSameIteration(const PointerInfo& lhs, const PointerInfo& rhs) {
@ -328,8 +385,11 @@ inline bool MayAliasSameIteration(const PointerInfo& lhs, const PointerInfo& rhs
  if (lhs.byte_offset.coeff != rhs.byte_offset.coeff) {
    return true;
  }
+  if (!SameInvariantTerms(lhs.byte_offset, rhs.byte_offset)) {
+    return true;
+  }
  const auto diff = std::llabs(lhs.byte_offset.constant - rhs.byte_offset.constant);
-  const auto overlap = std::min(lhs.access_size, rhs.access_size);
+  const auto overlap = std::max(lhs.access_size, rhs.access_size);
  return diff < overlap;
 }

@ -354,6 +414,9 @@ inline bool HasCrossIterationDependence(const PointerInfo& lhs, const PointerInf
  if (lhs.byte_offset.var != rhs.byte_offset.var) {
    return true;
  }
+  if (!SameInvariantTerms(lhs.byte_offset, rhs.byte_offset)) {
+    return true;
+  }

  const auto lhs_step = lhs.byte_offset.coeff * iv_stride;
  const auto rhs_step = rhs.byte_offset.coeff * iv_stride;
@ -361,8 +424,14 @@ inline bool HasCrossIterationDependence(const PointerInfo& lhs, const PointerInf
    return MayAliasSameIteration(lhs, rhs);
  }
  if (lhs_step == rhs_step && lhs_step != 0) {
-    const auto diff = rhs.byte_offset.constant - lhs.byte_offset.constant;
-    return diff != 0 && diff % std::llabs(lhs_step) == 0;
+    const auto period = std::llabs(lhs_step);
+    auto diff = std::llabs(rhs.byte_offset.constant - lhs.byte_offset.constant);
+    diff %= period;
+    if (diff == 0) {
+      return period < std::max(lhs.access_size, rhs.access_size);
+    }
+    diff = std::min(diff, period - diff);
+    return diff < std::max(lhs.access_size, rhs.access_size);
  }
  return true;
 }
--- a/src/ir/passes/LoopParallelize.cpp
+++ b/src/ir/passes/LoopParallelize.cpp
@ -0,0 +1,456 @@
+#include "ir/PassManager.h"
+
+#include "ir/Analysis.h"
+#include "ir/IR.h"
+#include "LoopMemoryUtils.h"
+#include "LoopPassUtils.h"
+#include "utils/OptConfig.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace ir {
+namespace {
+
+// Name of the external runtime function that the dispatch block calls.
+constexpr const char* kRuntimeParallelFor = "__nudtc_parallel_for_i32";
+// Name prefix of generated worker functions; the worker id is appended.
+constexpr const char* kWorkerPrefix = "__nudtc_par_worker_";
+// Name prefix of the globals used to hand captured values to a worker.
+constexpr const char* kCapturePrefix = "__nudtc_par_cap_";
+// Minimum trip count used when NUDTC_PARALLEL_MIN_TRIP is unset or rejected.
+constexpr int kDefaultParallelMinTrip = 8192;
+
+// A value defined outside the loop that the outlined worker needs, together
+// with the global it is passed through.
+struct CaptureInfo {
+  // The original value referenced by the loop body.
+  Value* value = nullptr;
+  // Global slot the caller stores `value` into and the worker loads it from.
+  GlobalValue* slot = nullptr;
+};
+
+// Everything MatchParallelLoop records about a loop it accepted.
+struct ParallelLoopInfo {
+  Loop* loop = nullptr;
+  BasicBlock* preheader = nullptr;
+  BasicBlock* header = nullptr;
+  BasicBlock* body = nullptr;
+  BasicBlock* exit = nullptr;
+  // Value the induction variable receives from the preheader.
+  Value* begin = nullptr;
+  // Right-hand side of the header comparison `iv < end`.
+  Value* end = nullptr;
+  loopmem::SimpleInductionVar iv;
+  // Comparison feeding the header's conditional branch.
+  BinaryInst* compare = nullptr;
+  // Instruction producing the next induction variable value.
+  BinaryInst* step = nullptr;
+  // Outside values the body reads; slots are filled in by CreateCaptureSlots.
+  std::vector<CaptureInfo> captures;
+};
+
+// Returns true when `value` begins with the NUL-terminated string `prefix`.
+bool HasPrefix(const std::string& value, const char* prefix) {
+  const std::string needle(prefix);
+  // compare() clamps the substring length, so a shorter `value` never matches
+  // a non-empty longer prefix.
+  return value.compare(0, needle.size(), needle) == 0;
+}
+
+// Swaps the terminator of `block` for a conditional branch on `cond`. Does
+// nothing when the block is empty or its last instruction is not a
+// terminator. Predecessor/successor lists are not touched here.
+void ReplaceTerminatorWithCondBr(BasicBlock* block, Value* cond,
+                                 BasicBlock* then_block,
+                                 BasicBlock* else_block) {
+  auto& insts = block->GetInstructions();
+  const bool ends_with_terminator =
+      !insts.empty() && insts.back()->IsTerminator();
+  if (!ends_with_terminator) {
+    return;
+  }
+
+  auto& terminator_slot = insts.back();
+  // Drop the old terminator's operands before it is destroyed.
+  terminator_slot->ClearAllOperands();
+
+  auto replacement =
+      std::make_unique<CondBrInst>(cond, then_block, else_block, nullptr);
+  replacement->SetParent(block);
+  terminator_slot = std::move(replacement);
+}
+
+// Reads the environment variable `name` as a base-10 integer.
+//
+// Returns `fallback` when the variable is unset or empty, when its text is not
+// entirely a number (for example "100abc"), or when the parsed value lies
+// outside [min_value, max_value]. Otherwise returns the parsed value.
+int ReadPositiveEnvInt(const char* name, int fallback, int min_value, int max_value) {
+  const char* raw = std::getenv(name);
+  if (raw == nullptr || *raw == '\0') {
+    return fallback;
+  }
+  char* end = nullptr;
+  const long parsed = std::strtol(raw, &end, 10);
+  // strtol stops at the first non-numeric character; requiring `end` to sit
+  // on the terminator rejects values with trailing garbage instead of
+  // silently accepting their numeric prefix.
+  if (end == raw || *end != '\0') {
+    return fallback;
+  }
+  if (parsed < min_value || parsed > max_value) {
+    return fallback;
+  }
+  return static_cast<int>(parsed);
+}
+
+// Detects a preheader that already ends in a conditional branch whose
+// condition is named like a guard emitted by this pass ("%par.guard.").
+bool IsAlreadyParallelGuarded(BasicBlock* preheader) {
+  auto* guard_branch = dyncast<CondBrInst>(looputils::GetTerminator(preheader));
+  if (guard_branch == nullptr) {
+    return false;
+  }
+  auto* condition = guard_branch->GetCondition();
+  if (condition == nullptr) {
+    return false;
+  }
+  // rfind anchored at position 0 is a pure "starts with" test.
+  return condition->GetName().rfind("%par.guard.", 0) == 0;
+}
+
+// Computes the first worker id not used by any existing worker function in
+// `module`: one more than the largest numeric suffix found after the worker
+// prefix, or 0 when there is none.
+int NextWorkerId(const Module& module) {
+  const std::size_t prefix_length = std::string(kWorkerPrefix).size();
+  int candidate = 0;
+  for (const auto& fn : module.GetFunctions()) {
+    if (!fn) {
+      continue;
+    }
+    if (!HasPrefix(fn->GetName(), kWorkerPrefix)) {
+      continue;
+    }
+    const auto id_text = fn->GetName().substr(prefix_length);
+    try {
+      const int next_after = std::stoi(id_text) + 1;
+      candidate = std::max(candidate, next_after);
+    } catch (...) {
+      // A suffix that does not parse as an int contributes nothing.
+    }
+  }
+  return candidate;
+}
+
+// Returns true for operands that need no capture: null, constants, globals,
+// functions, basic blocks, and instructions that live inside `loop`.
+bool IsSupportedExternalValue(const Loop& loop, Value* value) {
+  if (value == nullptr || value->IsConstant()) {
+    return true;
+  }
+  if (dyncast<GlobalValue>(value) || dyncast<Function>(value) ||
+      dyncast<BasicBlock>(value)) {
+    return true;
+  }
+  auto* defining_inst = dyncast<Instruction>(value);
+  if (defining_inst == nullptr) {
+    return false;
+  }
+  return loop.Contains(defining_inst->GetParent());
+}
+
+// Returns true when `value` is an outside value that can be passed to a worker
+// through a capture slot: an int32, float or pointer typed function argument
+// or instruction.
+bool IsCapturableValue(const Loop& loop, Value* value) {
+  // Values that can be referenced directly are never captured.
+  if (IsSupportedExternalValue(loop, value)) {
+    return false;
+  }
+  if (value == nullptr) {
+    return false;
+  }
+  const auto& type = value->GetType();
+  if (!type) {
+    return false;
+  }
+  if (value->IsVoid() || value->IsLabel() || type->IsFunction() ||
+      value->IsArray()) {
+    return false;
+  }
+  const bool scalar_or_pointer =
+      type->IsInt32() || type->IsFloat() || type->IsPointer();
+  if (!scalar_or_pointer) {
+    return false;
+  }
+  if (value->IsArgument()) {
+    return true;
+  }
+  return dyncast<Instruction>(value) != nullptr;
+}
+
+// Appends to `captures`, once each and in first-use order, every outside value
+// read by the loop's non-phi, non-terminator instructions (the IV step is
+// skipped too). Returns false when an operand can neither be referenced
+// directly nor captured, or when its defining block does not dominate the
+// preheader.
+bool CollectCaptures(const Loop& loop, const DominatorTree& dom_tree,
+                     const ParallelLoopInfo& info,
+                     std::vector<Value*>& captures) {
+  std::unordered_set<Value*> recorded;
+
+  // Classifies one operand; returns false when it blocks the transformation.
+  const auto try_capture = [&](Value* operand) -> bool {
+    if (IsSupportedExternalValue(loop, operand)) {
+      return true;
+    }
+    if (!IsCapturableValue(loop, operand)) {
+      return false;
+    }
+    auto* defining_inst = dyncast<Instruction>(operand);
+    if (defining_inst != nullptr &&
+        !dom_tree.Dominates(defining_inst->GetParent(), info.preheader)) {
+      return false;
+    }
+    const bool first_time = recorded.insert(operand).second;
+    if (first_time) {
+      captures.push_back(operand);
+    }
+    return true;
+  };
+
+  for (auto* block : loop.block_list) {
+    for (const auto& owned : block->GetInstructions()) {
+      auto* inst = owned.get();
+      const bool skipped =
+          dyncast<PhiInst>(inst) || inst->IsTerminator() || inst == info.step;
+      if (skipped) {
+        continue;
+      }
+      for (std::size_t idx = 0; idx < inst->GetNumOperands(); ++idx) {
+        if (!try_capture(inst->GetOperand(idx))) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+// Returns true when every user of `step` is the induction phi `iv` itself.
+// Null arguments yield false.
+bool StepOnlyFeedsLoopPhi(BinaryInst* step, PhiInst* iv) {
+  if (step == nullptr || iv == nullptr) {
+    return false;
+  }
+  bool only_phi = true;
+  for (const auto& use : step->GetUses()) {
+    if (dyncast<Instruction>(use.GetUser()) != iv) {
+      only_phi = false;
+      break;
+    }
+  }
+  return only_phi;
+}
+
+// Returns true when one of the leading phis of `exit` has an incoming entry
+// for `header`. Null blocks are reported as true so the caller rejects them.
+bool ExitHasHeaderPhiUse(BasicBlock* exit, BasicBlock* header) {
+  if (exit == nullptr || header == nullptr) {
+    return true;
+  }
+  const auto& exit_insts = exit->GetInstructions();
+  for (auto it = exit_insts.begin(); it != exit_insts.end(); ++it) {
+    auto* exit_phi = dyncast<PhiInst>(it->get());
+    if (exit_phi == nullptr) {
+      // Only the phi run at the top of the block is examined.
+      return false;
+    }
+    if (looputils::GetPhiIncomingIndex(exit_phi, header) >= 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true when some write access has a cross-iteration dependence with
+// itself, as reported by loopmem::HasCrossIterationDependence.
+bool HasWriteSelfDependence(const std::vector<loopmem::MemoryAccessInfo>& accesses,
+                            int iv_stride) {
+  return std::any_of(
+      accesses.begin(), accesses.end(),
+      [iv_stride](const loopmem::MemoryAccessInfo& access) {
+        return access.is_write &&
+               loopmem::HasCrossIterationDependence(access.ptr, access.ptr,
+                                                    iv_stride);
+      });
+}
+
+// Verifies that every body instruction other than the terminator and the IV
+// step can be cloned into a worker (memset and alloca are refused), then
+// collects the outside values the loop needs into `captures`.
+bool BodyUsesOnlyCloneableState(const Loop& loop, const DominatorTree& dom_tree,
+                                const ParallelLoopInfo& info,
+                                std::vector<Value*>& captures) {
+  for (const auto& owned : info.body->GetInstructions()) {
+    auto* current = owned.get();
+    const bool exempt = current->IsTerminator() || current == info.step;
+    if (exempt) {
+      continue;
+    }
+    if (!looputils::IsCloneableInstruction(current)) {
+      return false;
+    }
+    if (dyncast<MemsetInst>(current) || dyncast<AllocaInst>(current)) {
+      return false;
+    }
+  }
+  return CollectCaptures(loop, dom_tree, info, captures);
+}
+
+// Decides whether `loop` can be outlined into a parallel worker and, if so,
+// fills `info` with the pieces the transformation needs.
+//
+// Accepted loops are innermost, have a preheader, are not already guarded by
+// this pass, have exactly one header phi that is a simple induction variable
+// with stride 1, exit through `iv < invariant` in the header, and pass the
+// memory-dependence and capture checks below. No value defined in the loop
+// may be used outside of it. Returns false otherwise; `info` may then be
+// partially written and must not be used.
+bool MatchParallelLoop(Loop& loop, const DominatorTree& dom_tree,
+                       ParallelLoopInfo& info) {
+  if (!loop.preheader || !loop.header || !loop.IsInnermost() ||
+      IsAlreadyParallelGuarded(loop.preheader)) {
+    return false;
+  }
+
+  BasicBlock* body = nullptr;
+  BasicBlock* exit = nullptr;
+  if (!loopmem::GetCanonicalLoopBlocks(loop, body, exit)) {
+    return false;
+  }
+
+  // Collect all header phis; only a lone induction phi is supported.
+  std::vector<PhiInst*> phis;
+  loopmem::SimpleInductionVar iv;
+  bool found_iv = false;
+  for (const auto& inst_ptr : loop.header->GetInstructions()) {
+    auto* phi = dyncast<PhiInst>(inst_ptr.get());
+    if (!phi) {
+      break;
+    }
+    phis.push_back(phi);
+    if (!found_iv && loopmem::MatchSimpleInductionVariable(loop, loop.preheader, phi, iv)) {
+      found_iv = true;
+    }
+  }
+  if (!found_iv || phis.size() != 1 || iv.stride != 1) {
+    return false;
+  }
+
+  auto* branch = dyncast<CondBrInst>(looputils::GetTerminator(loop.header));
+  auto* compare = branch ? dyncast<BinaryInst>(branch->GetCondition()) : nullptr;
+  if (!branch || branch->GetThenBlock() != body || !compare ||
+      compare->GetOpcode() != Opcode::ICmpLT || compare->GetLhs() != iv.phi ||
+      !looputils::IsLoopInvariantValue(loop, compare->GetRhs())) {
+    return false;
+  }
+
+  auto* step = dyncast<BinaryInst>(iv.latch_value);
+  if (!step || step->GetParent() != body || !StepOnlyFeedsLoopPhi(step, iv.phi) ||
+      ExitHasHeaderPhiUse(exit, loop.header)) {
+    return false;
+  }
+
+  // The parallel path branches from the preheader through the dispatch block
+  // straight to the exit, bypassing the header. A value defined inside the
+  // loop therefore no longer dominates code after the loop, so any user
+  // outside the loop (for example the induction variable read after the loop)
+  // makes the transformation invalid.
+  for (auto* block : loop.block_list) {
+    for (const auto& inst_ptr : block->GetInstructions()) {
+      for (const auto& use : inst_ptr->GetUses()) {
+        auto* user = dyncast<Instruction>(use.GetUser());
+        if (user == nullptr || !loop.Contains(user->GetParent())) {
+          return false;
+        }
+      }
+    }
+  }
+
+  info.loop = &loop;
+  info.preheader = loop.preheader;
+  info.header = loop.header;
+  info.body = body;
+  info.exit = exit;
+  const int begin_index = looputils::GetPhiIncomingIndex(iv.phi, loop.preheader);
+  if (begin_index < 0) {
+    return false;
+  }
+  info.begin = iv.phi->GetIncomingValue(begin_index);
+  info.end = compare->GetRhs();
+  info.iv = iv;
+  info.compare = compare;
+  info.step = step;
+  if (!info.begin || !info.end) {
+    return false;
+  }
+
+  // A loop without memory accesses is not outlined, and neither is one with
+  // a write that conflicts with itself across iterations.
+  const auto accesses = loopmem::CollectMemoryAccesses(loop, iv.phi);
+  if (accesses.empty() || HasWriteSelfDependence(accesses, iv.stride) ||
+      !loopmem::IsLoopParallelizable(loop, iv.phi, iv.stride, accesses)) {
+    return false;
+  }
+
+  std::vector<Value*> captures;
+  if (!BodyUsesOnlyCloneableState(loop, dom_tree, info, captures)) {
+    return false;
+  }
+  // Slots are assigned later, once a worker id is known.
+  info.captures.clear();
+  info.captures.reserve(captures.size());
+  for (auto* value : captures) {
+    info.captures.push_back({value, nullptr});
+  }
+  return true;
+}
+
+// Requests the declaration of the parallel-for runtime entry point
+// `void __nudtc_parallel_for_i32(i32 begin, i32 end, i32 step, ptr body)`
+// from the module and sets its effect flags.
+// NOTE(review): this calls Module::CreateFunction on every invocation;
+// presumably CreateFunction returns an existing declaration of the same name
+// rather than adding a duplicate — confirm against Module.
+Function* EnsureRuntime(Module& module) {
+  auto* runtime = module.CreateFunction(
+      kRuntimeParallelFor, Type::GetVoidType(),
+      {Type::GetInt32Type(), Type::GetInt32Type(), Type::GetInt32Type(),
+       Type::GetPointerType()},
+      {"%begin", "%end", "%step", "%body"}, true);
+  // NOTE(review): the meaning of each positional flag is defined by
+  // Function::SetEffectInfo, which is not visible here.
+  runtime->SetEffectInfo(true, true, true, true, false, true, false);
+  return runtime;
+}
+
+// Creates the worker function `void __nudtc_par_worker_<id>(i32 begin, i32 end)`
+// that runs the loop body for every index in [begin, end).
+//
+// Layout of the generated function:
+//   entry      - loads each captured value from its global slot
+//   par.header - induction phi and the `iv < end` test
+//   par.body   - clone of the original body plus `iv + 1`
+//   par.exit   - return
+//
+// Returns nullptr, without adding anything to `module`, when a capture has no
+// value or no slot.
+Function* BuildWorker(Module& module, const ParallelLoopInfo& info, int worker_id) {
+  // Validate before creating the function so that a failure cannot leave a
+  // half-built worker (blocks without terminators) behind in the module.
+  for (const auto& capture : info.captures) {
+    if (!capture.value || !capture.slot) {
+      return nullptr;
+    }
+  }
+
+  auto* worker = module.CreateFunction(
+      std::string(kWorkerPrefix) + std::to_string(worker_id), Type::GetVoidType(),
+      {Type::GetInt32Type(), Type::GetInt32Type()}, {"%begin", "%end"}, false);
+  worker->SetEffectInfo(true, true, true, true, false, false, false);
+
+  auto* entry = worker->CreateBlock("entry");
+  auto* header = worker->CreateBlock("par.header");
+  auto* body = worker->CreateBlock("par.body");
+  auto* exit = worker->CreateBlock("par.exit");
+
+  // Maps values of the original function to their counterparts in the worker.
+  std::unordered_map<Value*, Value*> remap;
+  for (const auto& capture : info.captures) {
+    auto* loaded = entry->Append<LoadInst>(
+        capture.value->GetType(), capture.slot, nullptr,
+        looputils::NextSyntheticName(*worker, "par.cap."));
+    remap[capture.value] = loaded;
+  }
+
+  entry->Append<UncondBrInst>(header, nullptr);
+  entry->AddSuccessor(header);
+  header->AddPredecessor(entry);
+
+  // The worker's induction variable starts at the `begin` argument and is
+  // compared against the `end` argument.
+  auto* worker_iv = header->Append<PhiInst>(
+      Type::GetInt32Type(), nullptr,
+      looputils::NextSyntheticName(*worker, "par.iv."));
+  worker_iv->AddIncoming(worker->GetArgument(0), entry);
+  auto* worker_cmp = header->Append<BinaryInst>(
+      Opcode::ICmpLT, Type::GetBoolType(), worker_iv, worker->GetArgument(1), nullptr,
+      looputils::NextSyntheticName(*worker, "par.cmp."));
+  header->Append<CondBrInst>(worker_cmp, body, exit, nullptr);
+  header->AddSuccessor(body);
+  header->AddSuccessor(exit);
+  body->AddPredecessor(header);
+  exit->AddPredecessor(header);
+
+  // Clone the body; the original terminator and IV step are rebuilt below.
+  remap[info.iv.phi] = worker_iv;
+  for (const auto& inst_ptr : info.body->GetInstructions()) {
+    auto* inst = inst_ptr.get();
+    if (inst->IsTerminator() || inst == info.step) {
+      continue;
+    }
+    looputils::CloneInstruction(*worker, inst, body, remap, "par.");
+  }
+  auto* next_iv = body->Append<BinaryInst>(
+      Opcode::Add, Type::GetInt32Type(), worker_iv, looputils::ConstInt(1), nullptr,
+      looputils::NextSyntheticName(*worker, "par.next."));
+  worker_iv->AddIncoming(next_iv, body);
+  body->Append<UncondBrInst>(header, nullptr);
+  body->AddSuccessor(header);
+  header->AddPredecessor(body);
+
+  exit->Append<ReturnInst>(nullptr, nullptr);
+  return worker;
+}
+
+// Creates one global per capture, named `__nudtc_par_cap_<worker>_<index>` and
+// typed like the captured value, and records it in the capture's `slot`.
+void CreateCaptureSlots(Module& module, ParallelLoopInfo& info, int worker_id) {
+  std::size_t index = 0;
+  for (auto& capture : info.captures) {
+    const std::string slot_name = std::string(kCapturePrefix) +
+                                  std::to_string(worker_id) + "_" +
+                                  std::to_string(index);
+    capture.slot = module.CreateGlobalValue(slot_name, capture.value->GetType(),
+                                            false, nullptr);
+    ++index;
+  }
+}
+
+// Outlines the first loop of `function` accepted by MatchParallelLoop.
+//
+// The preheader gets a run-time guard `end - begin >= min_trip`. When the
+// guard holds, control goes to a new dispatch block that stores the captures,
+// calls the runtime with the worker, and jumps to the loop exit; otherwise
+// the original loop runs. `*worker_id` is advanced for each loop attempted.
+// Returns true after transforming one loop, false if nothing was transformed.
+bool ParallelizeFirstLoopInFunction(Module& module, Function& function, int* worker_id) {
+  // Never transform external functions, generated workers or the runtime.
+  if (function.IsExternal() || !function.GetEntryBlock() ||
+      HasPrefix(function.GetName(), kWorkerPrefix) ||
+      function.GetName() == kRuntimeParallelFor) {
+    return false;
+  }
+
+  DominatorTree dom_tree(function);
+  LoopInfo loop_info(function, dom_tree);
+  for (auto* loop : loop_info.GetLoopsInPostOrder()) {
+    ParallelLoopInfo info;
+    if (!MatchParallelLoop(*loop, dom_tree, info)) {
+      continue;
+    }
+
+    auto* runtime = EnsureRuntime(module);
+    const int current_worker_id = (*worker_id)++;
+    CreateCaptureSlots(module, info, current_worker_id);
+    auto* worker = BuildWorker(module, info, current_worker_id);
+    // NOTE(review): the runtime declaration and capture slots created above
+    // stay in the module on this path even though false is returned.
+    if (worker == nullptr) {
+      return false;
+    }
+    auto* parallel_block = function.CreateBlock(
+        looputils::NextSyntheticBlockName(function, "par.dispatch"));
+
+    // The threshold can be overridden through NUDTC_PARALLEL_MIN_TRIP.
+    const int min_trip =
+        ReadPositiveEnvInt("NUDTC_PARALLEL_MIN_TRIP", kDefaultParallelMinTrip, 1,
+                           1 << 30);
+    auto* trip_count = info.preheader->Insert<BinaryInst>(
+        looputils::GetTerminatorIndex(info.preheader), Opcode::Sub,
+        Type::GetInt32Type(), info.end, info.begin, nullptr,
+        looputils::NextSyntheticName(function, "par.trip."));
+    auto* large_enough = info.preheader->Insert<BinaryInst>(
+        looputils::GetTerminatorIndex(info.preheader), Opcode::ICmpGE,
+        Type::GetBoolType(), trip_count, looputils::ConstInt(min_trip), nullptr,
+        looputils::NextSyntheticName(function, "par.guard."));
+
+    // Guard true -> dispatch block, guard false -> original loop header.
+    ReplaceTerminatorWithCondBr(info.preheader, large_enough, parallel_block,
+                                info.header);
+    info.preheader->AddSuccessor(parallel_block);
+    parallel_block->AddPredecessor(info.preheader);
+
+    // Publish captured values to their slots before the runtime call.
+    for (const auto& capture : info.captures) {
+      parallel_block->Append<StoreInst>(capture.value, capture.slot, nullptr);
+    }
+    parallel_block->Append<CallInst>(
+        runtime,
+        std::vector<Value*>{info.begin, info.end, looputils::ConstInt(1), worker},
+        nullptr, "");
+    // The parallel path skips the original loop entirely.
+    parallel_block->Append<UncondBrInst>(info.exit, nullptr);
+    parallel_block->AddSuccessor(info.exit);
+    info.exit->AddPredecessor(parallel_block);
+    return true;
+  }
+  return false;
+}
+
+}  // namespace
+
+// Pass entry point: tries to parallelize one loop in every non-external
+// function of `module`. Does nothing when NUDTC_DISABLE_LOOP_PARALLELIZE is
+// set. Returns true if any function was changed.
+bool RunLoopParallelize(Module& module) {
+  if (utils::IsEnvFlagSet("NUDTC_DISABLE_LOOP_PARALLELIZE")) {
+    return false;
+  }
+
+  // Snapshot the candidates first: the transformation adds functions to the
+  // module while this list is being processed.
+  std::vector<Function*> candidates;
+  for (const auto& owned : module.GetFunctions()) {
+    if (!owned || owned->IsExternal()) {
+      continue;
+    }
+    candidates.push_back(owned.get());
+  }
+
+  int worker_id = NextWorkerId(module);
+  bool any_changed = false;
+  for (Function* target : candidates) {
+    if (ParallelizeFirstLoopInFunction(module, *target, &worker_id)) {
+      any_changed = true;
+    }
+  }
+  return any_changed;
+}
+
+}  // namespace ir
--- a/src/ir/passes/LoopStrengthReduction.cpp
+++ b/src/ir/passes/LoopStrengthReduction.cpp
@ -21,7 +21,11 @@ struct InductionVarInfo {

 struct GepReductionCandidate {
+  // The in-loop GEP whose uses get replaced by the reduced pointer.
  GetElementPtrInst* gep = nullptr;
-  std::vector<Value*> init_indices;
+  // One GEP index as seen on loop entry: `base` plus a constant `offset`.
+  struct InitIndex {
+    Value* base = nullptr;
+    int offset = 0;
+  };
+  // Entry-time indices, one per index of `gep`, in the same order.
+  std::vector<InitIndex> init_indices;
+  // NOTE(review): presumably the per-iteration pointer advance in elements;
+  // its computation is outside this hunk — confirm.
  int step_elements = 0;
 };

@ -72,6 +76,21 @@ Value* BuildScaledValue(Function& function, BasicBlock* block, Value* base,
  return BuildMulValue(function, block, base, looputils::ConstInt(factor), prefix);
 }

+// Produces a value equal to `base + offset`, emitting at most one instruction.
+//
+// A zero offset hands `base` back untouched, and a constant base is folded
+// into a fresh constant. Otherwise an Add (positive offset) or a Sub of the
+// magnitude (negative offset) is inserted into `block` at the position given
+// by looputils::GetTerminatorIndex, with a synthetic name built from `prefix`.
+Value* BuildOffsetValue(Function& function, BasicBlock* block, Value* base,
+                        int offset, const std::string& prefix) {
+  if (offset == 0) {
+    return base;
+  }
+
+  auto* folded_base = dyncast<ConstantInt>(base);
+  if (folded_base != nullptr) {
+    return looputils::ConstInt(folded_base->GetValue() + offset);
+  }
+
+  // Negative offsets become a subtraction of the magnitude.
+  const bool subtract = offset < 0;
+  return block->Insert<BinaryInst>(
+      looputils::GetTerminatorIndex(block),
+      subtract ? Opcode::Sub : Opcode::Add, Type::GetInt32Type(), base,
+      looputils::ConstInt(std::abs(offset)), nullptr,
+      looputils::NextSyntheticName(function, prefix));
+}
+
 bool MatchSimpleInductionVariable(const Loop& loop, BasicBlock* preheader,
                                  PhiInst* phi, InductionVarInfo& info) {
  if (!phi || !phi->GetType() || !phi->GetType()->IsInt32() ||
@ -95,7 +114,8 @@ bool MatchSimpleInductionVariable(const Loop& loop, BasicBlock* preheader,
  }

  auto* step_inst = dyncast<BinaryInst>(phi->GetIncomingValue(latch_index));
-  if (!step_inst || step_inst->GetParent() != latch) {
+  if (!step_inst || step_inst->GetParent() == nullptr ||
+      !loop.Contains(step_inst->GetParent())) {
    return false;
  }

@ -157,6 +177,51 @@ bool IsMulCandidate(const Loop& loop, Instruction* inst, PhiInst* phi, Value*& f
  return false;
 }

+// Recognises a GEP index of the form `iv`, `iv + C`, `C + iv` or `iv - C`,
+// where C is an integer constant and the arithmetic lives inside `loop`.
+//
+// On a match, `*init_base` receives the induction variable's start value and
+// `*init_offset` the signed constant (0 for the bare phi, -C for `iv - C`),
+// and true is returned. On a mismatch both outputs are left untouched.
+bool MatchAffineIVIndex(const Loop& loop, Value* index,
+                        const InductionVarInfo& iv, Value** init_base,
+                        int* init_offset) {
+  const auto accept = [&](int delta) {
+    *init_base = iv.start;
+    *init_offset = delta;
+    return true;
+  };
+
+  if (index == iv.phi) {
+    return accept(0);
+  }
+
+  auto* arith = dyncast<BinaryInst>(index);
+  if (arith == nullptr) {
+    return false;
+  }
+  if (arith->GetParent() == nullptr || !loop.Contains(arith->GetParent())) {
+    return false;
+  }
+
+  // Pick out the constant operand paired with the induction phi, if any.
+  ConstantInt* constant = nullptr;
+  bool negate = false;
+  switch (arith->GetOpcode()) {
+    case Opcode::Add:
+      if (arith->GetLhs() == iv.phi) {
+        constant = dyncast<ConstantInt>(arith->GetRhs());
+      }
+      if (constant == nullptr && arith->GetRhs() == iv.phi) {
+        constant = dyncast<ConstantInt>(arith->GetLhs());
+      }
+      break;
+    case Opcode::Sub:
+      // Only `iv - C` is affine in the required sense; `C - iv` is not.
+      if (arith->GetLhs() == iv.phi) {
+        constant = dyncast<ConstantInt>(arith->GetRhs());
+        negate = true;
+      }
+      break;
+    default:
+      break;
+  }
+
+  if (constant == nullptr) {
+    return false;
+  }
+  const int magnitude = constant->GetValue();
+  return accept(negate ? -magnitude : magnitude);
+}
+
 Value* CreateReducedPhi(Function& function, BasicBlock* header, BasicBlock* preheader,
                        const InductionVarInfo& iv, Value* factor) {
  auto* reduced_phi = header->Insert<PhiInst>(
@ -288,7 +353,7 @@ bool BuildGepReductionCandidate(const Loop& loop, const InductionVarInfo& iv,
  auto current_type = gep->GetSourceType();
  std::int64_t step_bytes = 0;
  bool saw_iv = false;
-  std::vector<Value*> init_indices;
+  std::vector<GepReductionCandidate::InitIndex> init_indices;
  init_indices.reserve(gep->GetNumIndices());

  for (std::size_t i = 0; i < gep->GetNumIndices(); ++i) {
@ -296,16 +361,18 @@ bool BuildGepReductionCandidate(const Loop& loop, const InductionVarInfo& iv,
    const std::int64_t stride =
        current_type ? static_cast<std::int64_t>(current_type->GetSize()) : 0;

-    if (index == iv.phi) {
+    Value* affine_init = nullptr;
+    int affine_offset = 0;
+    if (MatchAffineIVIndex(loop, index, iv, &affine_init, &affine_offset)) {
      saw_iv = true;
      step_bytes += stride * static_cast<std::int64_t>(iv.stride);
-      init_indices.push_back(iv.start);
+      init_indices.push_back({affine_init, affine_offset});
    } else {
      if (!looputils::IsLoopInvariantValue(loop, index) ||
          !DominatesBlock(dom_tree, index, preheader)) {
        return false;
      }
-      init_indices.push_back(index);
+      init_indices.push_back({index, 0});
    }

    if (current_type && current_type->IsArray()) {
@ -328,12 +395,40 @@ bool BuildGepReductionCandidate(const Loop& loop, const InductionVarInfo& iv,
  return true;
 }

+// Two entry-time indices are the same when they share both the base value
+// (compared by identity) and the constant offset.
+bool SameGepReductionInitIndex(const GepReductionCandidate::InitIndex& lhs,
+                               const GepReductionCandidate::InitIndex& rhs) {
+  if (lhs.base != rhs.base) {
+    return false;
+  }
+  return lhs.offset == rhs.offset;
+}
+
+// Decides whether two candidates describe the same address sequence: same
+// base pointer, same per-iteration step and pairwise-identical entry indices.
+// A candidate without a GEP never compares equal to anything.
+bool SameGepReductionCandidate(const GepReductionCandidate& lhs,
+                               const GepReductionCandidate& rhs) {
+  if (lhs.gep == nullptr || rhs.gep == nullptr) {
+    return false;
+  }
+  if (lhs.gep->GetPointer() != rhs.gep->GetPointer()) {
+    return false;
+  }
+  if (lhs.step_elements != rhs.step_elements) {
+    return false;
+  }
+
+  const std::size_t count = lhs.init_indices.size();
+  if (count != rhs.init_indices.size()) {
+    return false;
+  }
+  std::size_t position = 0;
+  while (position < count &&
+         SameGepReductionInitIndex(lhs.init_indices[position],
+                                   rhs.init_indices[position])) {
+    ++position;
+  }
+  return position == count;
+}
+
 Value* CreateReducedPointerPhi(Function& function, const Loop& loop,
                               BasicBlock* preheader,
                               const GepReductionCandidate& candidate) {
+  std::vector<Value*> init_indices;
+  init_indices.reserve(candidate.init_indices.size());
+  for (const auto& index : candidate.init_indices) {
+    init_indices.push_back(BuildOffsetValue(function, preheader, index.base,
+                                            index.offset, "lsr.idx."));
+  }
+
  auto* init = preheader->Insert<GetElementPtrInst>(
      looputils::GetTerminatorIndex(preheader), candidate.gep->GetSourceType(),
-      candidate.gep->GetPointer(), candidate.init_indices, nullptr,
+      candidate.gep->GetPointer(), init_indices, nullptr,
      looputils::NextSyntheticName(function, "lsr.ptr.init."));

  auto* ptr_phi = loop.header->Insert<PhiInst>(
@ -389,12 +484,27 @@ bool ReduceLoopAddressing(Function& function, const Loop& loop,
      }
    }

+    struct CachedReduction {
+      GepReductionCandidate candidate;
+      Value* value = nullptr;
+    };
+    std::vector<CachedReduction> reduced_cache;
    for (const auto& candidate : candidates) {
      if (candidate.gep == nullptr || candidate.gep->GetParent() == nullptr ||
          candidate.gep->GetUses().empty()) {
        continue;
      }
-      auto* replacement = CreateReducedPointerPhi(function, loop, preheader, candidate);
+      Value* replacement = nullptr;
+      for (const auto& cached : reduced_cache) {
+        if (SameGepReductionCandidate(candidate, cached.candidate)) {
+          replacement = cached.value;
+          break;
+        }
+      }
+      if (replacement == nullptr) {
+        replacement = CreateReducedPointerPhi(function, loop, preheader, candidate);
+        reduced_cache.push_back({candidate, replacement});
+      }
      candidate.gep->ReplaceAllUsesWith(replacement);
      to_remove.push_back(candidate.gep);
      changed = true;
--- a/src/ir/passes/LoopUnroll.cpp
+++ b/src/ir/passes/LoopUnroll.cpp
@ -4,6 +4,7 @@
 #include "ir/IR.h"
 #include "LoopMemoryUtils.h"
 #include "LoopPassUtils.h"
+#include "utils/OptConfig.h"

 #include <unordered_map>
 #include <vector>
@ -110,6 +111,11 @@ int ChooseUnrollFactor(BasicBlock* body) {
      ++mem_ops;
    }
  }
+  const bool aggressive =
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_AGGRESSIVE_LOOP_UNROLL");
+  if (aggressive && inst_count >= 2 && inst_count <= 14 && mem_ops <= 3) {
+    return 4;
+  }
  if (inst_count >= 2 && inst_count <= 6 && mem_ops <= 2) {
    return 4;
  }
@ -140,6 +146,11 @@ bool HasUnsafeLoopCarriedMemoryDependence(
  return false;
 }

+// A call may be duplicated by the unroller only when its callee is known and
+// reports CanDiscardUnusedCall(). A null call or a null callee is unsafe.
+bool IsSafeCallToCloneInUnrolledLoop(CallInst* call) {
+  if (call == nullptr) {
+    return false;
+  }
+  auto* target = call->GetCallee();
+  if (target == nullptr) {
+    return false;
+  }
+  return target->CanDiscardUnusedCall();
+}
+
 bool MatchCountedLoop(Loop& loop, CountedLoopInfo& info) {
  if (!loop.preheader || !loop.header || !loop.IsInnermost()) {
    return false;
@ -224,10 +235,14 @@ bool MatchCountedLoop(Loop& loop, CountedLoopInfo& info) {
    if (inst->IsTerminator()) {
      continue;
    }
-    if (!looputils::IsCloneableInstruction(inst) || dyncast<CallInst>(inst) ||
+    if (!looputils::IsCloneableInstruction(inst) ||
        dyncast<MemsetInst>(inst) || dyncast<AllocaInst>(inst)) {
      return false;
    }
+    if (auto* call = dyncast<CallInst>(inst);
+        call != nullptr && !IsSafeCallToCloneInUnrolledLoop(call)) {
+      return false;
+    }
  }

  info.loop = &loop;
--- a/src/ir/passes/LoopVectorize.cpp
+++ b/src/ir/passes/LoopVectorize.cpp
@ -0,0 +1,607 @@
+#include "ir/PassManager.h"
+
+#include "ir/Analysis.h"
+#include "ir/IR.h"
+#include "LoopMemoryUtils.h"
+#include "LoopPassUtils.h"
+#include "utils/OptConfig.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace ir {
+namespace {
+
+// Loops whose constant bounds give fewer iterations than this are left scalar
+// by the loop matchers in this file.
+constexpr int kMinConstantTripToVectorize = 16;
+
+// Element type processed by a vectorized loop (int32 or float).
+enum class VectorKind { I32, F32 };
+// Element-wise arithmetic a vectorized loop may perform.
+enum class VectorOp { Add, Sub, Mul };
+
+// Everything MatchVectorLoop records about a `dst[i] = lhs[i] <op> rhs[i]`
+// loop so that VectorizeLoop can rewrite it.
+struct VectorLoopInfo {
+  Loop* loop = nullptr;
+  BasicBlock* preheader = nullptr;
+  BasicBlock* header = nullptr;
+  BasicBlock* body = nullptr;
+  BasicBlock* exit = nullptr;
+  loopmem::SimpleInductionVar iv;  // the loop's single induction variable
+  Value* begin = nullptr;          // induction start value (iv.start)
+  Value* end = nullptr;            // loop-invariant bound of the `iv < end` test
+  StoreInst* store = nullptr;      // the one store in the body
+  LoadInst* lhs_load = nullptr;    // load feeding the binary op's left operand
+  LoadInst* rhs_load = nullptr;    // load feeding the binary op's right operand
+  BinaryInst* binary = nullptr;    // the element-wise operation being stored
+  VectorKind kind = VectorKind::I32;
+  VectorOp op = VectorOp::Add;
+};
+
+// Everything MatchFillLoop records about a `dst[i] = value` loop so that
+// VectorizeFillLoop can rewrite it.
+struct FillLoopInfo {
+  Loop* loop = nullptr;
+  BasicBlock* preheader = nullptr;
+  BasicBlock* header = nullptr;
+  BasicBlock* body = nullptr;
+  BasicBlock* exit = nullptr;
+  loopmem::SimpleInductionVar iv;  // the loop's single induction variable
+  Value* begin = nullptr;          // induction start value (iv.start)
+  Value* end = nullptr;            // loop-invariant bound of the `iv < end` test
+  StoreInst* store = nullptr;      // the one store in the body
+  Value* fill_value = nullptr;     // loop-invariant value written by `store`
+  VectorKind kind = VectorKind::I32;
+};
+
+// Tells whether any phi at the top of `exit` has an incoming edge from
+// `header`. Missing blocks are answered with true, the conservative reply.
+bool HasExitPhiUse(BasicBlock* exit, BasicBlock* header) {
+  if (exit == nullptr || header == nullptr) {
+    return true;
+  }
+  for (const auto& entry : exit->GetInstructions()) {
+    auto* phi = dyncast<PhiInst>(entry.get());
+    if (phi == nullptr) {
+      // Scanning stops at the first non-phi instruction.
+      return false;
+    }
+    if (looputils::GetPhiIncomingIndex(phi, header) >= 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// True when every use of the increment `step` is the induction phi `iv`
+// itself. Null arguments yield false.
+bool StepOnlyFeedsLoopPhi(BinaryInst* step, PhiInst* iv) {
+  if (step == nullptr || iv == nullptr) {
+    return false;
+  }
+  for (const auto& use : step->GetUses()) {
+    auto* consumer = dyncast<Instruction>(use.GetUser());
+    if (consumer != iv) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Maps a scalar IR opcode onto the (element kind, vector operation) pair it
+// corresponds to. Integer and float add/sub/mul are supported; for any other
+// opcode the outputs are left untouched and false is returned.
+bool MatchVectorOpcode(Opcode opcode, VectorKind* kind, VectorOp* op) {
+  const auto accept = [kind, op](VectorKind matched_kind, VectorOp matched_op) {
+    *kind = matched_kind;
+    *op = matched_op;
+    return true;
+  };
+
+  switch (opcode) {
+    case Opcode::Add:
+      return accept(VectorKind::I32, VectorOp::Add);
+    case Opcode::Sub:
+      return accept(VectorKind::I32, VectorOp::Sub);
+    case Opcode::Mul:
+      return accept(VectorKind::I32, VectorOp::Mul);
+    case Opcode::FAdd:
+      return accept(VectorKind::F32, VectorOp::Add);
+    case Opcode::FSub:
+      return accept(VectorKind::F32, VectorOp::Sub);
+    case Opcode::FMul:
+      return accept(VectorKind::F32, VectorOp::Mul);
+    default:
+      return false;
+  }
+}
+
+// Checks that `value` exists, is typed, and that its type is the scalar
+// matching `kind` (int32 for I32, float for F32).
+bool IsExpectedScalarType(Value* value, VectorKind kind) {
+  if (!value || !value->GetType()) {
+    return false;
+  }
+  if (kind == VectorKind::I32) {
+    return value->GetType()->IsInt32();
+  }
+  if (kind == VectorKind::F32) {
+    return value->GetType()->IsFloat();
+  }
+  return false;
+}
+
+// An index is usable when it is the induction phi itself or a value that does
+// not vary inside `loop`.
+bool IsDirectVectorIndex(Value* value, PhiInst* iv, const Loop& loop) {
+  if (value == iv) {
+    return true;
+  }
+  return looputils::IsLoopInvariantValue(loop, value);
+}
+
+// Evaluates an index for the first iteration: the induction phi becomes
+// `begin`, loop-invariant values stay as they are, anything else is rejected
+// with nullptr.
+Value* RemapIndexAtBegin(Value* value, PhiInst* iv, Value* begin, const Loop& loop) {
+  if (value == iv) {
+    return begin;
+  }
+  return looputils::IsLoopInvariantValue(loop, value) ? value : nullptr;
+}
+
+// Rebuilds `pointer` as it would be computed on the loop's first iteration and
+// returns the resulting value, or nullptr when that is not possible.
+//
+// A loop-invariant pointer is returned unchanged. Otherwise `pointer` must be
+// a GEP inside `loop` whose indices are each either the induction phi `iv`
+// (replaced by `begin`) or loop-invariant; its base pointer is handled
+// recursively and a copy of the GEP is inserted into `insert_block` at the
+// position given by looputils::GetTerminatorIndex.
+//
+// All indices of a level are validated before its base is materialized, so a
+// nullptr result means no instruction was inserted by this call.
+Value* MaterializePointerAtBegin(Function& function, BasicBlock* insert_block,
+                                 Value* pointer, PhiInst* iv, Value* begin,
+                                 const Loop& loop) {
+  if (looputils::IsLoopInvariantValue(loop, pointer)) {
+    return pointer;
+  }
+  auto* gep = dyncast<GetElementPtrInst>(pointer);
+  if (!gep || !gep->GetParent() || !loop.Contains(gep->GetParent())) {
+    return nullptr;
+  }
+
+  // Validate and remap this level's indices first: failing here must not leave
+  // an already-inserted base GEP behind in `insert_block`.
+  std::vector<Value*> indices;
+  indices.reserve(gep->GetNumIndices());
+  for (std::size_t i = 0; i < gep->GetNumIndices(); ++i) {
+    auto* index = gep->GetIndex(i);
+    if (!IsDirectVectorIndex(index, iv, loop)) {
+      return nullptr;
+    }
+    auto* mapped = RemapIndexAtBegin(index, iv, begin, loop);
+    if (!mapped) {
+      return nullptr;
+    }
+    indices.push_back(mapped);
+  }
+
+  auto* base =
+      MaterializePointerAtBegin(function, insert_block, gep->GetPointer(), iv, begin, loop);
+  if (!base) {
+    return nullptr;
+  }
+
+  return insert_block->Insert<GetElementPtrInst>(
+      looputils::GetTerminatorIndex(insert_block), gep->GetSourceType(), base,
+      indices, nullptr, looputils::NextSyntheticName(function, "vec.ptr."));
+}
+
+// True when the analysed pointer's byte offset is valid, is expressed in terms
+// of `iv`, and its coefficient equals `access_size`.
+bool IsUnitStrideAccess(const loopmem::PointerInfo& ptr, PhiInst* iv, int access_size) {
+  const auto& offset = ptr.byte_offset;
+  if (!offset.valid) {
+    return false;
+  }
+  if (!(offset.var == iv)) {
+    return false;
+  }
+  return offset.coeff == access_size;
+}
+
+// Recognises an innermost counted loop of the form
+//   for (i = begin; i < end; ++i) dst[i] = lhs[i] <op> rhs[i];
+// and, on success, fills `info` with everything VectorizeLoop needs.
+//
+// Requirements checked here:
+//  * canonical body/exit blocks and a preheader ending in an unconditional
+//    branch to the header;
+//  * exactly one header phi, a stride-1 induction variable compared with
+//    ICmpLT against a loop-invariant bound;
+//  * nothing defined in the header is used outside the loop, and no exit phi
+//    has an incoming edge from the header;
+//  * besides the induction step, the body holds only GEPs, exactly two loads,
+//    one binary op and one store of that op's result;
+//  * all three memory accesses advance by one 4-byte element per iteration;
+//  * with constant bounds, at least kMinConstantTripToVectorize iterations.
+// Returns false and leaves `info` untouched when any requirement fails.
+bool MatchVectorLoop(Loop& loop, VectorLoopInfo& info) {
+  if (!loop.preheader || !loop.header || !loop.IsInnermost()) {
+    return false;
+  }
+
+  BasicBlock* body = nullptr;
+  BasicBlock* exit = nullptr;
+  if (!loopmem::GetCanonicalLoopBlocks(loop, body, exit)) {
+    return false;
+  }
+
+  std::vector<PhiInst*> phis;
+  loopmem::SimpleInductionVar iv;
+  bool found_iv = false;
+  for (const auto& inst_ptr : loop.header->GetInstructions()) {
+    auto* phi = dyncast<PhiInst>(inst_ptr.get());
+    if (!phi) {
+      break;
+    }
+    phis.push_back(phi);
+    if (!found_iv && loopmem::MatchSimpleInductionVariable(loop, loop.preheader, phi, iv)) {
+      found_iv = true;
+    }
+  }
+  if (!found_iv || phis.size() != 1 || iv.stride != 1) {
+    return false;
+  }
+
+  // The rewrite disconnects the header from the preheader, so a header value
+  // (the induction phi, the exit compare, ...) that is still referenced from
+  // outside the loop would be left without a reachable definition.
+  for (const auto& inst_ptr : loop.header->GetInstructions()) {
+    for (const auto& use : inst_ptr->GetUses()) {
+      auto* user = dyncast<Instruction>(use.GetUser());
+      if (!user || !user->GetParent() || !loop.Contains(user->GetParent())) {
+        return false;
+      }
+    }
+  }
+
+  auto* branch = dyncast<CondBrInst>(looputils::GetTerminator(loop.header));
+  auto* compare = branch ? dyncast<BinaryInst>(branch->GetCondition()) : nullptr;
+  if (!branch || branch->GetThenBlock() != body || !compare ||
+      compare->GetOpcode() != Opcode::ICmpLT || compare->GetLhs() != iv.phi ||
+      !looputils::IsLoopInvariantValue(loop, compare->GetRhs())) {
+    return false;
+  }
+  auto* preheader_branch = dyncast<UncondBrInst>(loop.preheader->GetInstructions().empty()
+                                                     ? nullptr
+                                                     : loop.preheader->GetInstructions().back().get());
+  if (!preheader_branch || preheader_branch->GetDest() != loop.header ||
+      loop.preheader->GetSuccessors().size() != 1) {
+    return false;
+  }
+
+  auto* step = dyncast<BinaryInst>(iv.latch_value);
+  if (!step || step->GetParent() != body || !StepOnlyFeedsLoopPhi(step, iv.phi) ||
+      HasExitPhiUse(exit, loop.header)) {
+    return false;
+  }
+
+  // Short constant-trip loops are not worth a helper call.
+  if (auto* begin_const = dyncast<ConstantInt>(iv.start)) {
+    if (auto* end_const = dyncast<ConstantInt>(compare->GetRhs())) {
+      if (end_const->GetValue() - begin_const->GetValue() < kMinConstantTripToVectorize) {
+        return false;
+      }
+    }
+  }
+
+  // Census of the body: anything beyond GEPs, two loads, one binary op and one
+  // store (plus the induction step and the terminator) disqualifies the loop.
+  StoreInst* store = nullptr;
+  BinaryInst* binary = nullptr;
+  int load_count = 0;
+
+  for (const auto& inst_ptr : body->GetInstructions()) {
+    auto* inst = inst_ptr.get();
+    if (inst->IsTerminator() || inst == step) {
+      continue;
+    }
+    if (dyncast<GetElementPtrInst>(inst)) {
+      continue;
+    }
+    if (dyncast<LoadInst>(inst)) {
+      if (++load_count > 2) {
+        return false;
+      }
+      continue;
+    }
+    if (auto* bin = dyncast<BinaryInst>(inst)) {
+      if (binary != nullptr) {
+        return false;
+      }
+      binary = bin;
+      continue;
+    }
+    if (auto* st = dyncast<StoreInst>(inst)) {
+      if (store != nullptr) {
+        return false;
+      }
+      store = st;
+      continue;
+    }
+    return false;
+  }
+
+  if (!store || !binary || load_count != 2 || store->GetValue() != binary) {
+    return false;
+  }
+  // The operands of the stored operation decide which load is lhs and rhs.
+  auto* lhs_load = dyncast<LoadInst>(binary->GetLhs());
+  auto* rhs_load = dyncast<LoadInst>(binary->GetRhs());
+  if (!lhs_load || !rhs_load) {
+    return false;
+  }
+
+  VectorKind kind = VectorKind::I32;
+  VectorOp op = VectorOp::Add;
+  if (!MatchVectorOpcode(binary->GetOpcode(), &kind, &op) ||
+      !IsExpectedScalarType(binary, kind) ||
+      !IsExpectedScalarType(lhs_load, kind) ||
+      !IsExpectedScalarType(rhs_load, kind) ||
+      !IsExpectedScalarType(store->GetValue(), kind)) {
+    return false;
+  }
+
+  // Both supported element kinds are four bytes wide.
+  const int access_size = 4;
+  auto store_ptr = loopmem::AnalyzePointer(store->GetPtr(), iv.phi, loop, access_size);
+  auto lhs_ptr = loopmem::AnalyzePointer(lhs_load->GetPtr(), iv.phi, loop, access_size);
+  auto rhs_ptr = loopmem::AnalyzePointer(rhs_load->GetPtr(), iv.phi, loop, access_size);
+  if (!IsUnitStrideAccess(store_ptr, iv.phi, access_size) ||
+      !IsUnitStrideAccess(lhs_ptr, iv.phi, access_size) ||
+      !IsUnitStrideAccess(rhs_ptr, iv.phi, access_size)) {
+    return false;
+  }
+
+  info.loop = &loop;
+  info.preheader = loop.preheader;
+  info.header = loop.header;
+  info.body = body;
+  info.exit = exit;
+  info.iv = iv;
+  info.begin = iv.start;
+  info.end = compare->GetRhs();
+  info.store = store;
+  info.lhs_load = lhs_load;
+  info.rhs_load = rhs_load;
+  info.binary = binary;
+  info.kind = kind;
+  info.op = op;
+  return true;
+}
+
+// Recognises an innermost counted loop of the form
+//   for (i = begin; i < end; ++i) dst[i] = value;
+// with a loop-invariant int32 or float `value`, and fills `info` with what
+// VectorizeFillLoop needs.
+//
+// Requirements checked here:
+//  * canonical body/exit blocks and a preheader ending in an unconditional
+//    branch to the header;
+//  * exactly one header phi, a stride-1 induction variable compared with
+//    ICmpLT against a loop-invariant bound;
+//  * nothing defined in the header is used outside the loop, and no exit phi
+//    has an incoming edge from the header;
+//  * besides the induction step, the body holds only GEPs and one store;
+//  * the store advances by one 4-byte element per iteration;
+//  * with constant bounds, at least kMinConstantTripToVectorize iterations.
+// Returns false and leaves `info` untouched when any requirement fails.
+bool MatchFillLoop(Loop& loop, FillLoopInfo& info) {
+  if (!loop.preheader || !loop.header || !loop.IsInnermost()) {
+    return false;
+  }
+
+  BasicBlock* body = nullptr;
+  BasicBlock* exit = nullptr;
+  if (!loopmem::GetCanonicalLoopBlocks(loop, body, exit)) {
+    return false;
+  }
+
+  std::vector<PhiInst*> phis;
+  loopmem::SimpleInductionVar iv;
+  bool found_iv = false;
+  for (const auto& inst_ptr : loop.header->GetInstructions()) {
+    auto* phi = dyncast<PhiInst>(inst_ptr.get());
+    if (!phi) {
+      break;
+    }
+    phis.push_back(phi);
+    if (!found_iv && loopmem::MatchSimpleInductionVariable(loop, loop.preheader, phi, iv)) {
+      found_iv = true;
+    }
+  }
+  if (!found_iv || phis.size() != 1 || iv.stride != 1) {
+    return false;
+  }
+
+  // The rewrite disconnects the header from the preheader, so a header value
+  // (the induction phi, the exit compare, ...) that is still referenced from
+  // outside the loop would be left without a reachable definition.
+  for (const auto& inst_ptr : loop.header->GetInstructions()) {
+    for (const auto& use : inst_ptr->GetUses()) {
+      auto* user = dyncast<Instruction>(use.GetUser());
+      if (!user || !user->GetParent() || !loop.Contains(user->GetParent())) {
+        return false;
+      }
+    }
+  }
+
+  auto* branch = dyncast<CondBrInst>(looputils::GetTerminator(loop.header));
+  auto* compare = branch ? dyncast<BinaryInst>(branch->GetCondition()) : nullptr;
+  if (!branch || branch->GetThenBlock() != body || !compare ||
+      compare->GetOpcode() != Opcode::ICmpLT || compare->GetLhs() != iv.phi ||
+      !looputils::IsLoopInvariantValue(loop, compare->GetRhs())) {
+    return false;
+  }
+  auto* preheader_branch = dyncast<UncondBrInst>(
+      loop.preheader->GetInstructions().empty()
+          ? nullptr
+          : loop.preheader->GetInstructions().back().get());
+  if (!preheader_branch || preheader_branch->GetDest() != loop.header ||
+      loop.preheader->GetSuccessors().size() != 1) {
+    return false;
+  }
+
+  auto* step = dyncast<BinaryInst>(iv.latch_value);
+  if (!step || step->GetParent() != body || !StepOnlyFeedsLoopPhi(step, iv.phi) ||
+      HasExitPhiUse(exit, loop.header)) {
+    return false;
+  }
+
+  // Short constant-trip loops are not worth a helper call.
+  if (auto* begin_const = dyncast<ConstantInt>(iv.start)) {
+    if (auto* end_const = dyncast<ConstantInt>(compare->GetRhs())) {
+      if (end_const->GetValue() - begin_const->GetValue() < kMinConstantTripToVectorize) {
+        return false;
+      }
+    }
+  }
+
+  // The body may contain nothing but GEPs and a single store (plus the
+  // induction step and the terminator).
+  StoreInst* store = nullptr;
+  for (const auto& inst_ptr : body->GetInstructions()) {
+    auto* inst = inst_ptr.get();
+    if (inst->IsTerminator() || inst == step) {
+      continue;
+    }
+    if (dyncast<GetElementPtrInst>(inst)) {
+      continue;
+    }
+    if (auto* st = dyncast<StoreInst>(inst)) {
+      if (store != nullptr) {
+        return false;
+      }
+      store = st;
+      continue;
+    }
+    return false;
+  }
+
+  if (!store || !store->GetValue() ||
+      !looputils::IsLoopInvariantValue(loop, store->GetValue())) {
+    return false;
+  }
+
+  VectorKind kind = VectorKind::I32;
+  if (store->GetValue()->GetType() && store->GetValue()->GetType()->IsFloat()) {
+    kind = VectorKind::F32;
+  } else if (!store->GetValue()->GetType() || !store->GetValue()->GetType()->IsInt32()) {
+    return false;
+  }
+
+  // Both supported element kinds are four bytes wide.
+  const int access_size = 4;
+  auto store_ptr = loopmem::AnalyzePointer(store->GetPtr(), iv.phi, loop, access_size);
+  if (!IsUnitStrideAccess(store_ptr, iv.phi, access_size)) {
+    return false;
+  }
+
+  info.loop = &loop;
+  info.preheader = loop.preheader;
+  info.header = loop.header;
+  info.body = body;
+  info.exit = exit;
+  info.iv = iv;
+  info.begin = iv.start;
+  info.end = compare->GetRhs();
+  info.store = store;
+  info.fill_value = store->GetValue();
+  info.kind = kind;
+  return true;
+}
+
+// Returns the symbol name of the element-wise helper for `kind` and `op`.
+// Any kind other than I32 selects the f32 family; an unrecognised `op` falls
+// back to the i32 add helper.
+const char* HelperName(VectorKind kind, VectorOp op) {
+  const bool is_int = kind == VectorKind::I32;
+  switch (op) {
+    case VectorOp::Add:
+      return is_int ? "__nudtc_neon_i32_add" : "__nudtc_neon_f32_add";
+    case VectorOp::Sub:
+      return is_int ? "__nudtc_neon_i32_sub" : "__nudtc_neon_f32_sub";
+    case VectorOp::Mul:
+      return is_int ? "__nudtc_neon_i32_mul" : "__nudtc_neon_f32_mul";
+  }
+  return "__nudtc_neon_i32_add";
+}
+
+// Returns the symbol name of the fill helper: the i32 variant for I32, the
+// f32 variant for everything else.
+const char* FillHelperName(VectorKind kind) {
+  if (kind == VectorKind::I32) {
+    return "__nudtc_neon_i32_fill";
+  }
+  return "__nudtc_neon_f32_fill";
+}
+
+// Obtains the element-wise helper for (kind, op) from `module` and returns it.
+// The helper is declared as void(ptr dst, ptr lhs, ptr rhs, i32 n).
+// NOTE(review): the trailing `true` presumably marks the function external,
+// and CreateFunction presumably hands back the existing declaration when the
+// name is already present — confirm both.
+Function* EnsureHelper(Module& module, VectorKind kind, VectorOp op) {
+  auto* helper = module.CreateFunction(
+      HelperName(kind, op), Type::GetVoidType(),
+      {Type::GetPointerType(), Type::GetPointerType(), Type::GetPointerType(),
+       Type::GetInt32Type()},
+      {"%dst", "%lhs", "%rhs", "%n"}, true);
+  // NOTE(review): meaning of the seven positional flags is defined by
+  // Function::SetEffectInfo, which is not visible here — confirm.
+  helper->SetEffectInfo(false, false, true, true, false, false, false);
+  return helper;
+}
+
+// Obtains the fill helper for `kind` from `module` and returns it.
+// The helper is declared as void(ptr dst, <scalar> value, i32 n), where the
+// scalar type is int32 for I32 and float otherwise.
+// NOTE(review): the trailing `true` presumably marks the function external,
+// and CreateFunction presumably hands back the existing declaration when the
+// name is already present — confirm both.
+Function* EnsureFillHelper(Module& module, VectorKind kind) {
+  auto scalar_type =
+      kind == VectorKind::I32 ? Type::GetInt32Type() : Type::GetFloatType();
+  auto* helper = module.CreateFunction(
+      FillHelperName(kind), Type::GetVoidType(),
+      {Type::GetPointerType(), scalar_type, Type::GetInt32Type()},
+      {"%dst", "%value", "%n"}, true);
+  // NOTE(review): meaning of the seven positional flags is defined by
+  // Function::SetEffectInfo, which is not visible here — confirm.
+  helper->SetEffectInfo(false, false, true, true, false, false, false);
+  return helper;
+}
+
+// Replaces a loop matched by MatchVectorLoop with a single helper call.
+//
+// The preheader gets the three first-iteration pointers and the element count
+// `end - begin`, and its branch is redirected to a new block that calls the
+// helper chosen by info.kind / info.op and then jumps to the loop exit. The
+// old header loses the preheader as predecessor (including the matching phi
+// operands) and is left for later cleanup.
+//
+// Returns true when the rewrite happened. On false the control flow is
+// unchanged; pointer GEPs that were already materialized before a later
+// pointer failed may remain in the preheader as dead code.
+bool VectorizeLoop(Module& module, Function& function, VectorLoopInfo& info) {
+  // Check the preheader before touching the IR, so that bailing out here can
+  // never leave a half-built dispatch block behind.
+  auto& preheader_insts = info.preheader->GetInstructions();
+  if (preheader_insts.empty() || !preheader_insts.back()->IsTerminator()) {
+    return false;
+  }
+
+  auto* dst = MaterializePointerAtBegin(function, info.preheader, info.store->GetPtr(),
+                                        info.iv.phi, info.begin, *info.loop);
+  auto* lhs = MaterializePointerAtBegin(function, info.preheader, info.lhs_load->GetPtr(),
+                                        info.iv.phi, info.begin, *info.loop);
+  auto* rhs = MaterializePointerAtBegin(function, info.preheader, info.rhs_load->GetPtr(),
+                                        info.iv.phi, info.begin, *info.loop);
+  if (!dst || !lhs || !rhs) {
+    return false;
+  }
+
+  auto* trip_count = info.preheader->Insert<BinaryInst>(
+      looputils::GetTerminatorIndex(info.preheader), Opcode::Sub, Type::GetInt32Type(),
+      info.end, info.begin, nullptr, looputils::NextSyntheticName(function, "vec.n."));
+
+  auto* vector_block =
+      function.CreateBlock(looputils::NextSyntheticBlockName(function, "vec.dispatch"));
+
+  // Swap the preheader's terminator for a branch to the new block and update
+  // the CFG edges accordingly.
+  preheader_insts.back()->ClearAllOperands();
+  auto branch = std::make_unique<UncondBrInst>(vector_block, nullptr);
+  branch->SetParent(info.preheader);
+  preheader_insts.back() = std::move(branch);
+  info.preheader->RemoveSuccessor(info.header);
+  info.header->RemovePredecessor(info.preheader);
+  info.preheader->AddSuccessor(vector_block);
+  vector_block->AddPredecessor(info.preheader);
+
+  // Drop the preheader's (value, block) operand pair from every header phi.
+  for (const auto& inst_ptr : info.header->GetInstructions()) {
+    auto* phi = dyncast<PhiInst>(inst_ptr.get());
+    if (!phi) {
+      break;
+    }
+    const int incoming = looputils::GetPhiIncomingIndex(phi, info.preheader);
+    if (incoming >= 0) {
+      // Remove the higher operand index first so the lower one stays valid.
+      phi->RemoveOperand(static_cast<std::size_t>(2 * incoming + 1));
+      phi->RemoveOperand(static_cast<std::size_t>(2 * incoming));
+    }
+  }
+
+  auto* helper = EnsureHelper(module, info.kind, info.op);
+  vector_block->Append<CallInst>(helper, std::vector<Value*>{dst, lhs, rhs, trip_count},
+                                 nullptr, "");
+  vector_block->Append<UncondBrInst>(info.exit, nullptr);
+  vector_block->AddSuccessor(info.exit);
+  info.exit->AddPredecessor(vector_block);
+  return true;
+}
+
+// Replaces a loop matched by MatchFillLoop with a single fill-helper call.
+//
+// The preheader gets the first-iteration destination pointer and the element
+// count `end - begin`, and its branch is redirected to a new block that calls
+// the fill helper for info.kind and then jumps to the loop exit. The old
+// header loses the preheader as predecessor (including the matching phi
+// operands) and is left for later cleanup.
+//
+// Returns true when the rewrite happened; on false the control flow is
+// unchanged.
+bool VectorizeFillLoop(Module& module, Function& function, FillLoopInfo& info) {
+  // Check the preheader before touching the IR, so that bailing out here can
+  // never leave a half-built fill block behind.
+  auto& preheader_insts = info.preheader->GetInstructions();
+  if (preheader_insts.empty() || !preheader_insts.back()->IsTerminator()) {
+    return false;
+  }
+
+  auto* dst = MaterializePointerAtBegin(function, info.preheader, info.store->GetPtr(),
+                                        info.iv.phi, info.begin, *info.loop);
+  if (!dst) {
+    return false;
+  }
+
+  auto* trip_count = info.preheader->Insert<BinaryInst>(
+      looputils::GetTerminatorIndex(info.preheader), Opcode::Sub, Type::GetInt32Type(),
+      info.end, info.begin, nullptr, looputils::NextSyntheticName(function, "vec.n."));
+
+  auto* vector_block =
+      function.CreateBlock(looputils::NextSyntheticBlockName(function, "vec.fill"));
+
+  // Swap the preheader's terminator for a branch to the new block and update
+  // the CFG edges accordingly.
+  preheader_insts.back()->ClearAllOperands();
+  auto branch = std::make_unique<UncondBrInst>(vector_block, nullptr);
+  branch->SetParent(info.preheader);
+  preheader_insts.back() = std::move(branch);
+  info.preheader->RemoveSuccessor(info.header);
+  info.header->RemovePredecessor(info.preheader);
+  info.preheader->AddSuccessor(vector_block);
+  vector_block->AddPredecessor(info.preheader);
+
+  // Drop the preheader's (value, block) operand pair from every header phi.
+  for (const auto& inst_ptr : info.header->GetInstructions()) {
+    auto* phi = dyncast<PhiInst>(inst_ptr.get());
+    if (!phi) {
+      break;
+    }
+    const int incoming = looputils::GetPhiIncomingIndex(phi, info.preheader);
+    if (incoming >= 0) {
+      // Remove the higher operand index first so the lower one stays valid.
+      phi->RemoveOperand(static_cast<std::size_t>(2 * incoming + 1));
+      phi->RemoveOperand(static_cast<std::size_t>(2 * incoming));
+    }
+  }
+
+  auto* helper = EnsureFillHelper(module, info.kind);
+  vector_block->Append<CallInst>(helper, std::vector<Value*>{dst, info.fill_value, trip_count},
+                                 nullptr, "");
+  vector_block->Append<UncondBrInst>(info.exit, nullptr);
+  vector_block->AddSuccessor(info.exit);
+  info.exit->AddPredecessor(vector_block);
+  return true;
+}
+
+// Vectorizes at most one loop of `function` and reports whether it did.
+//
+// Loops are visited in post-order. For each loop the arithmetic pattern is
+// tried first, then the fill pattern. The function returns right after the
+// first successful rewrite, because the dominator tree and loop info computed
+// here no longer describe the changed control flow. A loop that matches but
+// cannot be rewritten leaves the control flow as it was, so the scan simply
+// moves on to the remaining loops.
+bool RunLoopVectorizeOnFunction(Module& module, Function& function) {
+  if (function.IsExternal() || !function.GetEntryBlock()) {
+    return false;
+  }
+
+  DominatorTree dom_tree(function);
+  LoopInfo loop_info(function, dom_tree);
+  for (auto* loop : loop_info.GetLoopsInPostOrder()) {
+    VectorLoopInfo info;
+    if (MatchVectorLoop(*loop, info) && VectorizeLoop(module, function, info)) {
+      return true;
+    }
+    FillLoopInfo fill_info;
+    if (MatchFillLoop(*loop, fill_info) &&
+        VectorizeFillLoop(module, function, fill_info)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
+// Entry point of the loop-vectorization pass.
+//
+// Reports "no change" straight away when NUDTC_DISABLE_LOOP_VECTORIZE is set.
+// Otherwise every non-external function is processed by
+// RunLoopVectorizeOnFunction. Returns true when any function was rewritten.
+bool RunLoopVectorize(Module& module) {
+  if (utils::IsEnvFlagSet("NUDTC_DISABLE_LOOP_VECTORIZE")) {
+    return false;
+  }
+
+  // Snapshot the functions first: a successful rewrite calls
+  // module.CreateFunction for its helper, which must not happen while the
+  // module's own function list is being iterated.
+  std::vector<Function*> functions;
+  for (const auto& function : module.GetFunctions()) {
+    if (function && !function->IsExternal()) {
+      functions.push_back(function.get());
+    }
+  }
+
+  bool changed = false;
+  for (auto* function : functions) {
+    changed |= RunLoopVectorizeOnFunction(module, *function);
+  }
+  return changed;
+}
+
+}  // namespace ir
--- a/src/ir/passes/PassManager.cpp
+++ b/src/ir/passes/PassManager.cpp
@ -2,47 +2,143 @@

 #include "ir/PassManager.h"

-#include <cstdlib>
+#include "ir/IR.h"
+#include "utils/OptConfig.h"
+
+#include <cstddef>
+#include <unordered_map>
+#include <utility>
+#include <vector>

 namespace ir {
+namespace {
+
+// Size summary of a module, gathered over its non-external functions and used
+// to scale the optimization pipeline.
+struct PipelineShape {
+  std::size_t instructions = 0;  // total instruction count over all blocks
+  std::size_t blocks = 0;        // total number of basic blocks
+  bool may_have_loop = false;    // true if any function's CFG contains a cycle
+};
+
+// Reports whether the successor graph of `function` contains a cycle.
+//
+// An iterative depth-first search marks blocks as unvisited (0), on the
+// current path (1) or finished (2); reaching a block that is still on the
+// current path means a cycle exists. The search starts at the entry block and
+// is then restarted from every block not visited yet.
+bool FunctionMayHaveLoop(Function& function) {
+  constexpr int kUnvisited = 0;
+  constexpr int kOnPath = 1;
+  constexpr int kFinished = 2;
+
+  std::unordered_map<BasicBlock*, int> marks;
+  std::vector<std::pair<BasicBlock*, std::size_t>> path;
+
+  const auto mark_of = [&marks](BasicBlock* block) {
+    const auto found = marks.find(block);
+    return found == marks.end() ? 0 : found->second;
+  };
+
+  // Runs one DFS from `root`; true as soon as an edge back into the current
+  // path is seen. Roots that are null or already visited are skipped.
+  const auto finds_cycle_from = [&](BasicBlock* root) {
+    if (root == nullptr || mark_of(root) != kUnvisited) {
+      return false;
+    }
+    marks[root] = kOnPath;
+    path.push_back({root, 0});
+
+    while (!path.empty()) {
+      auto* current = path.back().first;
+      const std::size_t cursor = path.back().second;
+      const auto& succs = current->GetSuccessors();
+      if (cursor >= succs.size()) {
+        marks[current] = kFinished;
+        path.pop_back();
+        continue;
+      }
+
+      // Advance the cursor before descending; the push below may reallocate.
+      ++path.back().second;
+      auto* next = succs[cursor];
+      if (next == nullptr) {
+        continue;
+      }
+      const int next_mark = mark_of(next);
+      if (next_mark == kOnPath) {
+        return true;
+      }
+      if (next_mark == kUnvisited) {
+        marks[next] = kOnPath;
+        path.push_back({next, 0});
+      }
+    }
+    return false;
+  };
+
+  if (finds_cycle_from(function.GetEntryBlock())) {
+    return true;
+  }
+  for (const auto& block : function.GetBlocks()) {
+    if (finds_cycle_from(block.get())) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Counts blocks and instructions over all non-external functions of `module`
+// and records whether any of them may contain a loop. The loop check is no
+// longer run once a loop has been found.
+PipelineShape AnalyzePipelineShape(Module& module) {
+  PipelineShape totals;
+  for (const auto& fn : module.GetFunctions()) {
+    if (!fn || fn->IsExternal()) {
+      continue;
+    }
+    if (!totals.may_have_loop && FunctionMayHaveLoop(*fn)) {
+      totals.may_have_loop = true;
+    }
+    for (const auto& bb : fn->GetBlocks()) {
+      if (bb) {
+        totals.blocks += 1;
+        totals.instructions += bb->GetInstructions().size();
+      }
+    }
+  }
+  return totals;
+}
+
+// Runs the IR verifier on `module`, but only when NUDTC_VERIFY_IR is set.
+void VerifyIfRequested(const Module& module) {
+  if (!utils::IsEnvFlagSet("NUDTC_VERIFY_IR")) {
+    return;
+  }
+  VerifyIR(module);
+}
+
+}  // namespace

 void RunIRPassPipeline(Module& module) {
-  const char* disable_mem2reg = std::getenv("NUDTC_DISABLE_MEM2REG");
-  if (disable_mem2reg != nullptr && disable_mem2reg[0] != '\0' && disable_mem2reg[0] != '0') {
+  if (utils::IsEnvFlagSet("NUDTC_DISABLE_MEM2REG")) {
    return;
  }
-  const char* disable_loop_mem_promotion =
-      std::getenv("NUDTC_DISABLE_LOOP_MEM_PROMOTION");
  const bool run_loop_mem_promotion =
-      disable_loop_mem_promotion == nullptr || disable_loop_mem_promotion[0] == '\0' ||
-      disable_loop_mem_promotion[0] == '0';
-  const char* disable_inline_cfg = std::getenv("NUDTC_DISABLE_CFG_INLINE");
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_LOOP_MEM_PROMOTION");
  const bool run_cfg_inline =
-      disable_inline_cfg == nullptr || disable_inline_cfg[0] == '\0' ||
-      disable_inline_cfg[0] == '0';
-  const char* disable_loop_unswitch = std::getenv("NUDTC_DISABLE_LOOP_UNSWITCH");
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_CFG_INLINE");
  const bool run_loop_unswitch =
-      disable_loop_unswitch == nullptr || disable_loop_unswitch[0] == '\0' ||
-      disable_loop_unswitch[0] == '0';
-  const char* disable_tail_recursion =
-      std::getenv("NUDTC_DISABLE_TAIL_RECURSION");
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_LOOP_UNSWITCH");
  const bool run_tail_recursion =
-      disable_tail_recursion == nullptr || disable_tail_recursion[0] == '\0' ||
-      disable_tail_recursion[0] == '0';
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_TAIL_RECURSION");

  RunMem2Reg(module);
+  VerifyIfRequested(module);
  if (run_tail_recursion) {
    RunTailRecursionElim(module);
+    VerifyIfRequested(module);
  }

-  constexpr int kMaxIterations = 8;
-  for (int iteration = 0; iteration < kMaxIterations; ++iteration) {
+  const auto initial_shape = AnalyzePipelineShape(module);
+  const bool disable_size_guard = utils::IsEnvFlagSet("NUDTC_DISABLE_IR_SIZE_GUARD");
+  const bool huge_cfg =
+      !disable_size_guard &&
+      (initial_shape.blocks > 1000 || initial_shape.instructions > 7000);
+  const bool large_cfg =
+      !disable_size_guard &&
+      (huge_cfg || initial_shape.blocks > 300 || initial_shape.instructions > 2500);
+  const int max_iterations = huge_cfg ? 3 : (large_cfg ? 5 : 8);
+
+  for (int iteration = 0; iteration < max_iterations; ++iteration) {
    bool changed = false;
+    const bool run_growth_passes = !large_cfg || iteration < 2;
+    const bool run_loop_passes =
+        initial_shape.may_have_loop && (!large_cfg || iteration < 2);
    if (run_tail_recursion) {
      changed |= RunTailRecursionElim(module);
+      VerifyIfRequested(module);
    }
-    if (run_cfg_inline) {
+    if (run_cfg_inline && run_growth_passes) {
      changed |= RunFunctionInlining(module);
+      VerifyIfRequested(module);
    }
    changed |= RunInterproceduralConstProp(module);
    changed |= RunArithmeticSimplify(module);
@ -54,17 +150,33 @@ void RunIRPassPipeline(Module& module) {
    changed |= RunIfConversion(module);
    changed |= RunDCE(module);
    changed |= RunCFGSimplify(module);
-    changed |= RunLICM(module);
-    if (run_loop_mem_promotion) {
+    VerifyIfRequested(module);
+    if (run_loop_passes) {
+      changed |= RunLICM(module);
+      VerifyIfRequested(module);
+    }
+    if (run_loop_passes && run_loop_mem_promotion) {
      changed |= RunLoopMemoryPromotion(module);
+      VerifyIfRequested(module);
    }
-    if (run_loop_unswitch) {
+    if (run_loop_passes && run_loop_unswitch) {
      changed |= RunLoopUnswitch(module);
+      VerifyIfRequested(module);
+    }
+    if (run_loop_passes) {
+      changed |= RunLoopVectorize(module);
+      VerifyIfRequested(module);
+      changed |= RunLoopParallelize(module);
+      VerifyIfRequested(module);
+      changed |= RunLoopStrengthReduction(module);
+      VerifyIfRequested(module);
+      changed |= RunLoopFission(module);
+      VerifyIfRequested(module);
+      changed |= RunLoopUnroll(module);
+      VerifyIfRequested(module);
+      changed |= RunLoopRepeatReduction(module);
+      VerifyIfRequested(module);
    }
-    changed |= RunLoopStrengthReduction(module);
-    changed |= RunLoopFission(module);
-    changed |= RunLoopUnroll(module);
-    changed |= RunLoopRepeatReduction(module);
    changed |= RunArithmeticSimplify(module);
    changed |= RunConstProp(module);
    changed |= RunConstFold(module);
@ -74,6 +186,7 @@ void RunIRPassPipeline(Module& module) {
    changed |= RunIfConversion(module);
    changed |= RunDCE(module);
    changed |= RunCFGSimplify(module);
+    VerifyIfRequested(module);
    if (!changed) {
      break;
    }
--- a/src/main.cpp
+++ b/src/main.cpp
@ -7,8 +7,10 @@
 #if !COMPILER_PARSE_ONLY
 #include "ir/IR.h"
 #include "ir/PassManager.h"
-#include "irgen/IRGen.h"
-#include "mir/MIR.h"
+#include "irgen/IRGen.h"
+#include "mir/CodeGen.h"
+#include "mir/MIR.h"
+#include "mir/Passes.h"
 #include "sem/Sema.h"
 #endif
 #include "utils/CLI.h"
@ -23,7 +25,7 @@ int main(int argc, char** argv) {
    }

    auto antlr = ParseFileWithAntlr(opts.input);
-    bool need_blank_line = false;
+    [[maybe_unused]] bool need_blank_line = false;
    if (opts.emit_parse_tree) {
      PrintSyntaxTree(antlr.tree, antlr.parser.get(), std::cout);
      need_blank_line = true;
--- a/src/mir/AddressHoisting.cpp
+++ b/src/mir/AddressHoisting.cpp
@ -1,140 +0,0 @@
-#include "mir/MIR.h"
-
-#include <cstdint>
-#include <unordered_map>
-#include <vector>
-
-namespace mir {
-namespace {
-
-bool IsHoistCandidate(const MachineFunction& function, int object_index, int use_count) {
-  const auto& object = function.GetStackObject(object_index);
-  if (object.kind != StackObjectKind::Local) {
-    return false;
-  }
-  if (use_count < 2) {
-    return false;
-  }
-  if (object.size >= 4096) {
-    return true;
-  }
-  return object.size >= 256 && use_count >= 4;
-}
-
-bool IsPlainFrameLea(const MachineInstr& inst, int object_index) {
-  if (inst.GetOpcode() != MachineInstr::Opcode::Lea || !inst.HasAddress() ||
-      inst.GetOperands().empty() || inst.GetOperands()[0].GetKind() != OperandKind::VReg) {
-    return false;
-  }
-  const auto& address = inst.GetAddress();
-  return address.base_kind == AddrBaseKind::FrameObject &&
-         address.base_index == object_index && address.const_offset == 0 &&
-         address.scaled_vregs.empty();
-}
-
-std::size_t FindEntryInsertPos(const MachineBasicBlock& block) {
-  const auto& instructions = block.GetInstructions();
-  std::size_t pos = 0;
-  while (pos < instructions.size() &&
-         instructions[pos].GetOpcode() == MachineInstr::Opcode::Arg) {
-    ++pos;
-  }
-  return pos;
-}
-
-}  // namespace
-
-void RunAddressHoisting(MachineModule& module) {
-  for (auto& function : module.GetFunctions()) {
-    if (!function || function->GetBlocks().empty()) {
-      continue;
-    }
-
-    std::unordered_map<int, int> use_counts;
-    for (auto& block : function->GetBlocks()) {
-      for (auto& inst : block->GetInstructions()) {
-        if (!inst.HasAddress()) {
-          continue;
-        }
-        const auto& address = inst.GetAddress();
-        if (address.base_kind == AddrBaseKind::FrameObject && address.base_index >= 0) {
-          ++use_counts[address.base_index];
-        }
-      }
-    }
-
-    std::unordered_map<int, int> base_vregs;
-    for (const auto& [object_index, count] : use_counts) {
-      if (!IsHoistCandidate(*function, object_index, count)) {
-        continue;
-      }
-      base_vregs.emplace(object_index, -1);
-    }
-    if (base_vregs.empty()) {
-      continue;
-    }
-
-    for (auto& block : function->GetBlocks()) {
-      for (auto& inst : block->GetInstructions()) {
-        if (!inst.HasAddress()) {
-          continue;
-        }
-        const auto& address = inst.GetAddress();
-        auto it = base_vregs.find(address.base_index);
-        if (it == base_vregs.end()) {
-          continue;
-        }
-        if (it->second >= 0) {
-          continue;
-        }
-        if (IsPlainFrameLea(inst, address.base_index)) {
-          it->second = inst.GetOperands()[0].GetVReg();
-        }
-      }
-    }
-
-    auto& entry_block = *function->GetBlocks().front();
-    auto& entry_insts = entry_block.GetInstructions();
-    std::size_t insert_pos = FindEntryInsertPos(entry_block);
-
-    for (auto& [object_index, base_vreg] : base_vregs) {
-      if (base_vreg >= 0) {
-        continue;
-      }
-      base_vreg = function->NewVReg(ValueType::Ptr);
-      MachineInstr lea(MachineInstr::Opcode::Lea, {MachineOperand::VReg(base_vreg)});
-      AddressExpr address;
-      address.base_kind = AddrBaseKind::FrameObject;
-      address.base_index = object_index;
-      lea.SetAddress(std::move(address));
-      entry_insts.insert(entry_insts.begin() + static_cast<std::ptrdiff_t>(insert_pos),
-                         std::move(lea));
-      ++insert_pos;
-    }
-
-    for (auto& block : function->GetBlocks()) {
-      for (auto& inst : block->GetInstructions()) {
-        if (!inst.HasAddress()) {
-          continue;
-        }
-        auto& address = inst.GetAddress();
-        auto it = base_vregs.find(address.base_index);
-        if (it == base_vregs.end()) {
-          continue;
-        }
-        if (IsPlainFrameLea(inst, address.base_index) &&
-            inst.GetOperands()[0].GetKind() == OperandKind::VReg &&
-            inst.GetOperands()[0].GetVReg() == it->second) {
-          continue;
-        }
-        if (address.base_kind != AddrBaseKind::FrameObject || address.base_index < 0) {
-          continue;
-        }
-        address.base_kind = AddrBaseKind::VReg;
-        address.base_index = it->second;
-      }
-    }
-  }
-}
-
-}  // namespace mir
--- a/src/mir/AsmPrinter.cpp
+++ b/src/mir/AsmPrinter.cpp
--- a/src/mir/AsmPrinterSupport.cpp
+++ b/src/mir/AsmPrinterSupport.cpp
@ -0,0 +1,368 @@
+#include "AsmPrinterSupport.h"
+
+#include <algorithm>
+#include <cstring>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+
+#include "utils/Log.h"
+#include "utils/OptConfig.h"
+
+namespace mir {
+
+// Rounds a non-negative `value` up to the next multiple of `align`.
+// An `align` of 1 or less leaves `value` unchanged.
+int AlignTo(int value, int align) {
+  const bool needs_rounding = align > 1;
+  if (!needs_rounding) {
+    return value;
+  }
+  const int bumped = value + align - 1;
+  return (bumped / align) * align;
+}
+
+// True when `value` is positive and has exactly one bit set.
+bool IsPowerOfTwo(std::int64_t value) {
+  if (value <= 0) {
+    return false;
+  }
+  const std::int64_t without_lowest_bit = value & (value - 1);
+  return without_lowest_bit == 0;
+}
+
+// Returns floor(log2(value)) for value >= 1; values of 1 or less yield 0.
+int Log2(std::int64_t value) {
+  int exponent = 0;
+  for (std::int64_t rest = value; rest > 1; rest >>= 1) {
+    ++exponent;
+  }
+  return exponent;
+}
+
+// Counts the set bits in `value`.
+int CountBits64(std::uint64_t value) {
+  int ones = 0;
+  for (std::uint64_t rest = value; rest != 0; rest >>= 1) {
+    ones += static_cast<int>(rest & 1u);
+  }
+  return ones;
+}
+
+// Returns the indices (0 = least significant) of every set bit in `value`,
+// in ascending order. All 64 bit positions are scanned, so the result size
+// always equals CountBits64(value).
+std::vector<int> SetBitPositions(std::uint64_t value) {
+  std::vector<int> positions;
+  for (int bit = 0; bit < 64; ++bit) {
+    if ((value & (1ull << bit)) != 0) {
+      positions.push_back(bit);
+    }
+  }
+  return positions;
+}
+
+// Derives the (multiplier, shift) pair returned in SignedDivMagic for `divisor`.
+// NOTE(review): the 2^31-based refinement loop looks like the textbook signed
+// 32-bit "magic number" derivation for division by a constant — confirm
+// against the consumer of the result.
+// `divisor` must be non-zero: `t % abs_divisor` and `two31 / abs_divisor`
+// divide by its magnitude.
+// NOTE(review): presumably callers only pass |divisor| >= 2 that fits in
+// 32 bits (only the low 32 bits feed `divisor_bits`) — confirm.
+SignedDivMagic ComputeSignedDivMagic(std::int64_t divisor) {
+  const std::uint64_t two31 = 1ull << 31;
+  const std::uint64_t abs_divisor =
+      divisor < 0 ? static_cast<std::uint64_t>(-divisor)
+                  : static_cast<std::uint64_t>(divisor);
+  // Low 32 bits of the divisor; bit 31 is its sign when it fits in int32.
+  const std::uint64_t divisor_bits = static_cast<std::uint32_t>(divisor);
+  const std::uint64_t t = two31 + (divisor_bits >> 31);
+  const std::uint64_t anc = t - 1 - (t % abs_divisor);
+
+  // q1/r1 track 2^p divided by anc, q2/r2 track 2^p divided by |divisor|.
+  int p = 31;
+  std::uint64_t q1 = two31 / anc;
+  std::uint64_t r1 = two31 - q1 * anc;
+  std::uint64_t q2 = two31 / abs_divisor;
+  std::uint64_t r2 = two31 - q2 * abs_divisor;
+
+  // Double both quotient/remainder pairs until q1 exceeds the remaining
+  // distance `delta` (or equals it with a non-zero r1).
+  while (true) {
+    ++p;
+    q1 <<= 1;
+    r1 <<= 1;
+    if (r1 >= anc) {
+      ++q1;
+      r1 -= anc;
+    }
+    q2 <<= 1;
+    r2 <<= 1;
+    if (r2 >= abs_divisor) {
+      ++q2;
+      r2 -= abs_divisor;
+    }
+    const std::uint64_t delta = abs_divisor - r2;
+    if (q1 > delta || (q1 == delta && r1 != 0)) {
+      break;
+    }
+  }
+
+  std::int64_t multiplier = static_cast<std::int64_t>(q2 + 1);
+  if (divisor < 0) {
+    multiplier = -multiplier;
+  }
+  // Keep only the low 32 bits and sign-extend them back to 64 bits.
+  multiplier = static_cast<std::int32_t>(static_cast<std::uint32_t>(multiplier));
+  return {multiplier, p - 32};
+}
+
+// Returns floor((2^64 - 1) / divisor), with `divisor` converted to unsigned.
+// `divisor` must be non-zero.
+std::uint64_t ComputeU64ModuloMagic(std::int64_t divisor) {
+  constexpr std::uint64_t kAllOnes = ~std::uint64_t{0};
+  return kAllOnes / static_cast<std::uint64_t>(divisor);
+}
+
+// Maps an index in [0, 31] to the register name "d<index>".
+// Throws std::runtime_error when `index` is outside that range.
+const char* GetDRegName(int index) {
+  constexpr int kRegisterCount = 32;
+  static const char* kDNames[kRegisterCount] = {
+      "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
+      "d8",  "d9",  "d10", "d11", "d12", "d13", "d14", "d15",
+      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+      "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"};
+  const bool in_range = index >= 0 && index < kRegisterCount;
+  if (!in_range) {
+    throw std::runtime_error("float register index out of range");
+  }
+  return kDNames[index];
+}
+
+// Returns the smallest exponent e such that (1 << e) >= align; 0 when
+// `align` is 1 or less.
+int ToAsmAlign(int align) {
+  int exponent = 0;
+  for (int power = 1; power < align; power <<= 1) {
+    ++exponent;
+  }
+  return exponent;
+}
+
+// Copies the object representation of `value` into a 32-bit unsigned integer.
+std::uint32_t FloatBits(float value) {
+  std::uint32_t pattern = 0;
+  std::memcpy(&pattern, &value, sizeof(pattern));
+  return pattern;
+}
+
+namespace {
+
+// True when `reg` is non-null and its name starts with 'w'.
+bool Is32BitRegName(const char* reg) {
+  if (reg == nullptr) {
+    return false;
+  }
+  return *reg == 'w';
+}
+
+// True when `value` lies in [0, 4095].
+bool IsAddSubImm12(std::int64_t value) {
+  return !(value < 0 || value > 4095);
+}
+
+// True when `value` is a non-negative multiple of 4096 whose quotient by
+// 4096 is at most 4095.
+bool IsAddSubImm12Shifted(std::int64_t value) {
+  if (value < 0 || (value & 0xfffll) != 0) {
+    return false;
+  }
+  return (value >> 12) <= 4095;
+}
+
+// True when the set bits of `value` form one contiguous run; all-zero and
+// all-one patterns are rejected.
+bool IsShiftedContiguousMask32(std::uint32_t value) {
+  if (value == 0 || value == 0xffffffffu) {
+    return false;
+  }
+  // Adding the lowest set bit carries through a contiguous run and clears it;
+  // any bit of `value` that survives belongs to a second, separate run.
+  const std::uint32_t lowest_bit = value & (~value + 1u);
+  const std::uint32_t carried = value + lowest_bit;
+  return (carried & value) == 0;
+}
+
+}  // namespace
+
+// True when `value` passes either IsAddSubImm12 or IsAddSubImm12Shifted.
+bool IsAddSubImm(std::int64_t value) {
+  if (IsAddSubImm12(value)) {
+    return true;
+  }
+  return IsAddSubImm12Shifted(value);
+}
+
+// True when `value` fits in an unsigned 32-bit range and its set bits form a
+// single contiguous run (see IsShiftedContiguousMask32).
+bool IsLogicalImm32(std::int64_t value) {
+  const bool fits_u32 = value >= 0 && value <= 0xffffffffll;
+  return fits_u32 && IsShiftedContiguousMask32(static_cast<std::uint32_t>(value));
+}
+
+// Reports the result of the NUDTC_DISABLE_ASM_IMM_LOWERING environment-flag
+// query performed by utils::IsEnabledUnlessEnvFlag.
+bool AsmImmLoweringEnabled() {
+  const bool enabled =
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_ASM_IMM_LOWERING");
+  return enabled;
+}
+
+// Writes "<opcode> <dst>, <src>, #<imm>" to `os`, using the ", lsl #12" form
+// when `value` does not fit the unshifted 12-bit range.
+// Throws std::runtime_error when `value` fails IsAddSubImm.
+void EmitAddSubImm(std::ostream& os, const char* opcode, const char* dst,
+                   const char* src, std::int64_t value) {
+  if (!IsAddSubImm(value)) {
+    throw std::runtime_error(FormatError("mir", "invalid add/sub immediate"));
+  }
+  const bool fits_unshifted = IsAddSubImm12(value);
+  os << "  " << opcode << " " << dst << ", " << src << ", #";
+  if (fits_unshifted) {
+    os << value << "\n";
+  } else {
+    os << (value >> 12) << ", lsl #12\n";
+  }
+}
+
+// Emits instructions that set `dst` to `src + value`.
+//
+// A zero `value` emits a single "mov" (or nothing when `dst` and `src` name
+// the same register). Otherwise the magnitude is split into "add"/"sub"
+// chunks: shifted chunks of up to 4095 units of 4096 each, followed by one
+// unshifted chunk for the remainder below 4096. The first chunk reads `src`;
+// later chunks accumulate into `dst`.
+void EmitAdjustRegByImm(std::ostream& os, const char* dst, const char* src,
+                        std::int64_t value) {
+  if (value == 0) {
+    if (std::strcmp(dst, src) != 0) {
+      os << "  mov " << dst << ", " << src << "\n";
+    }
+    return;
+  }
+
+  const char* opcode = value >= 0 ? "add" : "sub";
+  // Negate in unsigned arithmetic: `-value` would overflow for INT64_MIN.
+  std::uint64_t remaining =
+      value >= 0 ? static_cast<std::uint64_t>(value)
+                 : std::uint64_t{0} - static_cast<std::uint64_t>(value);
+  bool first = true;
+  auto emit_chunk = [&](std::uint64_t amount, bool shifted) {
+    const char* current_src = first ? src : dst;
+    os << "  " << opcode << " " << dst << ", " << current_src << ", #" << amount;
+    if (shifted) {
+      os << ", lsl #12";
+    }
+    os << "\n";
+    first = false;
+  };
+
+  while (remaining >= 4096) {
+    const std::uint64_t units = std::min<std::uint64_t>(remaining >> 12, 4095);
+    emit_chunk(units, true);
+    remaining -= units << 12;
+  }
+  if (remaining > 0) {
+    emit_chunk(remaining, false);
+  }
+}
+
+// Materializes `value` into `reg` with a movz/movk sequence.
+//
+// A register whose name starts with 'w' is treated as 32-bit and only the
+// low 32 bits of `value` are written; any other register uses all 64 bits.
+// Zero 16-bit chunks are skipped, so the first non-zero chunk is written with
+// a (possibly shifted) movz and the remaining non-zero chunks with movk.
+// Throws std::runtime_error when `reg` is null or empty.
+void EmitMoveImm(std::ostream& os, const char* reg, std::int64_t value) {
+  if (reg == nullptr || reg[0] == '\0') {
+    throw std::runtime_error(FormatError("mir", "invalid register for immediate materialization"));
+  }
+
+  if (value == 0) {
+    os << "  mov " << reg << ", #0\n";
+    return;
+  }
+
+  const bool is32 = Is32BitRegName(reg);
+  const std::uint64_t bits =
+      is32 ? static_cast<std::uint64_t>(static_cast<std::uint32_t>(value))
+           : static_cast<std::uint64_t>(value);
+  const int top_shift = is32 ? 16 : 48;
+
+  bool emitted = false;
+  for (int shift = 0; shift <= top_shift; shift += 16) {
+    const std::uint64_t chunk = (bits >> shift) & 0xffffull;
+    if (chunk == 0) {
+      // movz already zeroes every other chunk, so nothing to write here.
+      continue;
+    }
+    os << "  " << (emitted ? "movk" : "movz") << " " << reg << ", #" << chunk;
+    if (shift != 0) {
+      os << ", lsl #" << shift;
+    }
+    os << "\n";
+    emitted = true;
+  }
+  if (!emitted) {
+    // Non-zero `value` whose low 32 bits are all zero, written to a w register.
+    os << "  movz " << reg << ", #0\n";
+  }
+}
+
+// Emits a register-to-register move ("fmov" when `is_float`, otherwise
+// "mov"); nothing is written when both names are identical.
+void EmitCopy(std::ostream& os, const char* dst, const char* src, bool is_float) {
+  const bool same_register = std::strcmp(dst, src) == 0;
+  if (same_register) {
+    return;
+  }
+  const char* mnemonic = is_float ? "fmov" : "mov";
+  os << "  " << mnemonic << " " << dst << ", " << src << "\n";
+}
+
+// Returns log2 of the byte size of `type` for sizes 4, 8 and 16, and 0 for
+// every other size.
+int GetAddressShift(ValueType type) {
+  const int bytes = GetValueSize(type);
+  if (bytes == 4) {
+    return 2;
+  }
+  if (bytes == 8) {
+    return 3;
+  }
+  if (bytes == 16) {
+    return 4;
+  }
+  return 0;
+}
+
+// Tries to emit a single load/store of `value_reg` at `base_reg + offset`.
+//
+// Forms are tried in this order: no offset, a non-negative offset that is a
+// multiple of the value size with quotient at most 4095 (ldr/str), then an
+// offset in [-256, 255] (ldur/stur). Returns false, writing nothing, when no
+// form applies.
+bool TryEmitBaseOffsetAccess(ValueType type, const char* value_reg, const char* base_reg,
+                             std::int64_t offset, bool is_store, std::ostream& os) {
+  const char* scaled_op = is_store ? "str" : "ldr";
+  if (offset == 0) {
+    os << "  " << scaled_op << " " << value_reg << ", [" << base_reg << "]\n";
+    return true;
+  }
+
+  const int size = GetValueSize(type);
+  const bool fits_scaled =
+      offset >= 0 && size > 0 && offset % size == 0 && offset / size <= 4095;
+  if (fits_scaled) {
+    os << "  " << scaled_op << " " << value_reg << ", [" << base_reg << ", #" << offset
+       << "]\n";
+    return true;
+  }
+
+  const bool fits_unscaled = offset >= -256 && offset <= 255;
+  if (!fits_unscaled) {
+    return false;
+  }
+  const char* unscaled_op = is_store ? "stur" : "ldur";
+  os << "  " << unscaled_op << " " << value_reg << ", [" << base_reg
+     << ", #" << offset << "]\n";
+  return true;
+}
+
+// Emits "ldr <dst>, [<addr_reg>]" for every non-void value type; Void emits
+// nothing.
+void EmitLoadFromAddr(ValueType type, const char* dst, const char* addr_reg,
+                      std::ostream& os) {
+  switch (type) {
+    case ValueType::I1:
+    case ValueType::I32:
+    case ValueType::F32:
+    case ValueType::Ptr:
+    case ValueType::I32x4:
+    case ValueType::F32x4:
+      os << "  ldr " << dst << ", [" << addr_reg << "]\n";
+      return;
+    case ValueType::Void:
+      return;
+  }
+}
+
+// Emits "str <src>, [<addr_reg>]" for every non-void value type; Void emits
+// nothing.
+void EmitStoreToAddr(ValueType type, const char* src, const char* addr_reg,
+                     std::ostream& os) {
+  switch (type) {
+    case ValueType::I1:
+    case ValueType::I32:
+    case ValueType::F32:
+    case ValueType::Ptr:
+    case ValueType::I32x4:
+    case ValueType::F32x4:
+      os << "  str " << src << ", [" << addr_reg << "]\n";
+      return;
+    case ValueType::Void:
+      return;
+  }
+}
+
+}  // namespace mir
--- a/src/mir/AsmPrinterSupport.h
+++ b/src/mir/AsmPrinterSupport.h
@ -0,0 +1,45 @@
+#pragma once
+
+#include "mir/MIR.h"
+
+#include <cstdint>
+#include <iosfwd>
+#include <vector>
+
+namespace mir {
+
+// Result of ComputeSignedDivMagic: a multiplier and a shift amount.
+struct SignedDivMagic {
+  // Multiplier, sign-extended from its low 32 bits by ComputeSignedDivMagic.
+  std::int64_t multiplier = 0;
+  // Shift amount; ComputeSignedDivMagic stores its loop exponent minus 32 here.
+  int shift = 0;
+};
+
+// --- Integer and bit helpers ---
+// Rounds `value` up to a multiple of `align` (no-op when align <= 1).
+int AlignTo(int value, int align);
+// True when `value` is positive with exactly one bit set.
+bool IsPowerOfTwo(std::int64_t value);
+// floor(log2(value)) for value >= 1, otherwise 0.
+int Log2(std::int64_t value);
+// Number of set bits in `value`.
+int CountBits64(std::uint64_t value);
+// Ascending indices of the set bits of `value`.
+std::vector<int> SetBitPositions(std::uint64_t value);
+// Multiplier/shift pair for `divisor`; `divisor` must be non-zero.
+SignedDivMagic ComputeSignedDivMagic(std::int64_t divisor);
+// floor((2^64 - 1) / divisor); `divisor` must be non-zero.
+std::uint64_t ComputeU64ModuloMagic(std::int64_t divisor);
+// Name "d<index>" for index in [0, 31]; throws std::runtime_error otherwise.
+const char* GetDRegName(int index);
+// Smallest exponent e with (1 << e) >= align.
+int ToAsmAlign(int align);
+// Raw 32-bit pattern of `value`.
+std::uint32_t FloatBits(float value);
+
+// --- Immediate classification and emission ---
+// True for [0, 4095] or a multiple of 4096 up to 4095 * 4096.
+bool IsAddSubImm(std::int64_t value);
+// True for a 32-bit value whose set bits form one contiguous run.
+bool IsLogicalImm32(std::int64_t value);
+// Result of the NUDTC_DISABLE_ASM_IMM_LOWERING environment-flag query.
+bool AsmImmLoweringEnabled();
+// Emits one add/sub-style instruction; throws if `value` fails IsAddSubImm.
+void EmitAddSubImm(std::ostream& os, const char* opcode, const char* dst,
+                   const char* src, std::int64_t value);
+// Emits add/sub chunks (or a mov) so that dst = src + value.
+void EmitAdjustRegByImm(std::ostream& os, const char* dst, const char* src,
+                        std::int64_t value);
+// Materializes `value` into `reg` via movz/movk.
+void EmitMoveImm(std::ostream& os, const char* reg, std::int64_t value);
+// Emits mov/fmov unless `dst` and `src` are the same name.
+void EmitCopy(std::ostream& os, const char* dst, const char* src, bool is_float);
+// Shift amount matching the byte size of `type` (2, 3, 4, or 0).
+int GetAddressShift(ValueType type);
+// Emits one ldr/str/ldur/stur when `offset` is encodable; returns false otherwise.
+bool TryEmitBaseOffsetAccess(ValueType type, const char* value_reg,
+                             const char* base_reg, std::int64_t offset,
+                             bool is_store, std::ostream& os);
+// Emits "ldr dst, [addr_reg]" for non-void types.
+void EmitLoadFromAddr(ValueType type, const char* dst, const char* addr_reg,
+                      std::ostream& os);
+// Emits "str src, [addr_reg]" for non-void types.
+void EmitStoreToAddr(ValueType type, const char* src, const char* addr_reg,
+                     std::ostream& os);
+
+}  // namespace mir
--- a/src/mir/CMakeLists.txt
+++ b/src/mir/CMakeLists.txt
@ -2,14 +2,14 @@ add_library(mir_core STATIC
  MIRContext.cpp
  MIRFunction.cpp
  MIRBasicBlock.cpp
-  MIRInstr.cpp
-  Register.cpp
-  Lowering.cpp
-  AddressHoisting.cpp
-  RegAlloc.cpp
-  FrameLowering.cpp
-  AsmPrinter.cpp
-)
+  MIRInstr.cpp
+  Register.cpp
+  Lowering.cpp
+  RegAlloc.cpp
+  FrameLowering.cpp
+  AsmPrinterSupport.cpp
+  AsmPrinter.cpp
+)

 target_link_libraries(mir_core PUBLIC
  build_options
--- a/src/mir/FrameLowering.cpp
+++ b/src/mir/FrameLowering.cpp
@ -1,4 +1,4 @@
-#include "mir/MIR.h"
+#include "mir/CodeGen.h"

 #include <string>

--- a/src/mir/Lowering.cpp
+++ b/src/mir/Lowering.cpp
@ -1,4 +1,4 @@
-#include "mir/MIR.h"
+#include "mir/CodeGen.h"

 #include <algorithm>
 #include <cstring>
@ -303,8 +303,8 @@ CondCode LowerIntCond(ir::Opcode opcode) {
  }
 }

-CondCode LowerFloatCond(ir::Opcode opcode) {
-  switch (opcode) {
+CondCode LowerFloatCond(ir::Opcode opcode) {
+  switch (opcode) {
    case ir::Opcode::FCmpEQ:
      return CondCode::EQ;
    case ir::Opcode::FCmpNE:
@ -319,12 +319,168 @@ CondCode LowerFloatCond(ir::Opcode opcode) {
      return CondCode::GE;
    default:
      throw std::runtime_error(FormatError("mir", "invalid float compare opcode"));
-  }
-}
-
-std::int64_t FloatBits(float value) {
-  std::uint32_t bits = 0;
-  std::memcpy(&bits, &value, sizeof(bits));
+  }
+}
+
+// True for the six integer comparison opcodes (ICmpEQ/NE/LT/GT/LE/GE).
+bool IsIntegerCompareOpcode(ir::Opcode opcode) {
+  const bool is_equality =
+      opcode == ir::Opcode::ICmpEQ || opcode == ir::Opcode::ICmpNE;
+  const bool is_strict =
+      opcode == ir::Opcode::ICmpLT || opcode == ir::Opcode::ICmpGT;
+  const bool is_inclusive =
+      opcode == ir::Opcode::ICmpLE || opcode == ir::Opcode::ICmpGE;
+  return is_equality || is_strict || is_inclusive;
+}
+
+// True for the six float comparison opcodes (FCmpEQ/NE/LT/GT/LE/GE).
+bool IsFloatCompareOpcode(ir::Opcode opcode) {
+  const bool is_equality =
+      opcode == ir::Opcode::FCmpEQ || opcode == ir::Opcode::FCmpNE;
+  const bool is_strict =
+      opcode == ir::Opcode::FCmpLT || opcode == ir::Opcode::FCmpGT;
+  const bool is_inclusive =
+      opcode == ir::Opcode::FCmpLE || opcode == ir::Opcode::FCmpGE;
+  return is_equality || is_strict || is_inclusive;
+}
+
+// True when `value` is an ir::Argument and `function` holds that very
+// argument object at the argument's own index.
+bool IsArgumentOf(const ir::Function& function, ir::Value* value) {
+  auto* candidate = ir::dyncast<ir::Argument>(value);
+  if (candidate == nullptr) {
+    return false;
+  }
+  if (candidate->GetIndex() >= function.GetArguments().size()) {
+    return false;
+  }
+  return function.GetArgument(candidate->GetIndex()) == candidate;
+}
+
+// True when `value` is an int32-typed ConstantInt or an int32-typed argument
+// of `function`.
+bool IsMappedI32Value(const ir::Function& function, ir::Value* value) {
+  const bool is_int32 =
+      value != nullptr && value->GetType() != nullptr && value->GetType()->IsInt32();
+  if (!is_int32) {
+    return false;
+  }
+  if (ir::dyncast<ir::ConstantInt>(value) != nullptr) {
+    return true;
+  }
+  return IsArgumentOf(function, value);
+}
+
+// True when `value` is a float-typed ConstantFloat or a float-typed argument
+// of `function`.
+bool IsMappedF32Value(const ir::Function& function, ir::Value* value) {
+  const bool is_float =
+      value != nullptr && value->GetType() != nullptr && value->GetType()->IsFloat();
+  if (!is_float) {
+    return false;
+  }
+  if (ir::dyncast<ir::ConstantFloat>(value) != nullptr) {
+    return true;
+  }
+  return IsArgumentOf(function, value);
+}
+
+// Matches a block whose only instruction is a value-returning ReturnInst.
+// On success the returned value is written to `*value` when `value` is
+// non-null.
+bool MatchSingleReturnValue(ir::BasicBlock* block, ir::Value** value) {
+  if (block == nullptr) {
+    return false;
+  }
+  if (block->GetInstructions().size() != 1) {
+    return false;
+  }
+  auto* only_ret = ir::dyncast<ir::ReturnInst>(block->GetInstructions().front().get());
+  const bool returns_value = only_ret != nullptr && only_ret->HasReturnValue();
+  if (!returns_value) {
+    return false;
+  }
+  if (value != nullptr) {
+    *value = only_ret->GetReturnValue();
+  }
+  return true;
+}
+
+// Pieces of a callee matched by MatchTwoWayIntSelectFunction: the comparison
+// (condition code plus both operands) and the value returned on each branch.
+struct IntSelectCallShape {
+  CondCode cond{CondCode::EQ};
+  // Operands of the entry-block comparison.
+  ir::Value* cmp_lhs{nullptr};
+  ir::Value* cmp_rhs{nullptr};
+  // Values returned by the then-block and the else-block respectively.
+  ir::Value* true_value{nullptr};
+  ir::Value* false_value{nullptr};
+};
+
+// Pieces of a callee matched by MatchTwoWayFloatSelectFunction: the
+// comparison (condition code plus both operands) and the value returned on
+// each branch.
+struct FloatSelectCallShape {
+  CondCode cond{CondCode::EQ};
+  // Operands of the entry-block comparison.
+  ir::Value* cmp_lhs{nullptr};
+  ir::Value* cmp_rhs{nullptr};
+  // Values returned by the then-block and the else-block respectively.
+  ir::Value* true_value{nullptr};
+  ir::Value* false_value{nullptr};
+};
+
+// Recognizes a non-external, non-recursive, int32-returning function made of
+// exactly three blocks: an entry block holding one integer compare and a
+// conditional branch on it, plus two blocks that each only return a value.
+// Compare operands and returned values must each be an int32 constant or an
+// argument of `function`. On a match `*shape` is filled in and true is
+// returned; otherwise `*shape` is left untouched.
+bool MatchTwoWayIntSelectFunction(const ir::Function& function,
+                                  IntSelectCallShape* shape) {
+  if (shape == nullptr || function.IsExternal()) {
+    return false;
+  }
+  if (function.GetReturnType() == nullptr || !function.GetReturnType()->IsInt32()) {
+    return false;
+  }
+  if (function.GetBlocks().size() != 3 || function.IsRecursive()) {
+    return false;
+  }
+
+  auto* entry = function.GetEntryBlock();
+  if (entry == nullptr || entry->GetInstructions().size() != 2) {
+    return false;
+  }
+  auto* compare = ir::dyncast<ir::BinaryInst>(entry->GetInstructions()[0].get());
+  auto* branch = ir::dyncast<ir::CondBrInst>(entry->GetInstructions()[1].get());
+  if (compare == nullptr || branch == nullptr || branch->GetCondition() != compare) {
+    return false;
+  }
+  if (!IsIntegerCompareOpcode(compare->GetOpcode())) {
+    return false;
+  }
+  if (!IsMappedI32Value(function, compare->GetLhs()) ||
+      !IsMappedI32Value(function, compare->GetRhs())) {
+    return false;
+  }
+
+  ir::Value* then_result = nullptr;
+  ir::Value* else_result = nullptr;
+  if (!MatchSingleReturnValue(branch->GetThenBlock(), &then_result) ||
+      !MatchSingleReturnValue(branch->GetElseBlock(), &else_result)) {
+    return false;
+  }
+  if (!IsMappedI32Value(function, then_result) ||
+      !IsMappedI32Value(function, else_result)) {
+    return false;
+  }
+
+  shape->cond = LowerIntCond(compare->GetOpcode());
+  shape->cmp_lhs = compare->GetLhs();
+  shape->cmp_rhs = compare->GetRhs();
+  shape->true_value = then_result;
+  shape->false_value = else_result;
+  return true;
+}
+
+// Recognizes a non-external, non-recursive, float-returning function made of
+// exactly three blocks: an entry block holding one float compare and a
+// conditional branch on it, plus two blocks that each only return a value.
+// Compare operands and returned values must each be a float constant or an
+// argument of `function`. On a match `*shape` is filled in and true is
+// returned; otherwise `*shape` is left untouched.
+bool MatchTwoWayFloatSelectFunction(const ir::Function& function,
+                                    FloatSelectCallShape* shape) {
+  if (shape == nullptr || function.IsExternal()) {
+    return false;
+  }
+  if (function.GetReturnType() == nullptr || !function.GetReturnType()->IsFloat()) {
+    return false;
+  }
+  if (function.GetBlocks().size() != 3 || function.IsRecursive()) {
+    return false;
+  }
+
+  auto* entry = function.GetEntryBlock();
+  if (entry == nullptr || entry->GetInstructions().size() != 2) {
+    return false;
+  }
+  auto* compare = ir::dyncast<ir::BinaryInst>(entry->GetInstructions()[0].get());
+  auto* branch = ir::dyncast<ir::CondBrInst>(entry->GetInstructions()[1].get());
+  if (compare == nullptr || branch == nullptr || branch->GetCondition() != compare) {
+    return false;
+  }
+  if (!IsFloatCompareOpcode(compare->GetOpcode())) {
+    return false;
+  }
+  if (!IsMappedF32Value(function, compare->GetLhs()) ||
+      !IsMappedF32Value(function, compare->GetRhs())) {
+    return false;
+  }
+
+  ir::Value* then_result = nullptr;
+  ir::Value* else_result = nullptr;
+  if (!MatchSingleReturnValue(branch->GetThenBlock(), &then_result) ||
+      !MatchSingleReturnValue(branch->GetElseBlock(), &else_result)) {
+    return false;
+  }
+  if (!IsMappedF32Value(function, then_result) ||
+      !IsMappedF32Value(function, else_result)) {
+    return false;
+  }
+
+  shape->cond = LowerFloatCond(compare->GetOpcode());
+  shape->cmp_lhs = compare->GetLhs();
+  shape->cmp_rhs = compare->GetRhs();
+  shape->true_value = then_result;
+  shape->false_value = else_result;
+  return true;
+}
+
+// Returns the raw 32-bit pattern of `value`, zero-extended to 64 bits.
+std::int64_t FloatBits(float value) {
+  std::uint32_t bits = 0;
+  std::memcpy(&bits, &value, sizeof(bits));
  return static_cast<std::int64_t>(bits);
 }

@ -353,11 +509,22 @@ class Lowerer {
    if (auto* cb = ir::dyncast<ir::ConstantI1>(value)) {
      return MachineOperand::Imm(cb->GetValue() ? 1 : 0);
    }
-    if (auto* cf = ir::dyncast<ir::ConstantFloat>(value)) {
-      return MachineOperand::Imm(FloatBits(cf->GetValue()));
-    }
-
-    if (inline_values != nullptr) {
+    if (auto* cf = ir::dyncast<ir::ConstantFloat>(value)) {
+      return MachineOperand::Imm(FloatBits(cf->GetValue()));
+    }
+    if (auto* function = ir::dyncast<ir::Function>(value)) {
+      auto lowered = NewVRegValue(ValueType::Ptr);
+      AddressExpr address;
+      address.base_kind = AddrBaseKind::Global;
+      address.symbol = function->GetName();
+      MachineInstr instr(MachineInstr::Opcode::Lea,
+                         {MachineOperand::VReg(lowered.index)});
+      instr.SetAddress(std::move(address));
+      current_block_->Append(std::move(instr));
+      return MachineOperand::VReg(lowered.index);
+    }
+
+    if (inline_values != nullptr) {
      auto inline_it = inline_values->find(value);
      if (inline_it != inline_values->end()) {
        return inline_it->second;
@ -452,16 +619,31 @@ class Lowerer {
    return {LoweredKind::VReg, type, current_function_->NewVReg(type), ""};
  }

-  LoweredValue MaterializeOperandAsValue(const MachineOperand& operand, ValueType type) {
-    if (operand.GetKind() == OperandKind::VReg) {
-      return {LoweredKind::VReg, type, operand.GetVReg(), ""};
-    }
-
+  // Returns a VReg-backed LoweredValue of `type` for `operand`. An operand
+  // that is already a virtual register is wrapped as-is; anything else is
+  // copied into a freshly allocated virtual register in the current block.
+  LoweredValue MaterializeOperandAsValue(const MachineOperand& operand, ValueType type) {
+    if (operand.GetKind() == OperandKind::VReg) {
+      return {LoweredKind::VReg, type, operand.GetVReg(), ""};
+    }
+
    auto lowered = NewVRegValue(type);
    current_block_->Append(MachineInstr::Opcode::Copy,
+                           {MachineOperand::VReg(lowered.index), operand});
+    return lowered;
+  }
+
+  // Translates a value taken from a matched callee into an operand at the
+  // call site: constants are resolved directly, and a callee argument is
+  // replaced by the call's actual argument at the same index.
+  // Throws std::runtime_error for anything else (or an out-of-range index).
+  MachineOperand ResolveMappedCallOperand(ir::CallInst* call, ir::Value* callee_value,
+                                          const OperandMap* inline_values) {
+    const bool is_constant = ir::dyncast<ir::ConstantInt>(callee_value) != nullptr ||
+                             ir::dyncast<ir::ConstantI1>(callee_value) != nullptr ||
+                             ir::dyncast<ir::ConstantFloat>(callee_value) != nullptr;
+    if (is_constant) {
+      return ResolveScalarOperand(callee_value, inline_values);
+    }
+
+    auto* formal = ir::dyncast<ir::Argument>(callee_value);
+    const bool mappable = formal != nullptr && call != nullptr &&
+                          formal->GetIndex() < call->GetArguments().size();
+    if (!mappable) {
+      throw std::runtime_error(FormatError("mir", "unmapped select-call operand"));
+    }
+    return ResolveScalarOperand(call->GetArguments()[formal->GetIndex()], inline_values);
+  }

  void InsertBeforeTerminator(MachineBasicBlock* block, MachineInstr instr) {
    auto& instructions = block->GetInstructions();
@ -866,6 +1048,48 @@ class Lowerer {
  bool TryEmitMathIdiomCall(ir::CallInst* call, const OperandMap* inline_values,
                            MachineOperand* result_operand) {
    auto* callee = call == nullptr ? nullptr : call->GetCallee();
+    IntSelectCallShape select_shape;
+    if (callee != nullptr && call->GetType() != nullptr && call->GetType()->IsInt32() &&
+        MatchTwoWayIntSelectFunction(*callee, &select_shape)) {
+      auto lowered = NewVRegValue(ValueType::I32);
+      MachineInstr instr(
+          MachineInstr::Opcode::CSelect,
+          {MachineOperand::VReg(lowered.index),
+           ResolveMappedCallOperand(call, select_shape.true_value, inline_values),
+           ResolveMappedCallOperand(call, select_shape.false_value, inline_values),
+           ResolveMappedCallOperand(call, select_shape.cmp_lhs, inline_values),
+           ResolveMappedCallOperand(call, select_shape.cmp_rhs, inline_values)});
+      instr.SetCondCode(select_shape.cond);
+      current_block_->Append(std::move(instr));
+      if (result_operand != nullptr) {
+        *result_operand = MachineOperand::VReg(lowered.index);
+      } else {
+        values_[call] = lowered;
+      }
+      return true;
+    }
+
+    FloatSelectCallShape float_select_shape;
+    if (callee != nullptr && call->GetType() != nullptr && call->GetType()->IsFloat() &&
+        MatchTwoWayFloatSelectFunction(*callee, &float_select_shape)) {
+      auto lowered = NewVRegValue(ValueType::F32);
+      MachineInstr instr(
+          MachineInstr::Opcode::CSelect,
+          {MachineOperand::VReg(lowered.index),
+           ResolveMappedCallOperand(call, float_select_shape.true_value, inline_values),
+           ResolveMappedCallOperand(call, float_select_shape.false_value, inline_values),
+           ResolveMappedCallOperand(call, float_select_shape.cmp_lhs, inline_values),
+           ResolveMappedCallOperand(call, float_select_shape.cmp_rhs, inline_values)});
+      instr.SetCondCode(float_select_shape.cond);
+      current_block_->Append(std::move(instr));
+      if (result_operand != nullptr) {
+        *result_operand = MachineOperand::VReg(lowered.index);
+      } else {
+        values_[call] = lowered;
+      }
+      return true;
+    }
+
    int modulo = 0;
    if (callee != nullptr && call->GetType() != nullptr && call->GetType()->IsInt32() &&
        call->GetArguments().size() == 2 &&
--- a/src/mir/MIRInstr.cpp
+++ b/src/mir/MIRInstr.cpp
@ -39,6 +39,8 @@ std::vector<int> MachineInstr::GetDefs() const {
    case Opcode::Add:
    case Opcode::Sub:
    case Opcode::Mul:
+    case Opcode::MAdd:
+    case Opcode::MSub:
    case Opcode::Div:
    case Opcode::Rem:
    case Opcode::ModMul:
@ -59,6 +61,7 @@ std::vector<int> MachineInstr::GetDefs() const {
    case Opcode::FNeg:
    case Opcode::ICmp:
    case Opcode::FCmp:
+    case Opcode::CSelect:
    case Opcode::ZExt:
    case Opcode::ItoF:
    case Opcode::FtoI:
@ -132,6 +135,18 @@ std::vector<int> MachineInstr::GetUses() const {
        push_vreg(operands_[1]);
      }
      break;
+    case Opcode::MAdd:
+    case Opcode::MSub:
+      if (operands_.size() >= 2) {
+        push_vreg(operands_[1]);
+      }
+      if (operands_.size() >= 3) {
+        push_vreg(operands_[2]);
+      }
+      if (operands_.size() >= 4) {
+        push_vreg(operands_[3]);
+      }
+      break;
    case Opcode::Add:
    case Opcode::Sub:
    case Opcode::Mul:
@ -152,12 +167,21 @@ std::vector<int> MachineInstr::GetUses() const {
    case Opcode::FDiv:
    case Opcode::ICmp:
    case Opcode::FCmp:
+    case Opcode::CSelect:
      if (operands_.size() >= 2) {
        push_vreg(operands_[1]);
      }
      if (operands_.size() >= 3) {
        push_vreg(operands_[2]);
      }
+      if (opcode_ == Opcode::CSelect) {
+        if (operands_.size() >= 4) {
+          push_vreg(operands_[3]);
+        }
+        if (operands_.size() >= 5) {
+          push_vreg(operands_[4]);
+        }
+      }
      break;
    case Opcode::CondBr:
      if (!operands_.empty()) {
--- a/src/mir/RegAlloc.cpp
+++ b/src/mir/RegAlloc.cpp
@ -1,4 +1,4 @@
-#include "mir/MIR.h"
+#include "mir/CodeGen.h"

 #include <algorithm>
 #include <cstdint>
--- a/src/mir/Register.cpp
+++ b/src/mir/Register.cpp
@ -20,6 +20,11 @@ const char* kSRegNames[] = {
    "s8",  "s9",  "s10", "s11", "s12", "s13", "s14", "s15",
    "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
    "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31"};
+const char* kVRegNames[] = {  // Vector register spellings v0..v31, indexed by physical register number; used by GetPhysRegName for vector value types.
+    "v0",  "v1",  "v2",  "v3",  "v4",  "v5",  "v6",  "v7",
+    "v8",  "v9",  "v10", "v11", "v12", "v13", "v14", "v15",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+    "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"};

 }  // namespace

@ -27,7 +32,36 @@ bool IsGPR(ValueType type) {
  return type == ValueType::I1 || type == ValueType::I32 || type == ValueType::Ptr;
 }

-bool IsFPR(ValueType type) { return type == ValueType::F32; }
+// Returns true for scalar F32 and for every type accepted by IsNEON().
+bool IsFPR(ValueType type) {
+  if (type == ValueType::F32) {
+    return true;
+  }
+  return IsNEON(type);
+}
+
+// Returns true for the two vector value types, I32x4 and F32x4.
+bool IsVector(ValueType type) {
+  switch (type) {
+    case ValueType::I32x4:
+    case ValueType::F32x4:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Forwards to IsVector(): every vector value type is treated as a NEON type.
+bool IsNEON(ValueType type) {
+  const bool is_vector_type = IsVector(type);
+  return is_vector_type;
+}
+
+// Lane count of `type`: 4 for I32x4 and F32x4, 1 for every other value type.
+int GetVectorLaneCount(ValueType type) {
+  const bool has_four_lanes =
+      type == ValueType::I32x4 || type == ValueType::F32x4;
+  return has_four_lanes ? 4 : 1;
+}
+
+// Maps a vector type to the type of one lane (I32x4 -> I32, F32x4 -> F32).
+// Any other type is returned unchanged.
+ValueType GetVectorElementType(ValueType type) {
+  if (type == ValueType::I32x4) {
+    return ValueType::I32;
+  }
+  if (type == ValueType::F32x4) {
+    return ValueType::F32;
+  }
+  return type;
+}

 int GetValueSize(ValueType type) {
  switch (type) {
@ -39,6 +73,9 @@ int GetValueSize(ValueType type) {
      return 4;
    case ValueType::Ptr:
      return 8;
+    case ValueType::I32x4:
+    case ValueType::F32x4:
+      return 16;
  }
  return 0;
 }
@ -53,6 +90,9 @@ int GetValueAlign(ValueType type) {
    case ValueType::I32:
    case ValueType::F32:
      return 4;
+    case ValueType::I32x4:
+    case ValueType::F32x4:
+      return 16;
  }
  return 1;
 }
@ -65,6 +105,9 @@ const char* GetPhysRegName(PhysReg reg, ValueType type) {
    if (reg.index < 0 || reg.index >= 32) {
      throw std::runtime_error("float register index out of range");
    }
+    if (IsVector(type)) {
+      return kVRegNames[reg.index];
+    }
    return kSRegNames[reg.index];
  }
  if (reg.index < 0 || reg.index >= 31) {
--- a/src/mir/passes/AddressHoisting.cpp
+++ b/src/mir/passes/AddressHoisting.cpp
@ -0,0 +1,464 @@
+#include "mir/Passes.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstddef>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "utils/OptConfig.h"
+
+namespace mir {
+namespace {
+
+// Decides whether the address of stack object `object_index` is worth keeping
+// in a dedicated base vreg. Only Local objects referenced at least twice
+// qualify; they must additionally be either at least 4096 bytes, or at least
+// 256 bytes and referenced four or more times.
+bool IsHoistCandidate(const MachineFunction& function, int object_index, int use_count) {
+  const auto& stack_object = function.GetStackObject(object_index);
+  const bool is_local = stack_object.kind == StackObjectKind::Local;
+  if (!is_local || use_count < 2) {
+    return false;
+  }
+  const bool is_huge = stack_object.size >= 4096;
+  const bool is_medium_and_hot = stack_object.size >= 256 && use_count >= 4;
+  return is_huge || is_medium_and_hot;
+}
+
+// True when `inst` is a Lea whose first operand is a vreg and whose address is
+// exactly the start of stack object `object_index` (zero offset, no scaled
+// vreg terms).
+bool IsPlainFrameLea(const MachineInstr& inst, int object_index) {
+  if (inst.GetOpcode() != MachineInstr::Opcode::Lea) {
+    return false;
+  }
+  if (!inst.HasAddress()) {
+    return false;
+  }
+  const auto& operands = inst.GetOperands();
+  if (operands.empty() || operands[0].GetKind() != OperandKind::VReg) {
+    return false;
+  }
+  const auto& addr = inst.GetAddress();
+  if (addr.base_kind != AddrBaseKind::FrameObject || addr.base_index != object_index) {
+    return false;
+  }
+  return addr.const_offset == 0 && addr.scaled_vregs.empty();
+}
+
+// True when `inst` is a Lea whose first operand is a vreg and whose address is
+// exactly the global `symbol` (zero offset, no scaled vreg terms).
+bool IsPlainGlobalLea(const MachineInstr& inst, const std::string& symbol) {
+  if (inst.GetOpcode() != MachineInstr::Opcode::Lea) {
+    return false;
+  }
+  if (!inst.HasAddress()) {
+    return false;
+  }
+  const auto& operands = inst.GetOperands();
+  if (operands.empty() || operands[0].GetKind() != OperandKind::VReg) {
+    return false;
+  }
+  const auto& addr = inst.GetAddress();
+  if (addr.base_kind != AddrBaseKind::Global || addr.symbol != symbol) {
+    return false;
+  }
+  return addr.const_offset == 0 && addr.scaled_vregs.empty();
+}
+
+// Reports whether `block` contains at least one Call or Memset instruction.
+bool HasCallClobberingInstruction(const MachineBasicBlock& block) {
+  const auto& insts = block.GetInstructions();
+  return std::any_of(insts.begin(), insts.end(), [](const MachineInstr& inst) {
+    const auto opcode = inst.GetOpcode();
+    return opcode == MachineInstr::Opcode::Call ||
+           opcode == MachineInstr::Opcode::Memset;
+  });
+}
+
+// Reports whether any block of `function` contains a Call or Memset
+// instruction (delegates to the per-block overload).
+bool HasCallClobberingInstruction(const MachineFunction& function) {
+  const auto& blocks = function.GetBlocks();
+  return std::any_of(blocks.begin(), blocks.end(), [](const auto& owned_block) {
+    return HasCallClobberingInstruction(*owned_block);
+  });
+}
+
+// Returns the index of the first instruction in `block` that is not an Arg,
+// i.e. the length of the leading run of Arg instructions.
+std::size_t FindEntryInsertPos(const MachineBasicBlock& block) {
+  std::size_t leading_args = 0;
+  for (const auto& inst : block.GetInstructions()) {
+    if (inst.GetOpcode() != MachineInstr::Opcode::Arg) {
+      break;
+    }
+    ++leading_args;
+  }
+  return leading_args;
+}
+
+// Identity of an address "stem": the base, symbol and scaled-vreg terms of an
+// AddressExpr. The constant offset is deliberately not part of the key.
+struct AddressStemKey {
+  AddrBaseKind base_kind = AddrBaseKind::None;
+  int base_index = -1;
+  std::string symbol;
+  // Each entry pairs a vreg (first) with a 64-bit integer (second), presumably
+  // the scale factor applied to that vreg.
+  std::vector<std::pair<int, std::int64_t>> scaled_vregs;
+
+  // Two keys are equal when all four fields match.
+  bool operator==(const AddressStemKey& rhs) const {
+    if (base_kind != rhs.base_kind || base_index != rhs.base_index) {
+      return false;
+    }
+    return symbol == rhs.symbol && scaled_vregs == rhs.scaled_vregs;
+  }
+};
+
+// Hash functor for AddressStemKey. Every field is folded into the running
+// seed with the same combine step (0x9e3779b9 plus shifted seed).
+struct AddressStemKeyHash {
+  std::size_t operator()(const AddressStemKey& key) const {
+    std::size_t seed = static_cast<std::size_t>(key.base_kind);
+    // Folds one already-hashed value into `seed`.
+    const auto mix = [&seed](std::size_t hashed) {
+      seed ^= hashed + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    };
+    mix(std::hash<int>{}(key.base_index));
+    mix(std::hash<std::string>{}(key.symbol));
+    for (const auto& [vreg, factor] : key.scaled_vregs) {
+      mix(std::hash<int>{}(vreg));
+      mix(std::hash<std::int64_t>{}(factor));
+    }
+    return seed;
+  }
+};
+
+// Per-stem bookkeeping gathered while scanning one block.
+struct AddressStemInfo {
+  int count{0};                   // Instructions in the block whose address has this stem.
+  std::size_t first_pos{0};       // Index of the first such instruction.
+  std::size_t existing_pos{0};    // Index of the last zero-offset Lea that already computes the stem.
+  int base_vreg{-1};              // Vreg holding the stem address; -1 until one is chosen.
+  bool has_existing_base{false};  // Whether a zero-offset stem Lea was seen at all.
+};
+
+// A stem is only worth hoisting when the address has a base and at least one
+// scaled vreg term.
+bool ShouldHoistAddressStem(const AddressExpr& address) {
+  const bool has_base = address.base_kind != AddrBaseKind::None;
+  const bool has_scaled_terms = !address.scaled_vregs.empty();
+  return has_base && has_scaled_terms;
+}
+
+// Extracts the stem of `address`: base kind, base index, symbol and scaled
+// vreg terms. The constant offset is not copied.
+AddressStemKey MakeAddressStemKey(const AddressExpr& address) {
+  AddressStemKey stem;
+  stem.scaled_vregs = address.scaled_vregs;
+  stem.symbol = address.symbol;
+  stem.base_index = address.base_index;
+  stem.base_kind = address.base_kind;
+  return stem;
+}
+
+// Inverse of MakeAddressStemKey(): builds an AddressExpr from the key's four
+// fields. No other field of the default-constructed AddressExpr is assigned.
+AddressExpr MakeStemAddress(const AddressStemKey& key) {
+  AddressExpr stem_address;
+  stem_address.scaled_vregs = key.scaled_vregs;
+  stem_address.symbol = key.symbol;
+  stem_address.base_index = key.base_index;
+  stem_address.base_kind = key.base_kind;
+  return stem_address;
+}
+
+// True when `inst` is a Lea with a vreg as first operand whose address is
+// exactly the stem `key` with a zero constant offset.
+bool IsExistingStemLea(const MachineInstr& inst, const AddressStemKey& key) {
+  if (inst.GetOpcode() != MachineInstr::Opcode::Lea) {
+    return false;
+  }
+  if (!inst.HasAddress()) {
+    return false;
+  }
+  const auto& operands = inst.GetOperands();
+  if (operands.empty() || operands[0].GetKind() != OperandKind::VReg) {
+    return false;
+  }
+  const auto& addr = inst.GetAddress();
+  if (addr.const_offset != 0) {
+    return false;
+  }
+  return MakeAddressStemKey(addr) == key;
+}
+
+bool RunScaledAddressStemHoisting(MachineFunction& function, MachineBasicBlock& block) {  // Block-local reuse of repeated "base + scaled vregs" address stems; returns true when the block was rewritten.
+  if (HasCallClobberingInstruction(block)) {  // Blocks containing a Call or Memset are left untouched.
+    return false;
+  }
+  // Pass 1: count the occurrences of each stem and remember where it first appears.
+  auto& instructions = block.GetInstructions();
+  std::unordered_map<AddressStemKey, AddressStemInfo, AddressStemKeyHash> stems;
+  for (std::size_t i = 0; i < instructions.size(); ++i) {
+    const auto& inst = instructions[i];
+    if (!inst.HasAddress() || !ShouldHoistAddressStem(inst.GetAddress())) {
+      continue;
+    }
+    const auto key = MakeAddressStemKey(inst.GetAddress());
+    auto& info = stems[key];
+    if (info.count == 0) {
+      info.first_pos = i;
+    }
+    ++info.count;
+    if (IsExistingStemLea(inst, key)) {  // A zero-offset Lea already computes this stem; the last one seen wins.
+      info.base_vreg = inst.GetOperands()[0].GetVReg();
+      info.existing_pos = i;
+      info.has_existing_base = true;
+    }
+  }
+  // Pass 2: keep stems that occur at least 3 times and choose the vreg that will hold each one.
+  struct SelectedStem {
+    AddressStemKey key;
+    std::size_t insert_pos = 0;  // Index of the stem's first occurrence in the original instruction list.
+    int base_vreg = -1;
+    bool needs_insert = true;  // False when an existing Lea is reused instead of inserting a new one.
+  };
+  std::vector<SelectedStem> selected;
+  for (auto& [key, info] : stems) {
+    if (info.count < 3) {
+      continue;
+    }
+    const bool reuse_existing = info.has_existing_base && info.existing_pos == info.first_pos;  // Reuse only a Lea that is itself the first occurrence, so it precedes every other user.
+    if (!reuse_existing) {
+      info.base_vreg = function.NewVReg(ValueType::Ptr);
+    }
+    selected.push_back({key, info.first_pos, info.base_vreg, !reuse_existing});
+  }
+  if (selected.empty()) {
+    return false;
+  }
+  // Sort by insertion point so the rebuild loop below can consume `selected` front to back.
+  std::sort(selected.begin(), selected.end(), [](const SelectedStem& lhs,
+                                                 const SelectedStem& rhs) {
+    return lhs.insert_pos < rhs.insert_pos;
+  });
+
+  auto find_selected = [&](const AddressExpr& address) -> const SelectedStem* {  // Linear lookup of the selected stem matching `address`, or nullptr.
+    if (!ShouldHoistAddressStem(address)) {
+      return nullptr;
+    }
+    const auto key = MakeAddressStemKey(address);
+    for (const auto& stem : selected) {
+      if (stem.key == key) {
+        return &stem;
+      }
+    }
+    return nullptr;
+  };
+  // Pass 3: rewrite every user of a selected stem to [stem vreg + original constant offset].
+  for (auto& inst : instructions) {
+    if (!inst.HasAddress()) {
+      continue;
+    }
+    auto* stem = find_selected(inst.GetAddress());
+    if (stem == nullptr) {
+      continue;
+    }
+    if (IsExistingStemLea(inst, stem->key) &&  // The reused Lea defines the stem vreg and must keep its original address.
+        inst.GetOperands()[0].GetKind() == OperandKind::VReg &&
+        inst.GetOperands()[0].GetVReg() == stem->base_vreg) {
+      continue;
+    }
+    const std::int64_t old_offset = inst.GetAddress().const_offset;  // NOTE(review): assumes the stem's vregs are not redefined between the first and last use in this block — TODO confirm.
+    auto& address = inst.GetAddress();
+    address.base_kind = AddrBaseKind::VReg;
+    address.base_index = stem->base_vreg;
+    address.symbol.clear();
+    address.const_offset = old_offset;
+    address.scaled_vregs.clear();
+  }
+  // Pass 4: rebuild the instruction list, placing each new stem Lea right before the stem's first user.
+  std::vector<MachineInstr> rewritten;
+  rewritten.reserve(instructions.size() + selected.size());
+  std::size_t next_stem = 0;
+  for (std::size_t i = 0; i < instructions.size(); ++i) {
+    while (next_stem < selected.size() && selected[next_stem].insert_pos == i) {
+      const auto& stem = selected[next_stem];
+      if (stem.needs_insert) {
+        MachineInstr lea(MachineInstr::Opcode::Lea,
+                         {MachineOperand::VReg(stem.base_vreg)});
+        lea.SetAddress(MakeStemAddress(stem.key));
+        rewritten.push_back(std::move(lea));
+      }
+      ++next_stem;
+    }
+    rewritten.push_back(std::move(instructions[i]));
+  }
+  while (next_stem < selected.size()) {  // NOTE(review): every insert_pos is a valid instruction index, so this tail loop appears unreachable — confirm.
+    const auto& stem = selected[next_stem];
+    if (stem.needs_insert) {
+      MachineInstr lea(MachineInstr::Opcode::Lea,
+                       {MachineOperand::VReg(stem.base_vreg)});
+      lea.SetAddress(MakeStemAddress(stem.key));
+      rewritten.push_back(std::move(lea));
+    }
+    ++next_stem;
+  }
+  instructions = std::move(rewritten);
+  return true;
+}
+
+}  // namespace
+
+void RunAddressHoisting(MachineModule& module) {  // For each function: compute frequently used frame-object/global base addresses once and retarget their users to a base vreg.
+  for (auto& function : module.GetFunctions()) {
+    if (!function || function->GetBlocks().empty()) {
+      continue;
+    }
+    // Phase 1: count address uses per stack object and, when allowed, per global symbol.
+    const bool run_global_addr_hoist =
+        utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_GLOBAL_ADDR_HOIST");
+    const bool has_call_clobber = HasCallClobberingInstruction(*function);
+    std::unordered_map<int, int> use_counts;
+    std::unordered_map<std::string, int> global_use_counts;
+    for (auto& block : function->GetBlocks()) {
+      for (auto& inst : block->GetInstructions()) {
+        if (!inst.HasAddress()) {
+          continue;
+        }
+        const auto& address = inst.GetAddress();
+        if (address.base_kind == AddrBaseKind::FrameObject && address.base_index >= 0) {
+          ++use_counts[address.base_index];
+        } else if (run_global_addr_hoist && !has_call_clobber &&
+                   address.base_kind == AddrBaseKind::Global && !address.symbol.empty()) {  // Function-wide global hoisting is skipped when the function contains any Call/Memset.
+          ++global_use_counts[address.symbol];
+        }
+      }
+    }
+    // Phase 2: pick candidates; a mapped value of -1 means "base vreg not created yet".
+    std::unordered_map<int, int> base_vregs;
+    for (const auto& [object_index, count] : use_counts) {
+      if (!IsHoistCandidate(*function, object_index, count)) {
+        continue;
+      }
+      base_vregs.emplace(object_index, -1);
+    }
+    std::unordered_map<std::string, int> global_base_vregs;
+    for (const auto& [symbol, count] : global_use_counts) {
+      if (count >= 2) {  // A global needs at least two uses to be hoisted function-wide.
+        global_base_vregs.emplace(symbol, -1);
+      }
+    }
+
+    if (base_vregs.empty() && global_base_vregs.empty() && !run_global_addr_hoist) {  // Nothing to hoist function-wide and the block-local passes below are disabled as well.
+      continue;
+    }
+
+    // Existing LEA instructions may be inside non-entry blocks and do not
+    // necessarily dominate all rewritten users. Always create a fresh base in
+    // the entry block for function-wide hoisting.
+
+    auto& entry_block = *function->GetBlocks().front();
+    auto& entry_insts = entry_block.GetInstructions();
+    std::size_t insert_pos = FindEntryInsertPos(entry_block);
+    // Phase 3: insert one Lea per candidate into the entry block, after its leading Arg instructions.
+    for (auto& [object_index, base_vreg] : base_vregs) {
+      if (base_vreg >= 0) {
+        continue;
+      }
+      base_vreg = function->NewVReg(ValueType::Ptr);
+      MachineInstr lea(MachineInstr::Opcode::Lea, {MachineOperand::VReg(base_vreg)});
+      AddressExpr address;
+      address.base_kind = AddrBaseKind::FrameObject;
+      address.base_index = object_index;
+      lea.SetAddress(std::move(address));
+      entry_insts.insert(entry_insts.begin() + static_cast<std::ptrdiff_t>(insert_pos),
+                         std::move(lea));
+      ++insert_pos;
+    }
+    for (auto& [symbol, base_vreg] : global_base_vregs) {
+      if (base_vreg >= 0) {
+        continue;
+      }
+      base_vreg = function->NewVReg(ValueType::Ptr);
+      MachineInstr lea(MachineInstr::Opcode::Lea, {MachineOperand::VReg(base_vreg)});
+      AddressExpr address;
+      address.base_kind = AddrBaseKind::Global;
+      address.symbol = symbol;
+      lea.SetAddress(std::move(address));
+      entry_insts.insert(entry_insts.begin() + static_cast<std::ptrdiff_t>(insert_pos),
+                         std::move(lea));
+      ++insert_pos;
+    }
+    // Phase 4: retarget frame-object users to the hoisted base vreg; constant offset and scaled terms are left as they are.
+    for (auto& block : function->GetBlocks()) {
+      for (auto& inst : block->GetInstructions()) {
+        if (!inst.HasAddress()) {
+          continue;
+        }
+        auto& address = inst.GetAddress();
+        auto it = base_vregs.find(address.base_index);
+        if (it == base_vregs.end()) {
+          continue;
+        }
+        if (IsPlainFrameLea(inst, address.base_index) &&  // Skip the hoisted Lea itself so it keeps its FrameObject address.
+            inst.GetOperands()[0].GetKind() == OperandKind::VReg &&
+            inst.GetOperands()[0].GetVReg() == it->second) {
+          continue;
+        }
+        if (address.base_kind != AddrBaseKind::FrameObject || address.base_index < 0) {  // base_index was looked up before the kind was checked; reject non-frame addresses here.
+          continue;
+        }
+        address.base_kind = AddrBaseKind::VReg;
+        address.base_index = it->second;
+      }
+    }
+    for (auto& block : function->GetBlocks()) {  // Same retargeting for the function-wide global candidates.
+      for (auto& inst : block->GetInstructions()) {
+        if (!inst.HasAddress()) {
+          continue;
+        }
+        auto& address = inst.GetAddress();
+        if (address.base_kind != AddrBaseKind::Global || address.symbol.empty()) {
+          continue;
+        }
+        auto it = global_base_vregs.find(address.symbol);
+        if (it == global_base_vregs.end()) {
+          continue;
+        }
+        if (IsPlainGlobalLea(inst, address.symbol) &&  // Skip the hoisted Lea itself.
+            inst.GetOperands()[0].GetKind() == OperandKind::VReg &&
+            inst.GetOperands()[0].GetVReg() == it->second) {
+          continue;
+        }
+        address.base_kind = AddrBaseKind::VReg;  // NOTE(review): address.symbol stays populated here, whereas RunScaledAddressStemHoisting clears it — confirm this is intended.
+        address.base_index = it->second;
+      }
+    }
+    // Phase 5: block-local global hoisting, gated by the same NUDTC_DISABLE_GLOBAL_ADDR_HOIST switch.
+    if (!run_global_addr_hoist) {  // This also skips the scaled-stem hoisting at the end of the loop body.
+      continue;
+    }
+    for (auto& block : function->GetBlocks()) {
+      if (HasCallClobberingInstruction(*block)) {
+        continue;
+      }
+      // Count the Global-based addresses that remain in this block.
+      std::unordered_map<std::string, int> block_global_counts;
+      for (const auto& inst : block->GetInstructions()) {
+        if (!inst.HasAddress()) {
+          continue;
+        }
+        const auto& address = inst.GetAddress();
+        if (address.base_kind == AddrBaseKind::Global && !address.symbol.empty()) {
+          ++block_global_counts[address.symbol];
+        }
+      }
+
+      std::unordered_map<std::string, int> block_base_vregs;
+      for (const auto& [symbol, count] : block_global_counts) {
+        if (count >= 3) {  // Block-local hoisting requires at least three uses within the block.
+          block_base_vregs.emplace(symbol, -1);
+        }
+      }
+      if (block_base_vregs.empty()) {
+        continue;
+      }
+      // Insert the per-block base Leas after any leading Arg instructions of this block.
+      auto& instructions = block->GetInstructions();
+      std::size_t insert_pos = FindEntryInsertPos(*block);
+      for (auto& [symbol, base_vreg] : block_base_vregs) {
+        if (base_vreg >= 0) {
+          continue;
+        }
+        base_vreg = function->NewVReg(ValueType::Ptr);
+        MachineInstr lea(MachineInstr::Opcode::Lea, {MachineOperand::VReg(base_vreg)});
+        AddressExpr address;
+        address.base_kind = AddrBaseKind::Global;
+        address.symbol = symbol;
+        lea.SetAddress(std::move(address));
+        instructions.insert(instructions.begin() + static_cast<std::ptrdiff_t>(insert_pos),
+                            std::move(lea));
+        ++insert_pos;
+      }
+      // Retarget the block's users of each hoisted global to its per-block base vreg.
+      for (auto& inst : block->GetInstructions()) {
+        if (!inst.HasAddress()) {
+          continue;
+        }
+        auto& address = inst.GetAddress();
+        if (address.base_kind != AddrBaseKind::Global || address.symbol.empty()) {
+          continue;
+        }
+        auto it = block_base_vregs.find(address.symbol);
+        if (it == block_base_vregs.end()) {
+          continue;
+        }
+        if (IsPlainGlobalLea(inst, address.symbol) &&  // Skip the Lea that was just inserted for this symbol.
+            inst.GetOperands()[0].GetKind() == OperandKind::VReg &&
+            inst.GetOperands()[0].GetVReg() == it->second) {
+          continue;
+        }
+        address.base_kind = AddrBaseKind::VReg;
+        address.base_index = it->second;
+      }
+    }
+    // Phase 6: per-block hoisting of repeated "base + scaled vregs" stems; its changed-flag result is ignored.
+    for (auto& block : function->GetBlocks()) {
+      RunScaledAddressStemHoisting(*function, *block);
+    }
+  }
+}
+
+}  // namespace mir
--- a/src/mir/passes/CFGCleanup.cpp
+++ b/src/mir/passes/CFGCleanup.cpp
@ -1,4 +1,4 @@
-#include "mir/MIR.h"
+#include "mir/Passes.h"

 #include <algorithm>
 #include <string>
@ -6,6 +6,8 @@
 #include <unordered_set>
 #include <vector>

+#include "utils/OptConfig.h"
+
 namespace mir {
 namespace {

@ -29,6 +31,9 @@ std::vector<int> CollectSuccessors(const MachineFunction& function, int index) {
  }
  const auto& instructions = blocks[index]->GetInstructions();
  if (instructions.empty()) {
+    if (index + 1 < static_cast<int>(blocks.size()) && blocks[index + 1]) {
+      succs.push_back(index + 1);
+    }
    return succs;
  }
  const auto& term = instructions.back();
@ -52,6 +57,12 @@ std::vector<int> CollectSuccessors(const MachineFunction& function, int index) {
        succs.push_back(succ);
      }
    }
+    return succs;
+  }
+
+  if (!term.IsTerminator() && index + 1 < static_cast<int>(blocks.size()) &&
+      blocks[index + 1]) {
+    succs.push_back(index + 1);
  }
  return succs;
 }
@ -136,6 +147,28 @@ bool RewriteBranchTargets(MachineFunction& function) {
  return changed;
 }

+// Deletes a trailing `Br` whose target is the block that immediately follows
+// in layout order, so control reaches it by falling through instead.
+// Returns true if at least one branch was removed.
+bool RemoveBranchToNextBlock(MachineFunction& function) {
+  auto& blocks = function.GetBlocks();
+  bool removed_any = false;
+  for (std::size_t next_index = 1; next_index < blocks.size(); ++next_index) {
+    auto& current = blocks[next_index - 1];
+    auto& successor = blocks[next_index];
+    if (!current || !successor) {
+      continue;
+    }
+    auto& insts = current->GetInstructions();
+    if (insts.empty()) {
+      continue;
+    }
+    const auto& last = insts.back();
+    if (last.GetOpcode() != MachineInstr::Opcode::Br) {
+      continue;
+    }
+    const auto& targets = last.GetOperands();
+    const bool jumps_to_successor =
+        !targets.empty() && targets[0].GetKind() == OperandKind::Block &&
+        targets[0].GetText() == successor->GetName();
+    if (jumps_to_successor) {
+      insts.pop_back();
+      removed_any = true;
+    }
+  }
+  return removed_any;
+}
+
 bool RemoveUnreachableBlocks(MachineFunction& function) {
  auto& blocks = function.GetBlocks();
  if (blocks.empty() || !blocks.front()) {
@ -221,6 +254,9 @@ bool RunCFGCleanupOnFunction(MachineFunction& function) {
      break;
    }
  }
+  if (utils::IsEnvFlagSet("NUDTC_ENABLE_MIR_FALLTHROUGH_BRANCH_ELIM")) {
+    changed |= RemoveBranchToNextBlock(function);
+  }
  return changed;
 }

--- a/src/mir/passes/CMakeLists.txt
+++ b/src/mir/passes/CMakeLists.txt
@ -1,8 +1,10 @@
 add_library(mir_passes STATIC
  PassManager.cpp
+  AddressHoisting.cpp
  Peephole.cpp
  SpillReduction.cpp
  CFGCleanup.cpp
+  MIRVerifier.cpp
 )

 target_link_libraries(mir_passes PUBLIC
--- a/src/mir/passes/MIRVerifier.cpp
+++ b/src/mir/passes/MIRVerifier.cpp
@ -0,0 +1,117 @@
+#include "mir/Passes.h"
+
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <unordered_set>
+
+namespace mir {
+namespace {
+
+// Throws std::runtime_error with the text
+// "[mir-verify][ function <name>][ block <name>]: <message>".
+// The function and block parts are omitted when the pointer is null.
+[[noreturn]] void Fail(const MachineFunction* function,
+                       const MachineBasicBlock* block,
+                       const std::string& message) {
+  std::string text = "[mir-verify]";
+  if (function != nullptr) {
+    text.append(" function ").append(function->GetName());
+  }
+  if (block != nullptr) {
+    text.append(" block ").append(block->GetName());
+  }
+  text.append(": ").append(message);
+  throw std::runtime_error(text);
+}
+
+// Fails verification unless `vreg` is a valid index into the function's
+// virtual register table.
+void CheckVReg(const MachineFunction& function, const MachineBasicBlock& block,
+               int vreg) {
+  const int vreg_count = static_cast<int>(function.GetVRegs().size());
+  const bool in_range = vreg >= 0 && vreg < vreg_count;
+  if (!in_range) {
+    Fail(&function, &block, "instruction references invalid virtual register");
+  }
+}
+
+// Validates the indices used by `address`: a FrameObject base must name an
+// existing stack object, a VReg base and every scaled term must name an
+// existing virtual register. Other base kinds are not checked.
+void CheckAddress(const MachineFunction& function, const MachineBasicBlock& block,
+                  const AddressExpr& address) {
+  switch (address.base_kind) {
+    case AddrBaseKind::FrameObject: {
+      const int object_count = static_cast<int>(function.GetStackObjects().size());
+      if (address.base_index < 0 || address.base_index >= object_count) {
+        Fail(&function, &block, "address references invalid stack object");
+      }
+      break;
+    }
+    case AddrBaseKind::VReg:
+      CheckVReg(function, block, address.base_index);
+      break;
+    default:
+      break;
+  }
+  for (const auto& scaled : address.scaled_vregs) {
+    CheckVReg(function, block, scaled.first);
+  }
+}
+
+// Fails verification if any Block operand of `instr` names a block that is not
+// in `block_names`.
+void CheckBlockTargets(const MachineFunction& function,
+                       const MachineBasicBlock& block,
+                       const std::unordered_set<std::string>& block_names,
+                       const MachineInstr& instr) {
+  for (const auto& operand : instr.GetOperands()) {
+    if (operand.GetKind() != OperandKind::Block) {
+      continue;
+    }
+    if (block_names.find(operand.GetText()) == block_names.end()) {
+      Fail(&function, &block, "branch references unknown block");
+    }
+  }
+}
+
+// Runs every structural check on one function and throws (via Fail) on the
+// first violation. Checks run in this order: block list, stack objects,
+// per-instruction checks, spill allocations.
+void CheckFunction(const MachineFunction& function) {
+  const auto& blocks = function.GetBlocks();
+  const auto& stack_objects = function.GetStackObjects();
+  const int stack_object_count = static_cast<int>(stack_objects.size());
+
+  // 1. No null blocks, and block names are unique.
+  std::unordered_set<std::string> seen_names;
+  for (const auto& candidate : blocks) {
+    if (!candidate) {
+      Fail(&function, nullptr, "null block");
+    }
+    const bool inserted = seen_names.insert(candidate->GetName()).second;
+    if (!inserted) {
+      Fail(&function, candidate.get(), "duplicate block name");
+    }
+  }
+
+  // 2. Stack objects have a non-negative index and size and a positive alignment.
+  for (const auto& object : stack_objects) {
+    const bool well_formed = object.index >= 0 && object.size >= 0 && object.align > 0;
+    if (!well_formed) {
+      Fail(&function, nullptr, "invalid stack object");
+    }
+  }
+
+  // 3. Per instruction: terminator placement, vreg indices, address, branch targets.
+  for (const auto& owned_block : blocks) {
+    const auto& block = *owned_block;
+    const auto& insts = block.GetInstructions();
+    const std::size_t inst_count = insts.size();
+    for (std::size_t pos = 0; pos < inst_count; ++pos) {
+      const auto& instr = insts[pos];
+      const bool is_last = pos + 1 == inst_count;
+      if (!is_last && instr.IsTerminator()) {
+        Fail(&function, &block, "terminator is not the last instruction");
+      }
+      for (int defined : instr.GetDefs()) {
+        CheckVReg(function, block, defined);
+      }
+      for (int used : instr.GetUses()) {
+        CheckVReg(function, block, used);
+      }
+      if (instr.HasAddress()) {
+        CheckAddress(function, block, instr.GetAddress());
+      }
+      CheckBlockTargets(function, block, seen_names, instr);
+    }
+  }
+
+  // 4. Every spilled vreg points at an existing stack object.
+  for (const auto& vreg : function.GetVRegs()) {
+    const auto& allocation = function.GetAllocation(vreg.id);
+    if (allocation.kind != Allocation::Kind::Spill) {
+      continue;
+    }
+    if (allocation.stack_object < 0 || allocation.stack_object >= stack_object_count) {
+      Fail(&function, nullptr, "spill allocation references invalid stack object");
+    }
+  }
+}
+
+}  // namespace
+
+// Verifies every non-null function of `module`; CheckFunction throws on the
+// first violation it finds.
+void VerifyMIR(const MachineModule& module) {
+  for (const auto& function : module.GetFunctions()) {
+    if (!function) {
+      continue;
+    }
+    CheckFunction(*function);
+  }
+}
+
+}  // namespace mir
--- a/src/mir/passes/PassManager.cpp
+++ b/src/mir/passes/PassManager.cpp
@ -1,23 +1,30 @@
-#include "mir/MIR.h"
+#include "mir/Passes.h"

-#include <cstdlib>
+#include "utils/OptConfig.h"

 namespace mir {
+namespace {
+
+// Runs the MIR verifier only when the NUDTC_VERIFY_MIR environment flag is set.
+void VerifyIfRequested(const MachineModule& module) {
+  const bool verification_requested = utils::IsEnvFlagSet("NUDTC_VERIFY_MIR");
+  if (!verification_requested) {
+    return;
+  }
+  VerifyMIR(module);
+}
+
+}  // namespace

 void RunMIRPreRegAllocPassPipeline(MachineModule& module) {
-  const char* disable_spill_reduction = std::getenv("NUDTC_DISABLE_MIR_SPILL_REDUCTION");
  const bool run_spill_reduction =
-      disable_spill_reduction == nullptr || disable_spill_reduction[0] == '\0' ||
-      disable_spill_reduction[0] == '0';
-  const char* disable_cfg_cleanup = std::getenv("NUDTC_DISABLE_MIR_CFG_CLEANUP");
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_MIR_SPILL_REDUCTION");
  const bool run_cfg_cleanup =
-      disable_cfg_cleanup == nullptr || disable_cfg_cleanup[0] == '\0' ||
-      disable_cfg_cleanup[0] == '0';
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_MIR_CFG_CLEANUP");

  if (run_spill_reduction) {
    RunSpillReduction(module);
+    VerifyIfRequested(module);
  }
  RunAddressHoisting(module);
+  VerifyIfRequested(module);

  constexpr int kMaxIterations = 4;
  for (int iteration = 0; iteration < kMaxIterations; ++iteration) {
@ -26,6 +33,7 @@ void RunMIRPreRegAllocPassPipeline(MachineModule& module) {
    if (run_cfg_cleanup) {
      changed |= RunCFGCleanup(module);
    }
+    VerifyIfRequested(module);
    if (!changed) {
      break;
    }
@ -33,10 +41,8 @@ void RunMIRPreRegAllocPassPipeline(MachineModule& module) {
 }

 void RunMIRPostRegAllocPassPipeline(MachineModule& module) {
-  const char* disable_cfg_cleanup = std::getenv("NUDTC_DISABLE_MIR_CFG_CLEANUP");
  const bool run_cfg_cleanup =
-      disable_cfg_cleanup == nullptr || disable_cfg_cleanup[0] == '\0' ||
-      disable_cfg_cleanup[0] == '0';
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_MIR_CFG_CLEANUP");
  constexpr int kMaxIterations = 2;
  for (int iteration = 0; iteration < kMaxIterations; ++iteration) {
    bool changed = false;
@ -44,6 +50,7 @@ void RunMIRPostRegAllocPassPipeline(MachineModule& module) {
    if (run_cfg_cleanup) {
      changed |= RunCFGCleanup(module);
    }
+    VerifyIfRequested(module);
    if (!changed) {
      break;
    }
--- a/src/mir/passes/Peephole.cpp
+++ b/src/mir/passes/Peephole.cpp
@ -1,9 +1,11 @@
-#include "mir/MIR.h"
+#include "mir/Passes.h"

 #include "ir/IR.h"
+#include "utils/OptConfig.h"

 #include <algorithm>
 #include <cstdint>
+#include <limits>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@ -60,6 +62,19 @@ bool IsImm(const MachineOperand& operand, std::int64_t value) {
  return operand.GetKind() == OperandKind::Imm && operand.GetImm() == value;
 }

+// True when `value` is strictly positive and has exactly one bit set.
+bool IsPositivePowerOfTwo(std::int64_t value) {
+  if (value <= 0) {
+    return false;
+  }
+  // Clearing the lowest set bit leaves zero only for powers of two.
+  const std::int64_t without_lowest_bit = value & (value - 1);
+  return without_lowest_bit == 0;
+}
+
+// Floor of log2(value) for value >= 1, computed by counting right shifts.
+// Returns 0 for any value <= 1, including zero and negatives.
+int Log2I64(std::int64_t value) {
+  int shift = 0;
+  for (std::int64_t rest = value; rest > 1; rest >>= 1) {
+    ++shift;
+  }
+  return shift;
+}
+
 bool SameExactOperand(const MachineOperand& lhs, const MachineOperand& rhs) {
  if (lhs.GetKind() != rhs.GetKind()) {
    return false;
@ -212,6 +227,8 @@ bool RewriteUses(MachineInstr& inst, const AliasMap& aliases) {
    case MachineInstr::Opcode::Add:
    case MachineInstr::Opcode::Sub:
    case MachineInstr::Opcode::Mul:
+    case MachineInstr::Opcode::MAdd:
+    case MachineInstr::Opcode::MSub:
    case MachineInstr::Opcode::Div:
    case MachineInstr::Opcode::Rem:
    case MachineInstr::Opcode::ModMul:
@ -229,12 +246,26 @@ bool RewriteUses(MachineInstr& inst, const AliasMap& aliases) {
    case MachineInstr::Opcode::FDiv:
    case MachineInstr::Opcode::ICmp:
    case MachineInstr::Opcode::FCmp:
+    case MachineInstr::Opcode::CSelect:
      if (operands.size() >= 2) {
        changed |= RewriteOperand(operands[1], aliases);
      }
      if (operands.size() >= 3) {
        changed |= RewriteOperand(operands[2], aliases);
      }
+      if (inst.GetOpcode() == MachineInstr::Opcode::CSelect) {
+        if (operands.size() >= 4) {
+          changed |= RewriteOperand(operands[3], aliases);
+        }
+        if (operands.size() >= 5) {
+          changed |= RewriteOperand(operands[4], aliases);
+        }
+      }
+      if ((inst.GetOpcode() == MachineInstr::Opcode::MAdd ||
+           inst.GetOpcode() == MachineInstr::Opcode::MSub) &&
+          operands.size() >= 4) {
+        changed |= RewriteOperand(operands[3], aliases);
+      }
      break;
    case MachineInstr::Opcode::CondBr:
      if (!operands.empty()) {
@ -301,6 +332,79 @@ bool SimplifyZExt(MachineInstr& inst) {
  return true;
 }

+// Keeps the low 32 bits of `value` and sign-extends them back to 64 bits.
+std::int64_t WrapI32(std::int64_t value) {
+  const auto low_bits = static_cast<std::uint32_t>(value);
+  const auto as_signed = static_cast<std::int32_t>(low_bits);
+  return as_signed;
+}
+
+// Constant-folds a 32-bit integer binary operation whose operands are both
+// immediates. On success stores the wrapped result in `*result` and returns
+// true. Returns false, leaving `*result` untouched, when an operand is not an
+// immediate, the opcode is not handled, a Div/Rem has a zero divisor or is
+// INT32_MIN by -1, or a shift amount is outside [0, 32).
+bool TryFoldIntegerBinaryImmediate(MachineInstr::Opcode opcode,
+                                   const MachineOperand& lhs,
+                                   const MachineOperand& rhs,
+                                   MachineOperand* result) {
+  const bool both_immediate =
+      lhs.GetKind() == OperandKind::Imm && rhs.GetKind() == OperandKind::Imm;
+  if (!both_immediate) {
+    return false;
+  }
+
+  using Opcode = MachineInstr::Opcode;
+  const std::int64_t x = WrapI32(lhs.GetImm());
+  const std::int64_t y = WrapI32(rhs.GetImm());
+  const bool division_is_unsafe =
+      y == 0 || (x == std::numeric_limits<std::int32_t>::min() && y == -1);
+  const bool shift_in_range = y >= 0 && y < 32;
+  const auto shift_amount = static_cast<unsigned>(y);
+
+  // Raw (not yet wrapped) result of the operation.
+  std::int64_t value = 0;
+  switch (opcode) {
+    case Opcode::Add:
+      value = x + y;
+      break;
+    case Opcode::Sub:
+      value = x - y;
+      break;
+    case Opcode::Mul:
+      value = x * y;
+      break;
+    case Opcode::Div:
+      if (division_is_unsafe) {
+        return false;
+      }
+      value = x / y;
+      break;
+    case Opcode::Rem:
+      if (division_is_unsafe) {
+        return false;
+      }
+      value = x % y;
+      break;
+    case Opcode::And:
+      value = x & y;
+      break;
+    case Opcode::Or:
+      value = x | y;
+      break;
+    case Opcode::Xor:
+      value = x ^ y;
+      break;
+    case Opcode::Shl:
+      if (!shift_in_range) {
+        return false;
+      }
+      value = static_cast<std::uint32_t>(x) << shift_amount;
+      break;
+    case Opcode::AShr:
+      if (!shift_in_range) {
+        return false;
+      }
+      value = static_cast<std::int32_t>(x) >> shift_amount;
+      break;
+    case Opcode::LShr:
+      if (!shift_in_range) {
+        return false;
+      }
+      value = static_cast<std::uint32_t>(x) >> shift_amount;
+      break;
+    default:
+      return false;
+  }
+  *result = MachineOperand::Imm(WrapI32(value));
+  return true;
+}
+
 bool SimplifyIntegerBinary(MachineInstr& inst) {
  const auto opcode = inst.GetOpcode();
  const auto& operands = inst.GetOperands();
@ -310,6 +414,12 @@ bool SimplifyIntegerBinary(MachineInstr& inst) {

  const auto& lhs = operands[1];
  const auto& rhs = operands[2];
+  MachineOperand folded;
+  if (TryFoldIntegerBinaryImmediate(opcode, lhs, rhs, &folded)) {
+    inst = MakeCopyLike(inst, folded);
+    return true;
+  }
+
  switch (opcode) {
    case MachineInstr::Opcode::Add:
      if (IsImm(rhs, 0)) {
@ -326,6 +436,10 @@ bool SimplifyIntegerBinary(MachineInstr& inst) {
        inst = MakeCopyLike(inst, lhs);
        return true;
      }
+      if (SameExactOperand(lhs, rhs)) {
+        inst = MakeCopyLike(inst, MachineOperand::Imm(0));
+        return true;
+      }
      return false;
    case MachineInstr::Opcode::Mul:
      if (IsImm(rhs, 1)) {
@ -340,6 +454,28 @@ bool SimplifyIntegerBinary(MachineInstr& inst) {
        inst = MakeCopyLike(inst, MachineOperand::Imm(0));
        return true;
      }
+      if (IsImm(rhs, -1)) {
+        inst = MachineInstr(MachineInstr::Opcode::Sub,
+                            {operands[0], MachineOperand::Imm(0), lhs});
+        return true;
+      }
+      if (IsImm(lhs, -1)) {
+        inst = MachineInstr(MachineInstr::Opcode::Sub,
+                            {operands[0], MachineOperand::Imm(0), rhs});
+        return true;
+      }
+      if (rhs.GetKind() == OperandKind::Imm && IsPositivePowerOfTwo(rhs.GetImm())) {
+        inst = MachineInstr(MachineInstr::Opcode::Shl,
+                            {operands[0], lhs,
+                             MachineOperand::Imm(Log2I64(rhs.GetImm()))});
+        return true;
+      }
+      if (lhs.GetKind() == OperandKind::Imm && IsPositivePowerOfTwo(lhs.GetImm())) {
+        inst = MachineInstr(MachineInstr::Opcode::Shl,
+                            {operands[0], rhs,
+                             MachineOperand::Imm(Log2I64(lhs.GetImm()))});
+        return true;
+      }
      return false;
    case MachineInstr::Opcode::Div:
      if (IsImm(rhs, 1)) {
@ -347,7 +483,17 @@ bool SimplifyIntegerBinary(MachineInstr& inst) {
        return true;
      }
      return false;
+    case MachineInstr::Opcode::Rem:
+      if (IsImm(rhs, 1) || IsImm(rhs, -1)) {
+        inst = MakeCopyLike(inst, MachineOperand::Imm(0));
+        return true;
+      }
+      return false;
    case MachineInstr::Opcode::And:
+      if (SameExactOperand(lhs, rhs)) {
+        inst = MakeCopyLike(inst, lhs);
+        return true;
+      }
      if (IsImm(rhs, -1)) {
        inst = MakeCopyLike(inst, lhs);
        return true;
@ -362,7 +508,28 @@ bool SimplifyIntegerBinary(MachineInstr& inst) {
      }
      return false;
    case MachineInstr::Opcode::Or:
+      if (SameExactOperand(lhs, rhs)) {
+        inst = MakeCopyLike(inst, lhs);
+        return true;
+      }
+      if (IsImm(rhs, -1) || IsImm(lhs, -1)) {
+        inst = MakeCopyLike(inst, MachineOperand::Imm(-1));
+        return true;
+      }
+      if (IsImm(rhs, 0)) {
+        inst = MakeCopyLike(inst, lhs);
+        return true;
+      }
+      if (IsImm(lhs, 0)) {
+        inst = MakeCopyLike(inst, rhs);
+        return true;
+      }
+      return false;
    case MachineInstr::Opcode::Xor:
+      if (SameExactOperand(lhs, rhs)) {
+        inst = MakeCopyLike(inst, MachineOperand::Imm(0));
+        return true;
+      }
      if (IsImm(rhs, 0)) {
        inst = MakeCopyLike(inst, lhs);
        return true;
@ -385,6 +552,62 @@ bool SimplifyIntegerBinary(MachineInstr& inst) {
  }
 }

+// Evaluates the integer comparison `lhs <code> rhs` on constant inputs.
+// Both inputs are first passed through WrapI32 (defined outside this view;
+// presumably 32-bit wrap-around) and then compared as signed values.
+// Returns false for any condition code that is not handled below.
+bool EvalIntCompare(CondCode code, std::int64_t lhs, std::int64_t rhs) {
+  const std::int64_t left = WrapI32(lhs);
+  const std::int64_t right = WrapI32(rhs);
+
+  if (code == CondCode::EQ) {
+    return left == right;
+  }
+  if (code == CondCode::NE) {
+    return left != right;
+  }
+  if (code == CondCode::LT) {
+    return left < right;
+  }
+  if (code == CondCode::GT) {
+    return left > right;
+  }
+  if (code == CondCode::LE) {
+    return left <= right;
+  }
+  if (code == CondCode::GE) {
+    return left >= right;
+  }
+  return false;
+}
+
+// Replaces an ICmp whose outcome is already known with a copy-like
+// instruction (built by MakeCopyLike) of the immediate 1 or 0.
+//
+// Two situations are folded:
+//   * both compared operands are immediates -> evaluated by EvalIntCompare;
+//   * both compared operands are the exact same operand -> EQ/LE/GE give 1,
+//     every other condition code gives 0.
+// Returns true when `inst` was rewritten, false when it was left untouched.
+bool SimplifyICmp(MachineInstr& inst) {
+  if (inst.GetOpcode() != MachineInstr::Opcode::ICmp) {
+    return false;
+  }
+  const auto& operands = inst.GetOperands();
+  if (operands.size() < 3) {
+    return false;
+  }
+
+  // Operand 0 is not inspected here; operands 1 and 2 are the compared values.
+  const auto& lhs = operands[1];
+  const auto& rhs = operands[2];
+  const auto code = inst.GetCondCode();
+
+  bool outcome = false;
+  const bool both_immediate = lhs.GetKind() == OperandKind::Imm &&
+                              rhs.GetKind() == OperandKind::Imm;
+  if (both_immediate) {
+    outcome = EvalIntCompare(code, lhs.GetImm(), rhs.GetImm());
+  } else if (SameExactOperand(lhs, rhs)) {
+    // x <op> x: only the reflexive relations hold.
+    outcome = code == CondCode::EQ || code == CondCode::LE ||
+              code == CondCode::GE;
+  } else {
+    return false;
+  }
+
+  // The result is computed before `inst` is overwritten because lhs/rhs
+  // refer to operands stored inside `inst`.
+  inst = MakeCopyLike(inst, MachineOperand::Imm(outcome ? 1 : 0));
+  return true;
+}
+
 bool SimplifyCondBr(MachineInstr& inst) {
  auto& operands = inst.GetOperands();
  if (operands.size() < 3) {
@ -415,6 +638,7 @@ bool SimplifyInstruction(MachineInstr& inst) {
    case MachineInstr::Opcode::Sub:
    case MachineInstr::Opcode::Mul:
    case MachineInstr::Opcode::Div:
+    case MachineInstr::Opcode::Rem:
    case MachineInstr::Opcode::And:
    case MachineInstr::Opcode::Or:
    case MachineInstr::Opcode::Xor:
@ -422,6 +646,8 @@ bool SimplifyInstruction(MachineInstr& inst) {
    case MachineInstr::Opcode::AShr:
    case MachineInstr::Opcode::LShr:
      return SimplifyIntegerBinary(inst);
+    case MachineInstr::Opcode::ICmp:
+      return SimplifyICmp(inst);
    case MachineInstr::Opcode::CondBr:
      return SimplifyCondBr(inst);
    default:
@ -535,6 +761,10 @@ CFGInfo BuildCFG(const MachineFunction& function) {
  for (std::size_t i = 0; i < blocks.size(); ++i) {
    const auto& instructions = blocks[i]->GetInstructions();
    if (instructions.empty()) {
+      if (i + 1 < blocks.size() && blocks[i + 1]) {
+        cfg.successors[i].push_back(static_cast<int>(i + 1));
+        cfg.predecessors[i + 1].push_back(static_cast<int>(i));
+      }
      continue;
    }
    const auto& terminator = instructions.back();
@ -545,6 +775,10 @@ CFGInfo BuildCFG(const MachineFunction& function) {
               terminator.GetOperands().size() >= 3) {
      add_edge(static_cast<int>(i), terminator.GetOperands()[1].GetText());
      add_edge(static_cast<int>(i), terminator.GetOperands()[2].GetText());
+    } else if (!terminator.IsTerminator() && i + 1 < blocks.size() &&
+               blocks[i + 1]) {
+      cfg.successors[i].push_back(static_cast<int>(i + 1));
+      cfg.predecessors[i + 1].push_back(static_cast<int>(i));
    }

    auto& succs = cfg.successors[i];
@ -576,8 +810,33 @@ bool SameBaseObject(const AddressKey& lhs, const AddressKey& rhs) {
  return false;
 }

+// Size used for address-range overlap checks: whatever GetValueSize reports
+// for `type`, except that a non-positive size is treated as 1 so every
+// access covers a non-empty range.
+int AccessSize(ValueType type) {
+  const int reported = GetValueSize(type);
+  if (reported <= 0) {
+    return 1;
+  }
+  return reported;
+}
+
+// True when the address key carries no scaled-vreg terms, i.e. its
+// `scaled_vregs` collection is empty.
+bool HasConstantOffsetOnly(const AddressKey& key) {
+  const bool has_scaled_terms = !key.scaled_vregs.empty();
+  return !has_scaled_terms;
+}
+
+// Decides whether two accesses have to be treated as overlapping.
+//
+// Note: despite the "Must" in the name, the answer is conservative. It is
+// false only when the accesses are on different base objects (per
+// SameBaseObject) or when both offsets are constant and the ranges
+// [const_offset, const_offset + AccessSize(type)) are disjoint. If either
+// address has scaled-vreg terms the function returns true.
+bool MustOverlapAddressRange(const AddressKey& lhs, ValueType lhs_type,
+                             const AddressKey& rhs, ValueType rhs_type) {
+  if (!SameBaseObject(lhs, rhs)) {
+    return false;
+  }
+
+  const bool both_constant =
+      HasConstantOffsetOnly(lhs) && HasConstantOffsetOnly(rhs);
+  if (!both_constant) {
+    // Offsets cannot be compared, so assume the ranges collide.
+    return true;
+  }
+
+  const auto lhs_first = lhs.const_offset;
+  const auto lhs_limit = lhs_first + AccessSize(lhs_type);
+  const auto rhs_first = rhs.const_offset;
+  const auto rhs_limit = rhs_first + AccessSize(rhs_type);
+
+  // Half-open intervals are disjoint when one starts at or after the other's end.
+  const bool disjoint = lhs_first >= rhs_limit || rhs_first >= lhs_limit;
+  return !disjoint;
+}
+
 void InvalidateMemoryState(std::unordered_map<AddressKey, MemoryState, AddressKeyHash>& states,
-                           const AddressKey* store_key) {
+                           const AddressKey* store_key,
+                           ValueType store_type = ValueType::Void) {
  if (store_key == nullptr) {
    states.clear();
    return;
@ -587,9 +846,14 @@ void InvalidateMemoryState(std::unordered_map<AddressKey, MemoryState, AddressKe
    states.clear();
    return;
  }
+  const bool precise_memory =
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_MIR_PRECISE_MEMORY");

  for (auto it = states.begin(); it != states.end();) {
-    if (it->first.base_kind == AddrBaseKind::VReg || SameBaseObject(it->first, *store_key)) {
+    if (it->first.base_kind == AddrBaseKind::VReg ||
+        (precise_memory
+             ? MustOverlapAddressRange(it->first, it->second.type, *store_key, store_type)
+             : SameBaseObject(it->first, *store_key))) {
      it = states.erase(it);
      continue;
    }
@ -659,7 +923,7 @@ bool TryOptimizeMemoryInstruction(
    removed[static_cast<std::size_t>(existing->second.pending_store_index)] = true;
  }

-  InvalidateMemoryState(states, &key);
+  InvalidateMemoryState(states, &key, inst.GetValueType());
  states[key] = {value, inst.GetValueType(), static_cast<int>(current_index)};
  return false;
 }
@ -683,7 +947,7 @@ void ApplyMemoryDataflowInstruction(const MachineModule& module, const MachineIn

  const AddressKey key = MakeAddressKey(inst.GetAddress());
  if (inst.GetOpcode() == MachineInstr::Opcode::Store) {
-    InvalidateMemoryState(states, &key);
+    InvalidateMemoryState(states, &key, inst.GetValueType());
    states[key] = {inst.GetOperands()[0], inst.GetValueType(), -1};
    return;
  }
@ -699,6 +963,13 @@ MemoryMap SimulateBlockMemory(const MachineModule& module, const MachineBasicBlo
 }

 bool CombineBitTestMasks(std::vector<MachineInstr>& instructions);
+bool CombineMultiplyAccumulate(std::vector<MachineInstr>& instructions,
+                               const std::unordered_map<int, int>& function_use_counts,
+                               bool allow_live_range_changes);
+bool HasAssignedAllocations(const MachineFunction& function);
+std::unordered_map<int, int> CountFunctionUsesWithReplacement(
+    const MachineFunction& function, const MachineBasicBlock& replaced_block,
+    const std::vector<MachineInstr>& replacement);

 bool RunPeepholeOnBlock(const MachineModule& module, const MachineFunction& function,
                        MachineBasicBlock& block, const MemoryMap& in_state) {
@ -765,6 +1036,10 @@ bool RunPeepholeOnBlock(const MachineModule& module, const MachineFunction& func
    changed = true;
  }
  changed |= CombineBitTestMasks(compacted);
+  const auto function_use_counts =
+      CountFunctionUsesWithReplacement(function, block, compacted);
+  changed |= CombineMultiplyAccumulate(compacted, function_use_counts,
+                                       !HasAssignedAllocations(function));
  if (changed) {
    block.GetInstructions() = std::move(compacted);
  }
@ -780,6 +1055,8 @@ bool IsSideEffectFree(const MachineInstr& inst) {
    case MachineInstr::Opcode::Add:
    case MachineInstr::Opcode::Sub:
    case MachineInstr::Opcode::Mul:
+    case MachineInstr::Opcode::MAdd:
+    case MachineInstr::Opcode::MSub:
    case MachineInstr::Opcode::Div:
    case MachineInstr::Opcode::Rem:
    case MachineInstr::Opcode::ModMul:
@ -799,6 +1076,7 @@ bool IsSideEffectFree(const MachineInstr& inst) {
    case MachineInstr::Opcode::FNeg:
    case MachineInstr::Opcode::ICmp:
    case MachineInstr::Opcode::FCmp:
+    case MachineInstr::Opcode::CSelect:
    case MachineInstr::Opcode::ZExt:
    case MachineInstr::Opcode::ItoF:
    case MachineInstr::Opcode::FtoI:
@ -827,6 +1105,28 @@ std::unordered_map<int, int> CountBlockUses(const std::vector<MachineInstr>& ins
  return counts;
 }

+// Counts, per vreg number reported by MachineInstr::GetUses(), how many uses
+// occur across every block of `function`. For `replaced_block` the
+// instructions in `replacement` are scanned instead of the ones currently
+// stored in the block. Null block entries are skipped.
+std::unordered_map<int, int> CountFunctionUsesWithReplacement(
+    const MachineFunction& function, const MachineBasicBlock& replaced_block,
+    const std::vector<MachineInstr>& replacement) {
+  std::unordered_map<int, int> totals;
+
+  for (const auto& candidate : function.GetBlocks()) {
+    const bool is_replaced = candidate.get() == &replaced_block;
+    if (!is_replaced && !candidate) {
+      continue;
+    }
+
+    const std::vector<MachineInstr>& scanned =
+        is_replaced ? replacement : candidate->GetInstructions();
+    for (const auto& inst : scanned) {
+      for (int vreg : inst.GetUses()) {
+        ++totals[vreg];
+      }
+    }
+  }
+
+  return totals;
+}
+
 bool GetDefVReg(const MachineInstr& inst, int* out) {
  const auto defs = inst.GetDefs();
  if (defs.size() != 1) {
@ -917,6 +1217,123 @@ bool CombineBitTestMasks(std::vector<MachineInstr>& instructions) {
  return changed;
 }

+// Checks whether `operand` is a vreg produced by a Mul that can be folded
+// into the instruction at `user_index`.
+//
+// All of the following must hold:
+//   * `operand` is a vreg used exactly once in this block (`use_counts`)
+//     and exactly once in the whole function (`function_use_counts`);
+//   * `def_index` records a defining instruction located before
+//     `user_index` that has not already been marked in `removed`;
+//   * that instruction is a Mul with at least three operands.
+// On success the producer's position is written to `*mul_index` (when the
+// pointer is non-null) and true is returned; `*mul_index` is left untouched
+// on failure.
+bool MatchMulOperand(const std::unordered_map<int, std::size_t>& def_index,
+                     const std::unordered_map<int, int>& use_counts,
+                     const std::unordered_map<int, int>& function_use_counts,
+                     const std::vector<bool>& removed,
+                     const MachineOperand& operand,
+                     const std::vector<MachineInstr>& instructions,
+                     std::size_t user_index,
+                     std::size_t* mul_index) {
+  if (operand.GetKind() != OperandKind::VReg) {
+    return false;
+  }
+  const int vreg = operand.GetVReg();
+
+  const auto used_exactly_once =
+      [vreg](const std::unordered_map<int, int>& counts) {
+        const auto found = counts.find(vreg);
+        return found != counts.end() && found->second == 1;
+      };
+  if (!used_exactly_once(use_counts) ||
+      !used_exactly_once(function_use_counts)) {
+    return false;
+  }
+
+  const auto def_entry = def_index.find(vreg);
+  if (def_entry == def_index.end()) {
+    return false;
+  }
+  const std::size_t producer_index = def_entry->second;
+  if (producer_index >= user_index || removed[producer_index]) {
+    return false;
+  }
+
+  const auto& producer = instructions[producer_index];
+  const bool is_binary_mul =
+      producer.GetOpcode() == MachineInstr::Opcode::Mul &&
+      producer.GetOperands().size() >= 3;
+  if (!is_binary_mul) {
+    return false;
+  }
+
+  if (mul_index != nullptr) {
+    *mul_index = producer_index;
+  }
+  return true;
+}
+
+// Fuses `Mul` + `Add`/`Sub` pairs inside one instruction list into
+// MAdd/MSub:
+//   t = mul a, b ; d = add t, c   (or add c, t)  ->  d = MAdd a, b, c
+//   t = mul a, b ; d = sub c, t                  ->  d = MSub a, b, c
+// A multiply is only consumed when MatchMulOperand accepts it (single use in
+// the block and in the function, defined earlier in this list, not already
+// consumed). Consumed multiplies are dropped from `instructions`.
+//
+// The pass does nothing when `allow_live_range_changes` is false or when
+// utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_MIR_MADD") reports disabled.
+// Returns true when at least one pair was fused.
+//
+// NOTE(review): the fused instruction reads the multiply's operands at the
+// position of the add/sub. This presumably relies on those vregs not being
+// redefined between the Mul and its user — confirm against the MIR form
+// this pass runs on.
+bool CombineMultiplyAccumulate(std::vector<MachineInstr>& instructions,
+                               const std::unordered_map<int, int>& function_use_counts,
+                               bool allow_live_range_changes) {
+  if (!allow_live_range_changes ||
+      !utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_MIR_MADD")) {
+    return false;
+  }
+
+  const std::size_t count = instructions.size();
+  const auto block_use_counts = CountBlockUses(instructions);
+
+  // Position of the (last) single-def instruction for each vreg in the list.
+  std::unordered_map<int, std::size_t> def_position;
+  for (std::size_t idx = 0; idx < count; ++idx) {
+    int defined = -1;
+    if (GetDefVReg(instructions[idx], &defined)) {
+      def_position[defined] = idx;
+    }
+  }
+
+  std::vector<bool> consumed(count, false);
+  bool any_fused = false;
+
+  for (std::size_t idx = 0; idx < count; ++idx) {
+    MachineInstr& user = instructions[idx];
+    const auto opcode = user.GetOpcode();
+    const bool is_add = opcode == MachineInstr::Opcode::Add;
+    const bool is_sub = opcode == MachineInstr::Opcode::Sub;
+    if (!is_add && !is_sub) {
+      continue;
+    }
+    if (user.GetOperands().size() < 3 ||
+        user.GetOperands()[0].GetKind() != OperandKind::VReg) {
+      continue;
+    }
+
+    const auto fed_by_mul = [&](const MachineOperand& candidate,
+                                std::size_t* producer) {
+      return MatchMulOperand(def_position, block_use_counts,
+                             function_use_counts, consumed, candidate,
+                             instructions, idx, producer);
+    };
+
+    // Add may take the product on either side; Sub only as the subtrahend
+    // (operand 2). `accumulator_slot` stays 0 when nothing matched.
+    std::size_t mul_at = 0;
+    std::size_t accumulator_slot = 0;
+    if (is_add && fed_by_mul(user.GetOperands()[1], &mul_at)) {
+      accumulator_slot = 2;
+    } else if (fed_by_mul(user.GetOperands()[2], &mul_at)) {
+      accumulator_slot = 1;
+    }
+    if (accumulator_slot == 0) {
+      continue;
+    }
+
+    // Copy the accumulator out before `user` is overwritten.
+    MachineOperand accumulator = user.GetOperands()[accumulator_slot];
+    const MachineInstr& mul = instructions[mul_at];
+    user = MachineInstr(
+        is_add ? MachineInstr::Opcode::MAdd : MachineInstr::Opcode::MSub,
+        {user.GetOperands()[0], mul.GetOperands()[1], mul.GetOperands()[2],
+         accumulator});
+    consumed[mul_at] = true;
+    any_fused = true;
+  }
+
+  if (!any_fused) {
+    return false;
+  }
+
+  // Rebuild the list without the multiplies that were folded away.
+  std::vector<MachineInstr> survivors;
+  survivors.reserve(count);
+  for (std::size_t idx = 0; idx < count; ++idx) {
+    if (consumed[idx]) {
+      continue;
+    }
+    survivors.push_back(std::move(instructions[idx]));
+  }
+  instructions = std::move(survivors);
+  return true;
+}
+
 bool RunDeadInstrElimination(MachineFunction& function) {
  bool changed = false;

--- a/src/mir/passes/SpillReduction.cpp
+++ b/src/mir/passes/SpillReduction.cpp
@ -1,4 +1,4 @@
-#include "mir/MIR.h"
+#include "mir/Passes.h"

 #include <unordered_map>
 #include <utility>
@ -127,6 +127,8 @@ bool RewriteUses(MachineInstr& inst, const std::unordered_map<int, int>& rename_
    case MachineInstr::Opcode::Add:
    case MachineInstr::Opcode::Sub:
    case MachineInstr::Opcode::Mul:
+    case MachineInstr::Opcode::MAdd:
+    case MachineInstr::Opcode::MSub:
    case MachineInstr::Opcode::Div:
    case MachineInstr::Opcode::Rem:
    case MachineInstr::Opcode::ModMul:
@ -144,12 +146,26 @@ bool RewriteUses(MachineInstr& inst, const std::unordered_map<int, int>& rename_
    case MachineInstr::Opcode::FDiv:
    case MachineInstr::Opcode::ICmp:
    case MachineInstr::Opcode::FCmp:
+    case MachineInstr::Opcode::CSelect:
      if (operands.size() >= 2) {
        changed |= RewriteMappedOperand(operands[1], rename_map);
      }
      if (operands.size() >= 3) {
        changed |= RewriteMappedOperand(operands[2], rename_map);
      }
+      if (inst.GetOpcode() == MachineInstr::Opcode::CSelect) {
+        if (operands.size() >= 4) {
+          changed |= RewriteMappedOperand(operands[3], rename_map);
+        }
+        if (operands.size() >= 5) {
+          changed |= RewriteMappedOperand(operands[4], rename_map);
+        }
+      }
+      if ((inst.GetOpcode() == MachineInstr::Opcode::MAdd ||
+           inst.GetOpcode() == MachineInstr::Opcode::MSub) &&
+          operands.size() >= 4) {
+        changed |= RewriteMappedOperand(operands[3], rename_map);
+      }
      break;
    case MachineInstr::Opcode::CondBr:
      if (!operands.empty()) {
--- a/study_materials/00_prerequisites.md
+++ b/study_materials/00_prerequisites.md
@ -0,0 +1,210 @@
+# 00 前置知识
+
+这一章面向“学过编译原理，但没有实际改过编译器代码”的读者。重点不是重新讲教材，而是告诉你：教材里的概念在这个项目中对应哪些文件、哪些类、哪些函数。
+
+阅读时不要追求一次看懂全部 C++ 细节。先建立概念映射：
+
+```text
+教材概念 -> 项目目录 -> 关键类/函数 -> 一个样例输出中的表现
+```
+
+例如你知道 SSA 和 phi，但第一次看 `Mem2Reg.cpp` 时，只需要先看懂它为什么要插 phi、为什么沿支配树重命名，不需要立刻掌握每个迭代器和容器操作。
+
+最低掌握目标：
+
+```text
+能把“语义分析、IR、SSA、支配树、循环、alias、寄存器分配”这些词对应到本项目的文件。
+```
+
+## 编译器基本阶段
+
+一个典型编译器分为前端、中端、后端三层。
+
+```text
+前端：源码 -> token -> AST -> 语义检查
+中端：AST -> IR -> 优化后的 IR
+后端：IR -> 机器相关 MIR -> 寄存器分配 -> 汇编
+```
+
+本项目对应关系：
+
+- 前端：`src/antlr4/SysY.g4`、`src/frontend/`、`src/sem/`
+- IR 生成：`src/irgen/`
+- IR 基础设施：`include/ir/IR.h`、`src/ir/`
+- IR 优化：`src/ir/passes/`
+- 后端 MIR：`include/mir/MIR.h`、`src/mir/`
+- 后端优化：`src/mir/passes/`
+- 驱动入口：`src/main.cpp`
+
+## SysY 语言背景
+
+SysY 是类 C 的教学语言。你需要关注这些特性：
+
+- 基本类型：`int`、`float`、`void`
+- 标量和数组变量
+- 常量定义和常量表达式
+- 函数定义、函数调用、参数传递
+- `if`、`while`、`break`、`continue`、`return`
+- 短路逻辑：`&&` 和 `||`
+- 库函数：`getint`、`getfloat`、`putint`、`putfloat`、`starttime`、`stoptime` 等
+
+答辩中常见追问：数组初始化、短路求值、作用域遮蔽、函数参数数组退化、常量折叠在哪里做。
+
+## AST、语义分析和符号表
+
+AST 是语法结构树。语法正确不代表语义正确，例如变量未定义、重复定义、函数参数不匹配都属于语义阶段处理。
+
+符号表通常要支持：
+
+- 多级作用域
+- 变量、常量、函数的不同符号类型
+- 类型信息和维度信息
+- 常量初值
+- 函数副作用信息
+
+本项目里相关代码主要在：
+
+- `include/sem/Sema.h`
+- `include/sem/SymbolTable.h`
+- `src/sem/Sema.cpp`
+- `src/sem/ConstEval.cpp`
+- `src/sem/SymbolTable.cpp`
+
+## IR 和 SSA
+
+IR 是中间表示。一个好的 IR 能让优化和后端生成更容易。
+
+SSA（静态单赋值）的核心规则是每个变量在程序文本中只有一处定义；这是静态意义上的“一次”，循环中的同一条定义在运行时仍可能执行多次。控制流合流处用 `phi` 合并不同来源的值：
+
+```llvm
+while.cond:
+  %i = phi i32 [0, %entry], [%i.next, %while.body]
+  %cmp = icmp slt i32 %i, %n
+```
+
+本项目 IR 是接近 LLVM 风格的自研 IR：
+
+- `Value` 表示一切能被使用的值。
+- `User` 表示使用其他 `Value` 的对象。
+- `Use` 维护 use-def 链。
+- `Instruction` 是具体指令。
+- `BasicBlock` 是基本块。
+- `Function` 是函数。
+- `Module` 是整个程序。
+
+## CFG、基本块和支配关系
+
+CFG 是控制流图。节点是基本块，边是跳转。
+
+支配关系定义：如果从入口到 B 的所有路径都必须经过 A，则 A 支配 B。
+
+为什么重要：
+
+- Mem2Reg 需要支配树来插入 `phi`。
+- GVN/CSE 需要知道一个值是否支配所有使用点。
+- LICM 需要确认 hoist 后的指令在所有路径上可用。
+- 循环分析需要用回边识别自然循环。
+
+本项目相关代码：
+
+- `src/ir/analysis/DominatorTree.cpp`
+- `src/ir/analysis/LoopInfo.cpp`
+- `include/ir/Analysis.h`
+
+## 数据流分析
+
+数据流分析通过在 CFG 上传播集合或映射来回答问题。
+
+常见方向：
+
+- 前向分析：可达定义、可用表达式、常量传播
+- 后向分析：活跃变量、死代码消除
+
+本项目中典型例子：
+
+- `LoadStoreElim.cpp` 维护可用内存值。
+- `GVN.cpp` 维护全局值编号。
+- `DCE.cpp` 根据 use-def 删除无用指令。
+- `RegAlloc.cpp` 需要活跃区间和冲突关系。
+
+## Alias / Mod-Ref 分析
+
+Alias 分析回答两个指针是否可能指向同一位置。
+
+Mod-Ref 分析回答一条指令是否会读写某个内存区域。
+
+这类分析决定内存优化能否安全执行：
+
+- 如果 `load A[i]` 和 `store B[j]` 一定不 alias，则 store 不会破坏 load 的值。
+- 如果函数调用可能写全局或参数内存，就不能随便把调用前后的 load 合并。
+
+本项目相关代码：
+
+- `src/ir/passes/MemoryUtils.h`
+- `src/ir/passes/LoopMemoryUtils.h`
+- `src/ir/passes/LoadStoreElim.cpp`
+- `src/ir/passes/LICM.cpp`
+- `src/ir/passes/LoopMemoryPromotion.cpp`
+
+## 常见中端优化
+
+- 常量传播：把已知常量沿 use-def 传播。
+- 常量折叠：把 `2 + 3` 直接变成 `5`。
+- CSE：消除同一基本块内重复表达式。
+- GVN：跨基本块消除等价表达式。
+- DCE：删除无副作用且结果不用的指令。
+- CFGSimplify：删除不可达块、合并空块、简化跳转。
+- Mem2Reg：把栈变量提升到 SSA 寄存器值。
+- LICM：把循环不变计算移到循环外。
+- LoopUnroll：循环展开，减少分支并暴露更多优化机会。
+- LoopStrengthReduction：把循环内乘法地址计算变成递推加法。
+- LoopUnswitch：把循环内不变条件移到循环外。
+- LoopFission：把一个循环拆成多个循环，降低依赖和寄存器压力。
+- Function Inlining：把小函数调用展开到调用点。
+- Tail Recursion Elim：把尾递归改成循环。
+
+## 后端和 AArch64 基础
+
+后端目标是生成能在 AArch64 上运行的汇编。
+
+你需要知道：
+
+- 整数寄存器：`x0-x30`，32 位视图是 `w0-w30`
+- 浮点/SIMD 寄存器：`v0-v31`，标量 float 常用 `s0-s31`
+- 参数传递：整数/指针常用 `x0-x7`，浮点常用 `s0-s7`
+- 返回值：整数/指针常用 `x0`，浮点常用 `s0`
+- 调用者保存和被调用者保存寄存器要区分
+- 栈帧需要保存 `x29`/`x30`、callee-saved 寄存器和 spill slot
+
+常见 AArch64 指令：
+
+- `add/sub/mul/sdiv/msub`：整数算术（AArch64 没有整数取余指令，取余通常用 `sdiv` 求商后再用 `msub` 计算 `a - (a / b) * b`）
+- `fadd/fsub/fmul/fdiv`：浮点算术
+- `ldr/str`：访存
+- `ldp/stp`：成对访存
+- `cmp/b.eq/b.ne/b.lt/...`：比较和跳转
+- `csel`：条件选择
+- `madd/msub`：乘加/乘减融合
+
+本项目后端相关：
+
+- `src/mir/Lowering.cpp`
+- `src/mir/RegAlloc.cpp`
+- `src/mir/FrameLowering.cpp`
+- `src/mir/AsmPrinter.cpp`
+- `src/mir/passes/Peephole.cpp`
+- `src/mir/passes/SpillReduction.cpp`
+
+## 性能优化基本方法
+
+比赛优化不要只看单个样例的比例，要看总时间收益。
+
+建议流程：
+
+1. 先保证全量正确。
+2. 建立 GCC baseline。
+3. 用 speedup 升序找最差比例。
+4. 用 our time 降序找最大总耗时。
+5. 优先优化“总损失秒数大”的样例。
+6. 用 `analyze_case.sh` 看 IR 和汇编，确认优化是否真的命中。
+7. 做局部修改后只跑相关样例，不要每次跑全量。
--- a/study_materials/01_project_architecture.md
+++ b/study_materials/01_project_architecture.md
@ -0,0 +1,162 @@
+# 01 项目整体结构
+
+本章适合先建立“地图”，不要逐行读实现。你只需要先知道每个目录负责什么、阶段之间传递什么对象、主流程从哪里进入。
+
+如果没有工程编码经验，建议按下面方式阅读：
+
+```text
+先看 src/main.cpp 的阶段调用。
+再看 include/ 下的核心数据结构声明。
+最后才看 src/ 下每个阶段的具体实现。
+```
+
+最低掌握目标：
+
+```text
+看到一个文件路径时，能判断它属于前端、语义、IRGen、中端、MIR、寄存器分配还是汇编输出。
+```
+
+## 端到端流程
+
+入口在 `src/main.cpp`。当前默认逻辑可以概括为：
+
+```text
+ParseCLI
+  -> ParseFileWithAntlr
+  -> RunSema
+  -> GenerateIR
+  -> RunIRPassPipeline
+  -> LowerToMIR
+  -> RunMIRPreRegAllocPassPipeline
+  -> RunRegAlloc
+  -> RunMIRPostRegAllocPassPipeline
+  -> RunFrameLowering
+  -> PrintAsm
+```
+
+不同命令行选项会截断输出阶段：
+
+- `--emit-parse-tree`：输出语法树。
+- `--emit-ir`：输出优化后的 IR。
+- `--emit-asm`：输出 AArch64 汇编。
+- 不显式指定时默认输出 IR。
+
+## 根目录职责
+
+- `src/`：主要实现代码。
+- `include/`：公共头文件。
+- `doc/`：实验说明和阶段性总结文档。
+- `Reference/`：课程或比赛参考资料。
+- `scripts/`：构建、验证、性能分析脚本。
+- `sylib/`：SysY 运行时库。
+- `test/`：测试用例、输出、测试结果目录。
+- `build*`：不同构建目录，不应该作为源码理解重点。
+- `output/`：测试日志、baseline、分析产物。
+
+## 源码层级
+
+```text
+src/antlr4      语法文件
+src/frontend    ANTLR 驱动、语法树打印
+src/sem         语义分析、符号表、常量求值
+src/irgen       AST 到 IR
+src/ir          IR 数据结构、打印、分析
+src/ir/passes   中端优化
+src/mir         MIR、指令选择、寄存器分配、汇编输出
+src/mir/passes  后端优化
+src/utils       CLI、日志工具
+```
+
+## 数据结构主线
+
+前端阶段的数据主线：
+
+```text
+SysYParser::CompUnitContext
+  -> SemaResult / SymbolTable / ConstEval
+  -> ir::Module
+```
+
+中端阶段的数据主线：
+
+```text
+ir::Module
+  -> ir::Function
+  -> ir::BasicBlock
+  -> ir::Instruction
+  -> ir::Value / ir::Use
+```
+
+后端阶段的数据主线：
+
+```text
+mir::MachineModule
+  -> mir::MachineFunction
+  -> mir::MachineBasicBlock
+  -> mir::MachineInstr
+  -> mir::Register / Operand
+```
+
+## 构建方式
+
+Release 构建：
+
+```bash
+cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build -j "$(nproc)"
+```
+
+常用单样例：
+
+```bash
+./build/bin/compiler --emit-ir test/test_case/functional/00_main.sy
+./build/bin/compiler --emit-asm test/test_case/functional/00_main.sy
+```
+
+后端完整链路验证：
+
+```bash
+./scripts/verify_asm.sh test/test_case/functional/00_main.sy test/test_result/function/asm --run
+```
+
+## Pass 组织方式
+
+IR pass 统一入口：
+
+- `include/ir/PassManager.h`
+- `src/ir/passes/PassManager.cpp`
+
+MIR pass 统一入口：
+
+- `include/mir/MIR.h`
+- `src/mir/passes/PassManager.cpp`
+
+当前 IR pass pipeline 不是只跑一遍，而是有迭代：
+
+- 先 `Mem2Reg`
+- 再可选 `TailRecursionElim`
+- 循环执行常量、GVN、LoadStore、CSE、CFG、LICM、循环优化等
+- 对大函数使用 size guard 限制迭代和增长型 pass，避免编译时间爆炸
+
+## 环境变量开关
+
+当前项目保留了一些调试开关：
+
+- `NUDTC_DISABLE_MEM2REG`
+- `NUDTC_DISABLE_LOOP_MEM_PROMOTION`
+- `NUDTC_DISABLE_CFG_INLINE`
+- `NUDTC_DISABLE_LOOP_UNSWITCH`
+- `NUDTC_DISABLE_TAIL_RECURSION`
+- `NUDTC_DISABLE_IR_SIZE_GUARD`
+- `NUDTC_DISABLE_MIR_SPILL_REDUCTION`
+- `NUDTC_DISABLE_MIR_CFG_CLEANUP`
+
+答辩时可以说明：这些开关主要用于定位 pass 是否导致正确性或性能问题。
+
+## 当前项目特点
+
+- 前端使用 ANTLR，降低词法语法实现成本。
+- IR 是自研的 SSA-like IR，不直接依赖 LLVM。
+- 中端 pass 数量较多，是当前性能优化主战场。
+- 后端是自研 AArch64 后端，包含 MIR、寄存器分配、栈帧和汇编打印。
+- 测试脚本已经支持 GCC baseline、speedup 计算、单样例分析和全量计时。
--- a/study_materials/02_frontend_sema_irgen.md
+++ b/study_materials/02_frontend_sema_irgen.md
@ -0,0 +1,170 @@
+# 02 前端、语义分析和 IR 生成
+
+本章面向知道“词法/语法/语义/中间代码”概念，但没有写过前端代码的读者。阅读时重点看数据如何流动：ANTLR 语法树如何经过语义分析，再变成 IR。
+
+不需要一开始掌握 ANTLR 的所有接口。先抓住三件事：
+
+```text
+语法树提供结构。
+语义分析补类型、作用域、常量和函数信息。
+IRGen 把语句和表达式翻译成 BasicBlock 和 Instruction。
+```
+
+最低掌握目标：
+
+```text
+能解释一个变量定义、一个 if/while、一个函数调用分别在哪里被检查，在哪里被翻译成 IR。
+```
+
+## ANTLR 语法前端
+
+语法定义在：
+
+- `src/antlr4/SysY.g4`
+
+前端驱动在：
+
+- `include/frontend/AntlrDriver.h`
+- `src/frontend/AntlrDriver.cpp`
+- `include/frontend/SyntaxTreePrinter.h`
+- `src/frontend/SyntaxTreePrinter.cpp`
+
+前端主要职责：
+
+- 读取 `.sy` 文件。
+- 调用 ANTLR lexer/parser。
+- 生成 `SysYParser::CompUnitContext`。
+- 在 `--emit-parse-tree` 时打印语法树。
+
+答辩要点：语法树由 ANTLR 生成，但后续语义和代码生成是项目自己的实现。
+
+## 语义分析
+
+语义分析入口：
+
+- `include/sem/Sema.h`
+- `src/sem/Sema.cpp`
+
+相关支持：
+
+- `include/sem/SymbolTable.h`
+- `src/sem/SymbolTable.cpp`
+- `src/sem/ConstEval.cpp`
+
+语义分析需要解决的问题：
+
+- 变量和函数是否定义。
+- 是否重复定义。
+- 作用域是否正确。
+- `break` / `continue` 是否处于循环中。
+- `return` 类型是否匹配。
+- 函数调用实参与形参是否匹配。
+- 数组维度和初始化是否合法。
+- 常量表达式是否可在编译期求值。
+- 函数副作用信息，用于后续 pure/readonly/inline 等优化。
+
+## 符号表
+
+符号表需要支持嵌套作用域。典型操作：
+
+- 进入作用域。
+- 离开作用域。
+- 插入符号。
+- 从当前作用域向外查找符号。
+
+当前代码中你需要重点理解：
+
+- 变量符号和函数符号如何区分。
+- 常量值如何保存。
+- 数组维度如何保存。
+- 函数参数信息如何保存。
+- 语义信息如何传递给 IRGen。
+
+## 常量求值
+
+常量求值用于：
+
+- `const int a = 1 + 2;`
+- 数组维度。
+- 全局常量初始化。
+- 编译期能确定的数组初始化压缩。
+
+常量求值通常必须保守：
+
+- 只允许无副作用表达式。
+- 不能把运行期输入当成常量。
+- 浮点常量需要注意精度和输出格式。
+
+## IR 生成文件分工
+
+IR 生成代码在 `src/irgen/`：
+
+- `IRGenDriver.cpp`：整体驱动和入口。
+- `IRGenDecl.cpp`：变量、常量、数组、全局定义。
+- `IRGenExp.cpp`：表达式、算术、比较、逻辑、函数调用。
+- `IRGenFunc.cpp`：函数定义、参数、返回。
+- `IRGenStmt.cpp`：语句、控制流、循环、break/continue。
+
+头文件：
+
+- `include/irgen/IRGen.h`
+
+## 表达式生成
+
+表达式通常生成一个 `ir::Value*`。
+
+需要区分：
+
+- 左值：能取地址，例如变量、数组元素。
+- 右值：表达式计算结果。
+- 整数、浮点、布尔转换。
+- 常量和运行期值。
+
+数组访问通常生成 `getelementptr`，再根据上下文决定 load 或 store。
+
+## 短路逻辑
+
+`&&` 和 `||` 不能简单翻译成普通二元运算，因为它们有短路语义。
+
+例如：
+
+```c
+if (a != 0 && b / a > 1) { ... }
+```
+
+如果 `a == 0`，右侧不能执行。IRGen 需要生成分支基本块，并在必要时使用 `phi` 合并结果。
+
+## 控制流生成
+
+`if` 通常生成：
+
+```text
+cond block
+  -> then block
+  -> else block
+  -> merge block
+```
+
+`while` 通常生成：
+
+```text
+preheader/current
+  -> cond block
+  -> body block
+  -> cond block
+  -> exit block
+```
+
+`break` 跳到 exit，`continue` 跳到 cond 或 latch。`IRGen` 中的 loop context 用来记录这些目标块。
+
+## 数组初始化
+
+数组初始化是前端和 IRGen 的难点之一：
+
+- 多维数组要按线性顺序展开。
+- 局部数组可能用 store 初始化。
+- 全局数组需要生成全局初值。
+- 大型全零数组应压缩为零初始化，避免编译期内存浪费。
+- 显式初始化但全零的大数组也应走零初始化压缩。
+
+答辩时如果被问到数组相关 bug，可以从“维度展开、默认零填充、全局/局部差异、大数组压缩”四点说明。
--- a/study_materials/03_ir_core_and_analyses.md
+++ b/study_materials/03_ir_core_and_analyses.md
@ -0,0 +1,173 @@
+# 03 IR 核心和分析
+
+本章是从“编译原理理论”过渡到“真实代码结构”的关键。你应该把每个理论概念都对应到 C++ 类，而不是只背概念。
+
+阅读建议：
+
+```text
+先读 include/ir/IR.h 看类之间的关系。
+再读 src/ir/IRPrinter.cpp 看这些对象如何被打印出来。
+最后用 --emit-ir 输出一个小样例，对照代码和文本 IR。
+```
+
+最低掌握目标：
+
+```text
+看到一段 IR 时，能指出哪些是 Value、Instruction、BasicBlock、Function、phi、terminator。
+```
+
+## IR 核心文件
+
+头文件：
+
+- `include/ir/IR.h`
+- `include/ir/Analysis.h`
+- `include/ir/PassManager.h`
+- `include/ir/utils.h`
+
+实现：
+
+- `src/ir/Value.cpp`
+- `src/ir/Instruction.cpp`
+- `src/ir/BasicBlock.cpp`
+- `src/ir/Function.cpp`
+- `src/ir/Module.cpp`
+- `src/ir/Type.cpp`
+- `src/ir/Context.cpp`
+- `src/ir/IRBuilder.cpp`
+- `src/ir/IRPrinter.cpp`
+- `src/ir/analysis/DominatorTree.cpp`
+- `src/ir/analysis/LoopInfo.cpp`
+
+## Value / User / Use
+
+`Value` 是 IR 的基础类。常量、参数、指令、函数都可以是 Value。
+
+`User` 表示使用其他 Value 的对象。Instruction 通常是 User。
+
+`Use` 维护“谁使用了我”的反向关系。它支持：
+
+- `ReplaceAllUsesWith`
+- DCE 判断指令是否仍被使用
+- GVN/CSE 替换等价值
+
+答辩时可以说：Use-Def 链是 SSA IR 做优化的基础设施。
+
+## Type
+
+类型系统支持：
+
+- `void`
+- `i1`
+- `i32`
+- `float`
+- `label`
+- `pointer`
+- `array`
+- `function`
+
+数组类型保存元素类型和元素个数。GEP 需要依赖类型来计算索引层级。
+
+## Instruction
+
+主要指令类型包括：
+
+- `AllocaInst`
+- `LoadInst`
+- `StoreInst`
+- `BinaryInst`
+- `UnaryInst`
+- `ReturnInst`
+- `CallInst`
+- `GetElementPtrInst`
+- `PhiInst`
+- `CastInst`
+- `CondBrInst`
+- `UncondBrInst`
+- `MemsetInst`
+
+你需要重点理解 `load/store/gep/phi/branch/call`，它们是优化和后端的核心。
+
+## BasicBlock / Function / Module
+
+`BasicBlock` 保存指令列表、前驱、后继。
+
+`Function` 保存参数、基本块、函数属性和副作用信息。
+
+`Module` 保存函数、全局变量、上下文。
+
+答辩时可以用下面这句话：
+
+```text
+Module 是编译单元，Function 是 CFG，BasicBlock 是顺序指令段，Instruction 通过 Value/User/Use 组成 SSA 数据流。
+```
+
+## IRBuilder 和 IRPrinter
+
+`IRBuilder` 用于创建 IR 指令，避免 IRGen 直接操作所有构造细节。
+
+`IRPrinter` 用于输出类似 LLVM IR 的文本，便于调试：
+
+```bash
+./build/bin/compiler --emit-ir case.sy
+```
+
+## DominatorTree
+
+支配树用于回答：
+
+- A 块是否支配 B 块。
+- A 指令是否支配 B 指令。
+- 某个定义是否在所有使用路径之前。
+
+应用场景：
+
+- Mem2Reg 插入和重命名。
+- GVN 替换值。
+- LICM 移动循环不变代码。
+- LoadStoreElim 判断 store 是否支配 load。
+
+## LoopInfo
+
+`LoopInfo` 基于 CFG 和支配树识别自然循环。
+
+Loop 中重要字段：
+
+- `header`
+- `preheader`
+- `latches`
+- `blocks`
+- `block_list`
+- `exiting_blocks`
+- `exit_blocks`
+- `subloops`
+
+循环优化常用限制：
+
+- 是否有唯一 latch。
+- 是否有 preheader。
+- 是否能识别 induction variable。
+- 循环体是否过大。
+- 是否有调用或未知内存写。
+
+## PassUtils / LoopPassUtils / MemoryUtils
+
+辅助文件：
+
+- `src/ir/passes/PassUtils.h`
+- `src/ir/passes/LoopPassUtils.h`
+- `src/ir/passes/MemoryUtils.h`
+- `src/ir/passes/LoopMemoryUtils.h`
+- `src/ir/passes/MathIdiomUtils.h`
+
+作用：
+
+- 判断值等价。
+- 收集可达块。
+- 判断循环不变值。
+- 分析 GEP 地址。
+- 判断 alias。
+- 识别 induction variable。
+- 识别数学/模运算 idiom。
+
+这些工具决定优化 pass 能否写得保守且可复用。
--- a/study_materials/04_ir_optimization_passes.md
+++ b/study_materials/04_ir_optimization_passes.md
@ -0,0 +1,307 @@
+# 04 IR 中端优化
+
+IR 优化是当前比赛性能的主要来源。本章默认你知道常量传播、DCE、LICM 等教材概念，但还不熟悉它们在工程中如何落地。
+
+阅读每个 pass 时不要先陷入 C++ 细节。先按这个顺序看：
+
+```text
+它要消除什么冗余？
+它遍历 Module、Function、BasicBlock 还是 Loop？
+它依赖哪些分析？
+它的安全 guard 在哪里？
+它改完后谁负责清理？
+```
+
+最低掌握目标：
+
+```text
+能从 PassManager.cpp 说出优化执行顺序，并能挑 3 个 pass 解释“输入、输出、安全条件”。
+```
+
+统一入口在：
+
+- `src/ir/passes/PassManager.cpp`
+
+声明在：
+
+- `include/ir/PassManager.h`
+
+## Pipeline 概览
+
+当前流程大致为：
+
+```text
+Mem2Reg
+TailRecursionElim
+for up to N iterations:
+  FunctionInlining
+  InterproceduralConstProp
+  ArithmeticSimplify
+  ConstProp
+  ConstFold
+  GVN
+  LoadStoreElim
+  CSE
+  IfConversion
+  DCE
+  CFGSimplify
+  LICM
+  LoopMemoryPromotion
+  LoopUnswitch
+  LoopStrengthReduction
+  LoopFission
+  LoopUnroll
+  LoopRepeatReduction
+  cleanup passes again
+```
+
+`N` 会受 size guard 控制，大函数减少迭代，避免编译时间过长。
+
+## Mem2Reg
+
+文件：
+
+- `src/ir/passes/Mem2Reg.cpp`
+
+作用：
+
+- 把可提升的局部变量从 `alloca/load/store` 形式变成 SSA 值。
+- 在控制流合流处插入 `phi`。
+- 大幅降低后续优化难度。
+
+风险：
+
+- 指针逃逸的 alloca 不能提升。
+- 数组和地址被传出的对象不能简单提升。
+
+## 常量和算术简化
+
+文件：
+
+- `src/ir/passes/ConstProp.cpp`
+- `src/ir/passes/ConstFold.cpp`
+- `src/ir/passes/ArithmeticSimplify.cpp`
+- `src/ir/passes/InterproceduralConstProp.cpp`
+
+优化内容：
+
+- 常量传播。
+- 常量折叠。
+- 代数恒等式。
+- 函数参数常量特化。
+- 常见模乘/模幂 idiom 的简化基础。
+
+典型例子：
+
+```text
+x + 0 -> x
+x * 1 -> x
+x * 0 -> 0
+if (true) -> direct branch
+```
+
+## CSE 和 GVN
+
+文件：
+
+- `src/ir/passes/CSE.cpp`
+- `src/ir/passes/GVN.cpp`
+
+CSE 通常偏局部，GVN 支持跨块。
+
+它们依赖：
+
+- 表达式等价判断。
+- 支配关系。
+- 指令是否无副作用。
+
+收益：
+
+- 消除重复地址计算。
+- 消除重复算术。
+- 为后端减少指令数量和寄存器压力。
+
+## LoadStoreElim
+
+文件：
+
+- `src/ir/passes/LoadStoreElim.cpp`
+
+作用：
+
+- 跨基本块传播已知内存值。
+- store-to-load forwarding。
+- 删除被覆盖且未观察的死 store。
+- 保守处理 call、memset 和未知 alias。
+- 当前新增了循环 preheader store 到循环内 load 的安全转发。
+
+当前新增优化的核心思想：
+
+```text
+preheader:
+  store v, B[i][k]
+
+loop j = i + 1 .. n:
+  x = load B[i][k]
+  store ..., B[j][k]
+
+如果能证明 B[j][k] 和 B[i][k] 不 alias，则 x 可以替换为 v。
+```
+
+这个优化命中三角矩阵类循环，典型收益来自 `h-11-01`。
+
+## IfConversion
+
+文件：
+
+- `src/ir/passes/IfConversion.cpp`
+
+作用：
+
+- 把小型分支结构转成 select-like 形式。
+- 为后端 `csel` 提供机会。
+
+收益：
+
+- 降低分支开销。
+- 对 min/max/clamp 这类小函数和条件赋值有帮助。
+
+风险：
+
+- 不能错误执行有副作用分支。
+- 浮点和整数条件选择要区分。
+
+## DCE 和 CFGSimplify
+
+文件：
+
+- `src/ir/passes/DCE.cpp`
+- `src/ir/passes/CFGSimplify.cpp`
+
+DCE 删除无副作用且结果不用的指令。
+
+CFGSimplify 处理：
+
+- 删除不可达块。
+- 简化常量条件跳转。
+- 合并空块。
+- 清理无用 `phi`。
+
+这两个 pass 通常在其他优化之后反复运行。
+
+## Inline
+
+文件：
+
+- `src/ir/passes/Inline.cpp`
+
+作用：
+
+- 小函数内联。
+- 结合副作用分析决定是否内联。
+- 内联后暴露常量传播、GVN、LICM 机会。
+
+收益典型来自：
+
+- min/max/clamp 小函数。
+- wrapper 函数。
+- 热循环中的小函数调用。
+
+风险：
+
+- 代码体积膨胀。
+- 编译时间增加。
+- 过度内联可能增加寄存器压力。
+
+## TailRecursionElim
+
+文件：
+
+- `src/ir/passes/TailRecursionElim.cpp`
+
+作用：
+
+- 把尾递归调用改写成循环。
+- 避免大量函数调用和栈增长。
+
+适合：
+
+- 带累加器参数的阶乘、递归搜索的尾调用变体（递归调用的结果被直接 return）。
+- `h-1-01`、`65_color` 这类可能出现尾递归结构的样例。
+
+## LICM
+
+文件：
+
+- `src/ir/passes/LICM.cpp`
+
+作用：
+
+- 把循环不变计算移动到 preheader。
+- 对 load 需要依赖 alias/mod-ref 判断。
+
+能 hoist 的条件：
+
+- 操作数循环不变。
+- 指令无副作用。
+- 对 load，要证明循环内没有可能写同一地址的 store/call。
+
+## LoopMemoryPromotion
+
+文件：
+
+- `src/ir/passes/LoopMemoryPromotion.cpp`
+
+作用：
+
+- 把循环中反复访问的内存位置提升为标量。
+- 循环结束再写回。
+
+适合：
+
+- 累加器。
+- 循环内反复 load/store 同一个数组元素。
+- 支配循环入口的 reaching store 场景。
+
+## LoopStrengthReduction
+
+文件：
+
+- `src/ir/passes/LoopStrengthReduction.cpp`
+
+作用：
+
+- 把循环内地址乘法变成指针递推。
+
+典型变化：
+
+```text
+addr = base + i * 4096 + k * 4
+```
+
+变成：
+
+```text
+ptr.phi = phi [init], [ptr.next]
+ptr.next = ptr.phi + stride
+```
+
+后端就能生成形如 `add x0, x0, #stride` 的指针自增指令，减少乘法和复杂地址计算。
+
+## LoopUnroll / LoopFission / LoopUnswitch / LoopRepeatReduction
+
+文件：
+
+- `src/ir/passes/LoopUnroll.cpp`
+- `src/ir/passes/LoopFission.cpp`
+- `src/ir/passes/LoopUnswitch.cpp`
+- `src/ir/passes/LoopRepeatReduction.cpp`
+
+用途：
+
+- LoopUnroll：减少分支、暴露指令级并行。
+- LoopFission：拆分循环，降低依赖和寄存器压力。
+- LoopUnswitch：移出循环不变条件。
+- LoopRepeatReduction：识别重复循环模式并简化。
+
+比赛中这些 pass 必须保守。错误的依赖分析会直接导致错答案。
--- a/study_materials/05_backend_mir_and_codegen.md
+++ b/study_materials/05_backend_mir_and_codegen.md
@ -0,0 +1,239 @@
+# 05 后端、MIR 和 AArch64 代码生成
+
+本章面向懂“指令选择、寄存器分配、目标代码生成”概念，但没有写过后端代码的读者。后端比中端更容易被寄存器、ABI、指令格式细节淹没，阅读时先抓住对象生命周期。
+
+主线是：
+
+```text
+IR Value -> MIR vreg -> 物理寄存器或 spill slot -> AArch64 汇编文本
+```
+
+最低掌握目标：
+
+```text
+能解释为什么需要 MIR，为什么需要寄存器分配，为什么 post-RA 优化和 pre-RA 优化不同。
+```
+
+## 后端入口
+
+后端流程在 `src/main.cpp` 中：
+
+```text
+LowerToMIR
+RunMIRPreRegAllocPassPipeline
+RunRegAlloc
+RunMIRPostRegAllocPassPipeline
+RunFrameLowering
+PrintAsm
+```
+
+核心文件：
+
+- `include/mir/MIR.h`
+- `src/mir/Lowering.cpp`
+- `src/mir/RegAlloc.cpp`
+- `src/mir/FrameLowering.cpp`
+- `src/mir/AsmPrinter.cpp`
+- `src/mir/MIRInstr.cpp`
+- `src/mir/Register.cpp`
+
+## MIR 数据结构
+
+MIR 是机器相关中间表示，比 IR 更接近 AArch64。
+
+主要概念：
+
+- `MachineModule`
+- `MachineFunction`
+- `MachineBasicBlock`
+- `MachineInstr`
+- `MachineOperand`
+- 虚拟寄存器
+- 物理寄存器
+- 栈槽
+
+MIR 的价值：
+
+- 指令选择后还能做机器级优化。
+- 寄存器分配在 MIR 上进行。
+- 栈帧和调用约定在 MIR/AsmPrinter 阶段处理。
+
+## Lowering
+
+文件：
+
+- `src/mir/Lowering.cpp`
+
+作用：
+
+- 把 IR 指令翻译成 MIR 指令。
+- 处理整数、浮点、比较、分支、调用、返回。
+- 处理 GEP 和地址计算。
+- 识别部分模式，例如 `madd/msub`、条件选择、小函数选择化等。
+
+关键点：
+
+- IR 是无限虚拟寄存器模型。
+- MIR 要逐步接近有限物理寄存器和 AArch64 指令。
+- 复杂 IR 指令可能需要拆成多条 MIR。
+
+## 寄存器分配
+
+文件：
+
+- `src/mir/RegAlloc.cpp`
+
+目标：
+
+- 把虚拟寄存器映射到物理寄存器。
+- 如果物理寄存器不够，就 spill 到栈。
+
+需要理解：
+
+- 活跃区间。
+- 冲突图或线性扫描思想。
+- caller-saved 和 callee-saved。
+- call-clobber 处理。
+- spill/reload 插入。
+
+当前优化方向：
+
+- 扩展可分配寄存器集合。
+- 更强 copy coalescing。
+- rematerialization。
+- live range split。
+- spill/reload peephole。
+
+答辩时可以说明：寄存器分配是自研后端性能最重要的瓶颈之一。
+
+## FrameLowering
+
+文件：
+
+- `src/mir/FrameLowering.cpp`
+
+作用：
+
+- 计算栈帧大小。
+- 分配 spill slot。
+- 保存和恢复 callee-saved 寄存器。
+- 生成函数 prologue/epilogue。
+
+AArch64 常见结构：
+
+```asm
+stp x29, x30, [sp, #-16]!
+mov x29, sp
+...
+ldp x29, x30, [sp], #16
+ret
+```
+
+实际项目中还要处理更大的局部栈空间、对齐、浮点寄存器保存。
+
+## AsmPrinter
+
+文件：
+
+- `src/mir/AsmPrinter.cpp`
+
+作用：
+
+- 把 MIR 打印成 AArch64 汇编。
+- 输出 `.data`、`.bss`、`.text`。
+- 处理全局变量、常量数组、字符串或零初始化。
+- 做部分物理寄存器级 peephole 和汇编选择。
+
+当前后端优化中，AsmPrinter 可能处理：
+
+- `cbz/cbnz` 分支。
+- `tbz/tbnz` 位测试分支。
+- `madd/msub` 融合。
+- 常数乘除取模 lowering。
+- `ldp/stp` 合并。
+- 栈访存压缩。
+- 条件选择 `csel`。
+
+## MIR Pass
+
+入口：
+
+- `src/mir/passes/PassManager.cpp`
+
+已有 pass：
+
+- `src/mir/passes/Peephole.cpp`
+- `src/mir/passes/SpillReduction.cpp`、`src/mir/passes/AddressHoisting.cpp`
+- `src/mir/passes/CFGCleanup.cpp`
+
+pre-RA pipeline：
+
+```text
+SpillReduction
+AddressHoisting
+Peephole + CFGCleanup 迭代
+```
+
+post-RA pipeline：
+
+```text
+Peephole + CFGCleanup 迭代
+```
+
+## 后端优化理解重点
+
+### Copy 消除
+
+目标是删除：
+
+```asm
+mov x1, x1
+mov x2, x3
+mov x3, x2
+```
+
+但必须保证不破坏活跃值。
+
+### Load/Store pair
+
+AArch64 支持：
+
+```asm
+ldp x1, x2, [sp, #offset]
+stp x1, x2, [sp, #offset]
+```
+
+它能减少指令数和访存调度压力。
+
+### 常数除法和取模
+
+普通 `sdiv` 很慢。对常数除法可以用 magic number、shift、multiply-high 等方式替代。
+
+对 2 的幂：
+
+```text
+x / 8 -> shift with sign correction
+x % 8 -> bit mask with sign handling
+```
+
+### madd/msub
+
+模式：
+
+```text
+a + b * c -> madd
+a - b * c -> msub
+```
+
+收益：
+
+- 减少一条指令。
+- 降低寄存器压力。
+- 更贴合 AArch64 后端。
+
+## 当前后端风险
+
+- 过度 peephole 可能破坏 flags 或寄存器活跃关系。
+- post-RA 优化必须理解物理寄存器和调用约定。
+- `ldp/stp` 合并必须保证地址连续、对齐、无中间 clobber。
+- spill/reload 消除必须保证内存 slot 没被写入改变。
--- a/study_materials/06_testing_and_performance.md
+++ b/study_materials/06_testing_and_performance.md
@ -0,0 +1,191 @@
+# 06 测试、Baseline 和性能分析
+
+本章适合没有实际工程经验的读者建立“怎么证明自己没改坏”的习惯。比赛编译器不是只看理论优化是否成立，还要靠脚本验证端到端正确性和性能。
+
+你不需要先会写复杂脚本，但必须会读脚本输出：
+
+```text
+PASS/FAIL/SKIP 是什么。
+diff 说明哪里错。
+timing.tsv 说明哪个样例慢。
+whole.log 说明完整执行过程。
+```
+
+最低掌握目标：
+
+```text
+能独立运行单样例验证，能解释 speedup = gcc_time / our_time，能根据 loss 判断优先优化哪个样例。
+```
+
+## 正确性优先级
+
+比赛优化必须遵守：
+
+```text
+全量正确 > 大样例正确 > 性能提升 > 编译时间
+```
+
+任何优化如果造成错答案，即使速度快也不能保留。
+
+## 常用脚本
+
+### IR 验证
+
+```bash
+./scripts/verify_ir.sh case.sy output_dir
+```
+
+用于检查 IR 阶段。
+
+### 单样例 ASM 链路验证
+
+```bash
+./scripts/verify_asm.sh test/test_case/functional/00_main.sy test/test_result/function/asm --run
+```
+
+流程：
+
+```text
+源码 -> compiler 生成汇编 -> aarch64-linux-gnu-gcc 链接 -> qemu-aarch64 运行 -> 对比 .out
+```
+
+### 全量 ASM 验证
+
+```bash
+./scripts/verify_asm_all.sh test/test_case
+```
+
+用于全量正确性。
+
+### 全量计时验证
+
+```bash
+./scripts/verify_asm_all_time.sh test/test_case
+```
+
+输出：
+
+- 每个样例是否 PASS。
+- 每个样例运行耗时。
+- 和 GCC baseline 的 speedup。
+- speedup 升序表。
+- our elapsed 降序表。
+
+### GCC baseline
+
+```bash
+./scripts/run_baseline.sh test/test_case
+```
+
+输出通常在：
+
+- `output/baseline/gcc_timing.tsv`
+
+### 单样例分析
+
+```bash
+./scripts/analyze_case.sh test/test_case/h_performance/h-11-01.sy
+```
+
+用途：
+
+- 保存单样例 IR。
+- 保存汇编。
+- 保存运行结果。
+- 对比性能差距。
+
+## 如何读 speedup 表
+
+speedup 定义：
+
+```text
+speedup = gcc_time / our_time
+```
+
+解释：
+
+- `speedup > 1`：我们比 GCC 快。
+- `speedup = 1`：与 GCC 持平。
+- `speedup < 1`：我们比 GCC 慢。
+
+优化优先级不要只看比例。应计算损失秒数：
+
+```text
+loss = our_time - gcc_time
+```
+
+一个自研耗时 0.2s、GCC 耗时 0.04s 的 0.2x 样例虽然比例难看，但总损失只有 0.16s；一个自研耗时 60s、GCC 耗时 52s 的约 0.87x 样例损失 8s，更值得优先优化。
+
+## 最新性能分析模板
+
+建议每次全量测试后记录：
+
+```text
+log path:
+summary:
+build elapsed:
+validation elapsed:
+total elapsed:
+our comparable sum:
+gcc comparable sum:
+overall speedup:
+top losses:
+top gains:
+regressions from previous run:
+```
+
+## 定位性能问题的方法
+
+1. 从 `timing.tsv` 找 top loss。
+2. 用 `analyze_case.sh` 生成 IR 和汇编。
+3. 看 IR 是否还有明显冗余：
+   - 循环内重复 load。
+   - 循环内重复 GEP/mul。
+   - 小函数未内联。
+   - 尾递归未消除。
+   - 常数除法还在。
+4. 看汇编是否有后端问题：
+   - 过多 `ldr/str`。
+   - 过多 spill/reload。
+   - 没有 `madd/msub`。
+   - 没有 `csel`。
+   - 没有 `ldp/stp`。
+   - 分支链太长。
+5. 改一个点后只跑相关样例。
+6. 确认不回退已知敏感样例。
+
+## 已知敏感样例
+
+这些样例适合作为优化后 smoke test：
+
+- `h-4-01/02/03`：min/max/clamp、选择化、内联敏感。
+- `h-11-01`：三角循环、循环内不变 load、地址递推敏感。
+- `h-12-01`：浮点大输出、正确性和性能都敏感。
+- `38_light2d`：曾出现过输出错误，适合回归。
+- `gameoflife-*`：stencil/访存优化敏感。
+- `crypto-*`：常数模乘、模幂、除法 lowering 敏感。
+- `29_long_line`：编译时间和 size guard 敏感。
+
+## 提交前建议验证
+
+轻量验证：
+
+```bash
+cmake --build build -j "$(nproc)"
+./scripts/verify_asm.sh test/test_case/functional/00_main.sy test/test_result/function/asm --run
+./scripts/verify_asm.sh test/test_case/h_functional/38_light2d.sy test/test_result/function/asm --run
+```
+
+性能相关验证：
+
+```bash
+./scripts/verify_asm.sh test/test_case/h_performance/h-11-01.sy test/test_result/function/asm --run
+```
+
+全量验证：
+
+```bash
+./scripts/verify_asm_all_time.sh test/test_case
+```
+
+全量很慢，不应在每个小改动后都跑，但重要提交前必须跑。
--- a/study_materials/07_defense_qa.md
+++ b/study_materials/07_defense_qa.md
@ -0,0 +1,126 @@
+# 07 答辩和验收问答
+
+本章按“懂理论但没有深度编码经历”的答辩场景整理。回答时不要把自己包装成每一行代码都手写过，而要准确说明你理解的架构、关键优化、正确性保证和测试证据。
+
+答辩回答的基本格式：
+
+```text
+先说阶段和文件位置。
+再说核心算法或模式。
+再说安全条件。
+最后说怎么验证。
+```
+
+最低掌握目标：
+
+```text
+被问到一个优化时，能说出它大概在哪个文件、解决什么问题、为什么不会错。
+```
+
+## 项目一句话介绍
+
+这是一个面向 SysY 的自研编译器，使用 ANTLR 完成语法解析，自研语义分析、SSA-like IR、中端优化、MIR 后端和 AArch64 汇编生成，并通过脚本完成源码到汇编再到 qemu 运行的全链路验证。
+
+## 如果问“你们完成了哪些阶段”
+
+建议回答：
+
+```text
+我们完成了从 SysY 源码到 AArch64 汇编的完整链路。前端包括语法树、语义检查、符号表和常量求值；中端包括自研 IR、支配树、循环分析和多轮优化；后端包括 MIR lowering、寄存器分配、栈帧生成、AArch64 汇编打印和机器级 peephole。
+```
+
+## 如果问“为什么不用 LLVM”
+
+建议回答：
+
+```text
+实验和比赛目标是自主实现编译器，所以中端 IR、优化 pass、后端 MIR 和 AArch64 汇编输出都是自研实现。ANTLR 只负责语法解析，后续语义、优化和代码生成都在项目代码中完成。
+```
+
+## 如果问“IR 是什么样的”
+
+建议回答：
+
+```text
+IR 是接近 LLVM 风格的 SSA-like IR。核心是 Value/User/Use 维护 use-def 链，Function 由 BasicBlock 构成 CFG，Instruction 表示 load/store/gep/phi/call/branch 等操作。优化 pass 基于这个 IR 做常量传播、GVN、DCE、LICM、循环优化和内存优化。
+```
+
+## 如果问“Mem2Reg 怎么做”
+
+建议回答：
+
+```text
+Mem2Reg 会识别未逃逸的 alloca，把变量的 load/store 提升成 SSA 值。控制流合流处根据支配边界插入 phi，然后沿支配树重命名定义和使用。数组或地址逃逸的对象不会被提升。
+```
+
+## 如果问“中端优化有哪些”
+
+建议回答：
+
+```text
+基础优化包括常量传播、常量折叠、代数化简、CSE、GVN、DCE 和 CFG 简化。函数级优化包括内联、过程间常量传播和尾递归消除。循环优化包括 LICM、LoopMemoryPromotion、LoopUnswitch、LoopStrengthReduction、LoopFission、LoopUnroll 和重复循环消除。内存优化包括跨块 load/store elimination、store-to-load forwarding 和保守 alias 分析。
+```
+
+## 如果问“怎么保证优化正确”
+
+建议回答：
+
+```text
+所有会移动或删除指令的优化都要求保守条件。比如 LICM 只移动无副作用且操作数循环不变的指令；load hoist 要证明循环内没有可能写同一地址的 store/call；LoadStoreElim 遇到未知地址或未知副作用调用会清空内存状态。每次优化后通过单样例和全量脚本回归。
+```
+
+## 如果问“最近做的 h-11 优化是什么”
+
+建议回答：
+
+```text
+h-11 是三角矩阵类循环。IR 中 preheader 已经把 B[i][k] 计算并 store，内层 j 循环又反复 load B[i][k]。我们在 LoadStoreElim 中加入 loop-aware forwarding，识别 j 从 i+1 单调递增，因此 B[j][k] 不会 alias B[i][k]，可以把循环内 reload 替换成 preheader 中的标量值。这个优化不是按样例名匹配，而是基于 GEP 地址、归纳变量和保守 alias 条件。
+```
+
+## 如果问“后端做了什么”
+
+建议回答：
+
+```text
+后端实现了 IR 到 MIR 的 lowering、AArch64 指令选择、寄存器分配、栈帧生成和汇编打印。优化包括 MIR peephole、CFG cleanup、spill/reload 简化、地址 hoisting、部分 madd/msub、常数除法/取模 lowering、条件选择和访存合并等。
+```
+
+## 如果问“寄存器分配是什么算法”
+
+建议回答：
+
+```text
+当前是自研的图着色寄存器分配实现：基于活跃信息构建冲突图，按 simplify/coalesce/freeze/select-spill/coloring 流程把虚拟寄存器映射到 AArch64 物理寄存器，并在冲突或寄存器不足时插入 spill/reload。后续优化空间主要是 live-range split、rematerialization、更强 copy coalescing 和 call-clobber 精细处理。
+```
+
+## 如果问“性能如何评估”
+
+建议回答：
+
+```text
+我们用脚本先运行 GCC baseline，保存每个样例的基线时间；再运行自研编译器生成的汇编并在 qemu-aarch64 下执行，计算 speedup = gcc_time / our_time。分析时同时看 speedup 升序和 our time 降序，优先优化总损失秒数最大的样例。
+```
+
+## 如果问“为什么有些样例比 GCC 快很多”
+
+建议回答：
+
+```text
+有些样例命中了特定的数学 idiom、循环化简或内联后常量传播，使得运行循环被大幅缩短。这里要强调我们不按文件名特判，而是识别通用 IR 模式。答辩时应该能指出对应 pass 和触发条件。
+```
+
+## 如果问“还有哪些不足”
+
+建议回答：
+
+```text
+主要不足在三个方向。第一，alias/mod-ref 仍偏保守，限制了 LICM 和内存提升。第二，循环优化还没有完整多面体或系统向量化能力，复杂依赖下只能保守处理。第三，后端寄存器分配和指令调度仍可继续增强，特别是 spill 控制、copy coalescing、ldp/stp 调度和 NEON 向量化。
+```
+
+## 答辩注意事项
+
+- 不要说“所有优化都完成了”，要说“已实现一套完整可运行的优化流水线，并在若干热点上继续增强”。
+- 不要把样例特化说成通用优化；如果是模式识别，要讲清楚触发条件。
+- 被问正确性时，优先讲保守条件和测试脚本。
+- 被问性能时，优先讲 baseline、speedup、总损失秒数和具体 IR/ASM 证据。
+- 被问代码位置时，直接对应到文件。
--- a/study_materials/08_file_index.md
+++ b/study_materials/08_file_index.md
@ -0,0 +1,166 @@
+# 08 文件索引
+
+这个文件用于快速定位代码。对没有实际编码经验的读者，它不是让你逐个打开文件背下来，而是作为“查地图”的工具。
+
+建议用法：
+
+```text
+听到一个概念，例如 LICM。
+先在本文件找到对应源文件。
+再打开对应学习资料了解作用。
+最后只读该文件中的核心函数。
+```
+
+最低掌握目标：
+
+```text
+能在 30 秒内从一个编译器术语定位到项目中的相关文件。
+```
+
+## 根目录
+
+- `README.md`：项目基础说明。
+- `CMakeLists.txt`：顶层构建配置。
+- `src/`：所有主要实现。
+- `include/`：公共头文件。
+- `doc/`：实验文档和优化说明。
+- `scripts/`：测试和分析脚本。
+- `sylib/`：运行时库。
+- `test/`：测试用例和测试输出。
+- `Reference/`：参考资料。
+- `study_materials/`：当前学习资料。
+
+## Frontend
+
+- `src/antlr4/SysY.g4`：SysY 语法。
+- `include/frontend/AntlrDriver.h`：ANTLR 解析入口声明。
+- `src/frontend/AntlrDriver.cpp`：ANTLR 解析实现。
+- `include/frontend/SyntaxTreePrinter.h`：语法树打印声明。
+- `src/frontend/SyntaxTreePrinter.cpp`：语法树打印实现。
+
+## Sema
+
+- `include/sem/Sema.h`：语义分析接口和结果结构。
+- `include/sem/SymbolTable.h`：符号表结构。
+- `src/sem/Sema.cpp`：语义分析主逻辑。
+- `src/sem/ConstEval.cpp`：常量表达式求值。
+- `src/sem/SymbolTable.cpp`：符号表实现。
+
+## IRGen
+
+- `include/irgen/IRGen.h`：IR 生成接口。
+- `src/irgen/IRGenDriver.cpp`：IRGen 总驱动。
+- `src/irgen/IRGenDecl.cpp`：声明和初始化。
+- `src/irgen/IRGenExp.cpp`：表达式生成。
+- `src/irgen/IRGenFunc.cpp`：函数生成。
+- `src/irgen/IRGenStmt.cpp`：语句和控制流生成。
+
+## IR Core
+
+- `include/ir/IR.h`：IR 核心类定义。
+- `include/ir/Analysis.h`：支配树和循环信息接口。
+- `include/ir/PassManager.h`：IR pass 声明。
+- `include/ir/utils.h`：IR 辅助工具。
+- `src/ir/Value.cpp`：Value 和 use-def。
+- `src/ir/Instruction.cpp`：指令实现。
+- `src/ir/BasicBlock.cpp`：基本块实现。
+- `src/ir/Function.cpp`：函数实现。
+- `src/ir/Module.cpp`：模块实现。
+- `src/ir/Type.cpp`：类型系统。
+- `src/ir/Context.cpp`：常量池和命名上下文。
+- `src/ir/IRBuilder.cpp`：IR 创建器。
+- `src/ir/IRPrinter.cpp`：IR 打印。
+
+## IR Analysis
+
+- `src/ir/analysis/DominatorTree.cpp`：支配树。
+- `src/ir/analysis/LoopInfo.cpp`：循环识别。
+
+## IR Passes
+
+- `src/ir/passes/PassManager.cpp`：中端优化流水线。
+- `src/ir/passes/Mem2Reg.cpp`：内存到 SSA 寄存器提升。
+- `src/ir/passes/ConstProp.cpp`：常量传播。
+- `src/ir/passes/ConstFold.cpp`：常量折叠。
+- `src/ir/passes/ArithmeticSimplify.cpp`：代数化简。
+- `src/ir/passes/CSE.cpp`：局部公共子表达式消除。
+- `src/ir/passes/GVN.cpp`：全局值编号。
+- `src/ir/passes/LoadStoreElim.cpp`：load/store 消除和 store-to-load forwarding。
+- `src/ir/passes/DCE.cpp`：死代码删除。
+- `src/ir/passes/CFGSimplify.cpp`：CFG 简化。
+- `src/ir/passes/IfConversion.cpp`：小分支选择化。
+- `src/ir/passes/Inline.cpp`：函数内联。
+- `src/ir/passes/InterproceduralConstProp.cpp`：过程间常量传播。
+- `src/ir/passes/TailRecursionElim.cpp`：尾递归消除。
+- `src/ir/passes/LICM.cpp`：循环不变代码外提。
+- `src/ir/passes/LoopMemoryPromotion.cpp`：循环内存标量提升。
+- `src/ir/passes/LoopStrengthReduction.cpp`：强度削弱和地址递推。
+- `src/ir/passes/LoopFission.cpp`：循环拆分。
+- `src/ir/passes/LoopUnroll.cpp`：循环展开。
+- `src/ir/passes/LoopUnswitch.cpp`：循环 unswitch。
+- `src/ir/passes/LoopRepeatReduction.cpp`：重复循环规约。
+- `src/ir/passes/PassUtils.h`：通用 pass 工具。
+- `src/ir/passes/LoopPassUtils.h`：循环 pass 工具。
+- `src/ir/passes/MemoryUtils.h`：内存和 alias 工具。
+- `src/ir/passes/LoopMemoryUtils.h`：循环内存分析工具。
+- `src/ir/passes/MathIdiomUtils.h`：数学模式识别工具。
+
+## MIR Core
+
+- `include/mir/MIR.h`：MIR 核心声明。
+- `include/mir/CodeGen.h`：后端 codegen 阶段入口声明，如 lowering、寄存器分配、栈帧生成、汇编输出。
+- `include/mir/Passes.h`：MIR pass 入口声明，如 peephole、spill reduction、CFG cleanup、address hoisting。
+- `src/mir/Lowering.cpp`：IR 到 MIR。
+- `src/mir/RegAlloc.cpp`：寄存器分配。
+- `src/mir/FrameLowering.cpp`：栈帧生成。
+- `src/mir/AsmPrinter.cpp`：汇编打印。
+- `src/mir/AsmPrinterSupport.cpp`：汇编打印辅助函数，如立即数 lowering、地址访问辅助、copy/move 输出。
+- `src/mir/AsmPrinterSupport.h`：AsmPrinter 辅助函数声明。
+- `src/mir/MIRInstr.cpp`：MIR 指令实现。
+- `src/mir/MIRFunction.cpp`：MachineFunction。
+- `src/mir/MIRBasicBlock.cpp`：MachineBasicBlock。
+- `src/mir/MIRContext.cpp`：MIR 上下文。
+- `src/mir/Register.cpp`：寄存器定义和工具。
+
+## MIR Passes
+
+- `src/mir/passes/PassManager.cpp`：MIR pass 流水线。
+- `src/mir/passes/AddressHoisting.cpp`：地址提升。
+- `src/mir/passes/Peephole.cpp`：机器级 peephole。
+- `src/mir/passes/SpillReduction.cpp`：spill/reload 简化。
+- `src/mir/passes/CFGCleanup.cpp`：MIR CFG 清理。
+
+## Utils
+
+- `include/utils/CLI.h`：命令行选项声明。
+- `include/utils/OptConfig.h`：优化环境变量开关工具。
+- `src/utils/CLI.cpp`：命令行解析。
+- `include/utils/Log.h`：日志和错误工具。
+- `src/utils/Log.cpp`：日志实现。
+
+## Scripts
+
+- `scripts/verify_ir.sh`：IR 验证。
+- `scripts/verify_asm.sh`：单样例 ASM 验证。
+- `scripts/verify_asm_all.sh`：全量 ASM 验证。
+- `scripts/verify_asm_all_time.sh`：全量计时和 speedup。
+- `scripts/run_baseline.sh`：GCC baseline。
+- `scripts/analyze_case.sh`：单样例全流程分析。
+- `scripts/clean_outputs.sh`：清理输出。
+- `scripts/lab1_build_test.sh`：Lab1 构建测试。
+- `scripts/lab2_build_test.sh`：Lab2 构建测试。
+- `scripts/lab3_build_test.sh`：Lab3 构建测试。
+- `scripts/usage.txt`：脚本使用说明。
+
+## Doc
+
+- `doc/Lab1-语法树构建.md`
+- `doc/Lab2-中间表示生成.md`
+- `doc/Lab3-指令选择与汇编生成.md`
+- `doc/Lab4-基本标量优化.md`
+- `doc/Lab5-寄存器分配.md`
+- `doc/Lab6-并行与循环优化.md`
+- `doc/lab4-6.md`
+- `doc/lab3-latest-test-analysis.md`
+- `doc/competition-optimization-round.md`
+- `doc/preliminary-design.md`
--- a/study_materials/09_optimization_inventory.md
+++ b/study_materials/09_optimization_inventory.md
@ -0,0 +1,269 @@
+# 09 优化总清单
+
+本文按当前代码实际存在的优化列出实现位置和实现方式。对没有实际编码经验的读者，它的作用是把“优化名称”翻译成“代码位置 + 触发模式 + 安全条件”。
+
+阅读时不要一次背完整表格。建议按优先级掌握：
+
+```text
+先掌握 IR 内存和循环优化。
+再掌握函数内联、尾递归、math idiom。
+最后掌握后端寄存器分配和 AArch64 peephole。
+```
+
+答辩时可以直接按本文说明：先讲优化在 pipeline 中的位置，再讲触发条件、保守性和收益来源。
+
+注意：本文只把当前代码能定位到的优化写成“已实现”。例如通用 NEON 向量化、完整 loop blocking、完整并行化库接入，在当前代码中没有形成稳定通用 pass，不能作为已完成项答辩。
+
+## 总入口和开关
+
+| 优化层级 | 入口文件 | 实现方式 |
+| --- | --- | --- |
+| IR 优化总 pipeline | `src/ir/passes/PassManager.cpp` | `RunIRPassPipeline` 串联 Mem2Reg、函数级优化、标量优化、内存优化、循环优化和清理 pass。 |
+| IR pass 声明 | `include/ir/PassManager.h` | 统一声明 `RunMem2Reg`、`RunGVN`、`RunLoadStoreElim`、`RunLICM`、`RunLoopUnroll` 等 pass。 |
+| MIR pre-RA pipeline | `src/mir/passes/PassManager.cpp` | 寄存器分配前运行 `SpillReduction`、`AddressHoisting`、`Peephole`、`CFGCleanup`。 |
+| MIR post-RA pipeline | `src/mir/passes/PassManager.cpp` | 寄存器分配后再次运行 `Peephole`、`CFGCleanup`，处理物理寄存器级冗余。 |
+| 后端主流程 | `src/main.cpp` | `LowerToMIR -> RunMIRPreRegAllocPassPipeline -> RunRegAlloc -> RunMIRPostRegAllocPassPipeline -> RunFrameLowering -> PrintAsm`。 |
+
+当前可用调试开关：
+
+| 开关 | 位置 | 作用 |
+| --- | --- | --- |
+| `NUDTC_DISABLE_MEM2REG` | `src/ir/passes/PassManager.cpp` | 关闭 Mem2Reg，用于定位 SSA 提升相关错误。 |
+| `NUDTC_DISABLE_TAIL_RECURSION` | `src/ir/passes/PassManager.cpp` | 关闭尾递归消除。 |
+| `NUDTC_DISABLE_CFG_INLINE` | `src/ir/passes/PassManager.cpp` | 关闭多基本块函数内联，只保留更简单的内联路径。 |
+| `NUDTC_DISABLE_LOOP_MEM_PROMOTION` | `src/ir/passes/PassManager.cpp` | 关闭循环内存提升。 |
+| `NUDTC_DISABLE_LOOP_UNSWITCH` | `src/ir/passes/PassManager.cpp` | 关闭 loop unswitching。 |
+| `NUDTC_DISABLE_IR_SIZE_GUARD` | `src/ir/passes/PassManager.cpp` | 关闭大函数 pass 轮数保护。 |
+| `NUDTC_DISABLE_MIR_SPILL_REDUCTION` | `src/mir/passes/PassManager.cpp` | 关闭 MIR spill 压力降低。 |
+| `NUDTC_DISABLE_MIR_CFG_CLEANUP` | `src/mir/passes/PassManager.cpp` | 关闭 MIR CFG 清理。 |
+| `NUDTC_DISABLE_MIR_PRECISE_MEMORY` | `src/mir/passes/Peephole.cpp` | 关闭 MIR 精确内存转发和死 store 删除，退回更保守的内存失效。 |
+| `NUDTC_DISABLE_MIR_MADD` | `src/mir/passes/Peephole.cpp` | 关闭 MIR 层 `mul + add/sub` 到 `madd/msub` 融合。 |
+| `NUDTC_DISABLE_ASM_MUL_CONST` | `src/mir/AsmPrinter.cpp` | 关闭汇编层常数乘法 strength reduction。 |
+| `NUDTC_DISABLE_ASM_MADD` | `src/mir/AsmPrinter.cpp` | 关闭汇编层兜底 `madd/msub` 融合。 |
+| `NUDTC_DISABLE_ASM_FALLTHROUGH_OPT` | `src/mir/AsmPrinter.cpp` | 关闭汇编层 fallthrough 分支优化。 |
+
+## 编译期和数据布局优化
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| 未初始化全局数组压缩 | `src/irgen/IRGenDecl.cpp` | 全局数组没有 initializer 时，把 IR 全局变量 initializer 置空，后端按 `.zero` 或 `.bss` 输出，避免在 IR 中构造巨大零数组。 |
+| 显式全零全局数组压缩 | `src/irgen/IRGenDecl.cpp` | `IsExplicitZeroInitVal`、`IsExplicitZeroConstInitVal` 判断显式初始化列表是否全零；若全零，和未初始化数组一样压缩为零初始化。 |
+| const 全零数组语义保持 | `src/irgen/IRGenDecl.cpp`、`src/irgen/IRGenExp.cpp` | 对全零 const array 记录 `const_array_all_zero`，常量下标读取时直接返回 0，既减少内存占用又保持常量折叠能力。 |
+| 全局零数据汇编压缩 | `src/mir/AsmPrinter.cpp` | `EmitGlobal` 和 `IsZeroScalarConstant` 检查全零或连续零片段，输出 `.zero` 或放入 `.bss`，不逐元素打印。 |
+| 局部大数组基址物化 | `src/mir/Lowering.cpp` | `ShouldMaterializeAllocaBase` 对大数组 alloca 提前物化 base，避免后续每次访问都重新构造复杂栈地址。 |
+| 大栈对象地址 hoisting | `src/mir/passes/AddressHoisting.cpp` | `IsHoistCandidate` 统计 frame object 使用次数和对象大小；大对象或高频对象在入口块生成一次 `Lea`，后续地址引用改用该虚拟寄存器。 |
+
+## IR 基础标量优化
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| Mem2Reg | `src/ir/passes/Mem2Reg.cpp` | `IsPromotableAlloca` 过滤标量且未逃逸的 alloca；基于支配信息插入 phi；`RenamePromotedAlloca` 沿支配树重命名 load/store。 |
+| 常量传播 | `src/ir/passes/ConstProp.cpp` | 在 IR 层传播已知常量，常量条件分支可被后续 CFGSimplify 消掉。 |
+| 常量折叠 | `src/ir/passes/ConstFold.cpp` | 对常量操作数的算术、比较、转换直接求值，替换为 `Constant`。 |
+| 算术恒等式简化 | `src/ir/passes/ArithmeticSimplify.cpp` | 删除 `x+0`、`x*1` 等冗余形式，并为后续 CSE/GVN 暴露相同表达式。 |
+| 2 的幂取模测试简化 | `src/ir/passes/ArithmeticSimplify.cpp` | `SimplifyPowerOfTwoRemTests` 识别 `x % 2^k == 0` 或 `!= 0`，改写为 `(x & (2^k-1)) == 0`，避免真实除法/取模。 |
+| CSE | `src/ir/passes/CSE.cpp` | 在局部范围内用表达式 key 消除重复计算，适合基本块内地址计算和纯算术。 |
+| GVN | `src/ir/passes/GVN.cpp` | `ExprKey` 给常量、指令 opcode、类型、操作数编号建值编号；沿支配树维护 available 表，跨块替换被支配的重复表达式。 |
+| 纯调用 GVN | `src/ir/passes/GVN.cpp`、`src/ir/passes/MemoryUtils.h` | 对 `memutils::IsPureCall` 判定为纯的函数调用也参与 GVN，前提是无副作用且相同参数。 |
+| DCE | `src/ir/passes/DCE.cpp` | 删除无副作用且结果无人使用的指令；通常跟在 GVN、LoadStoreElim、Inline 后面清理。 |
+| CFGSimplify | `src/ir/passes/CFGSimplify.cpp` | 简化常量分支、删除不可达块、清理退化 phi，给下一轮优化降低 CFG 复杂度。 |
+| Phi 简化 | `src/ir/passes/PassUtils.h`、`src/ir/passes/CFGSimplify.cpp` | `SimplifyPhiInst` 在所有 incoming 相同或退化时替换 phi。 |
+| IfConversion | `src/ir/passes/IfConversion.cpp` | `TryConvertConditionalAccumulation` 把小型条件累加分支改写成 mask 运算，降低热循环内分支开销。 |
+
+## IR 函数级优化
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| 只读全局标量传播 | `src/ir/passes/InterproceduralConstProp.cpp` | `PropagateReadonlyScalarGlobals` 找到只有 load、无写入的 scalar global，用初始化常量替换 load。 |
+| 函数参数常量特化 | `src/ir/passes/InterproceduralConstProp.cpp` | `PropagateConstantArguments` 如果某个函数形参在所有直接调用点都是同一个标量常量，就在 callee 内替换该参数。 |
+| 单基本块小函数内联 | `src/ir/passes/Inline.cpp` | `AnalyzeInlineCandidate` 建立成本模型；`InlineCallSite` 克隆 callee 指令并用实参替换形参。 |
+| 多基本块小函数内联 | `src/ir/passes/Inline.cpp` | `CanInlineCFGCallSite` 和 `InlineCFGCallSite` 复制 callee CFG，拆分调用块，重连 return 到 continuation，并用 phi 合并返回值。 |
+| 内联成本模型 | `src/ir/passes/Inline.cpp` | 对块数、指令数、return 数、内存操作、嵌套调用、控制流复杂度计分；纯函数、小调用次数函数预算更宽。 |
+| 避免破坏数学 idiom | `src/ir/passes/Inline.cpp`、`src/ir/passes/MathIdiomUtils.h` | 对 pow2 digit extract、Newton sqrt 等可被后端识别的函数形状跳过内联，保留给 lowering 生成专门指令序列。 |
+| 尾递归消除 | `src/ir/passes/TailRecursionElim.cpp` | `MatchTailRecursiveCall` 匹配 return 前的自递归调用；为形参建立入口 phi；把尾调用改写成给 phi 加 incoming 后跳回函数头。 |
+| 调用递归标记刷新 | `src/ir/passes/TailRecursionElim.cpp` | 尾递归改写后重新计算函数递归属性，避免后续内联/分析继续把它当递归调用。 |
+
+## IR 内存和别名优化
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| 精确地址 key | `src/ir/passes/MemoryUtils.h` | `BuildExactAddressKey` 递归拆 GEP，记录 root 和 index components；能区分 local/global/param/readonly global。 |
+| alloca 逃逸分析 | `src/ir/passes/MemoryUtils.h` | `AnalyzeEscapes` 判断局部对象地址是否被传出或参与未知使用；未逃逸对象可做更激进 load/store 优化。 |
+| 保守 alias 判断 | `src/ir/passes/MemoryUtils.h` | `MayAliasConservatively` 对不同 root、常量下标、readonly global 等情况返回 no-alias，否则保守认为可能 alias。 |
+| call mod-ref 判断 | `src/ir/passes/MemoryUtils.h` | `CallMayReadRoot`、`CallMayWriteRoot` 利用函数纯度、readonly 信息和 root 类型决定 call 是否会读写某地址。 |
+| Loop affine 分析 | `src/ir/passes/LoopMemoryUtils.h` | `AnalyzeAffine` 把表达式拆成 `iv_coeff * iv + invariant + constant`，供 LICM、promotion、fission、unroll 判定依赖。 |
+| Loop pointer 分析 | `src/ir/passes/LoopMemoryUtils.h` | `AnalyzePointer` 分析 GEP base、byte offset、是否 invariant address、是否 exact key。 |
+| 循环内存访问收集 | `src/ir/passes/LoopMemoryUtils.h` | `CollectMemoryAccesses` 收集 loop 内 load/store/memset/call，为 LICM、promotion、fission、unroll 做依赖检查。 |
+| 同迭代 alias 判断 | `src/ir/passes/LoopMemoryUtils.h` | `MayAliasSameIteration` 结合 exact key、affine offset 和 root 判断同一次迭代是否可能访问同地址。 |
+| 跨迭代依赖判断 | `src/ir/passes/LoopMemoryUtils.h` | `HasCrossIterationDependence` 保守判断不同迭代的读写是否可能冲突，作为 loop fission/unroll 的安全 guard。 |
+| 跨块 Load/Store Elimination | `src/ir/passes/LoadStoreElim.cpp` | 为每个基本块维护 `MemoryState`，数据流 meet 后进行 store-to-load forwarding 和死 store 删除。 |
+| store-to-load forwarding | `src/ir/passes/LoadStoreElim.cpp` | 如果 load 地址 key 在当前 available map 中有已知值，直接用该值替换 load。 |
+| 死 store 删除 | `src/ir/passes/LoadStoreElim.cpp` | 若某地址上一条 pending store 在被 load/call 观察前被同地址 store 覆盖，则删除旧 store。 |
+| 重复 store 删除 | `src/ir/passes/LoadStoreElim.cpp` | 若同地址连续 store 同一个值，当前 store 可删除。 |
+| alias 失效 | `src/ir/passes/LoadStoreElim.cpp` | 遇到 store/call/memset 时按 exact key 和保守 alias 规则失效相关内存状态，避免错误转发。 |
+| 循环 preheader store 转发 | `src/ir/passes/LoadStoreElim.cpp` | `OptimizeLoopPreheaderStoreForwarding` 查找 preheader 中支配循环的 store，把 loop 内相同地址 load 转成该 store 的值。 |
+| 三角循环 no-alias 特判 | `src/ir/passes/LoadStoreElim.cpp` | `IsTriangularIVNoAlias` 识别 `j = i + c` 这类三角区域访问，证明 `B[i][k]` 和 `B[j][k]` 不 alias，服务于 preheader load forwarding。 |
+
+## IR 循环优化
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| 循环信息分析 | `src/ir/analysis/LoopInfo.cpp`、`include/ir/Analysis.h` | 基于支配关系识别 back edge、header、latch、preheader、exit blocks，给所有 loop pass 提供结构信息。 |
+| LICM 普通不变量外提 | `src/ir/passes/LICM.cpp` | `IsHoistableInstruction` 和 `IsLoopInvariant` 找到循环不变、无副作用指令，移动到 preheader terminator 前。 |
+| LICM load 外提 | `src/ir/passes/LICM.cpp`、`src/ir/passes/LoopMemoryUtils.h` | `IsSafeInvariantLoadToHoist` 证明 loop 内没有可能写同一地址的 store/call 后，把 invariant load 外提。 |
+| LICM load 去重 | `src/ir/passes/LICM.cpp` | `HoistedLoadKey` 记录已外提 load，同一个 exact address 和类型只保留一个。 |
+| LoopMemoryPromotion | `src/ir/passes/LoopMemoryPromotion.cpp` | 把循环内反复 load/store 的标量内存位置提升成 SSA 标量，循环出口再写回。 |
+| promotion candidate 收集 | `src/ir/passes/LoopMemoryPromotion.cpp` | `CollectCandidates` 按 exact address 和类型分组，要求 scalar、地址循环不变、存在 load/store。 |
+| reaching seed store | `src/ir/passes/LoopMemoryPromotion.cpp` | 除了 preheader 直接 seed store，也允许支配循环入口的 reaching store 作为初始值，提高 matmul/spmv/transpose 类命中率。 |
+| promotion 安全检查 | `src/ir/passes/LoopMemoryPromotion.cpp` | `IsSafeToPromoteCandidate` 检查 exit 可写回、call/memset 不破坏、alias 不冲突、无危险跨迭代依赖。 |
+| promotion phi 插入和重命名 | `src/ir/passes/LoopMemoryPromotion.cpp` | `InsertPhiNodes`、`RenameLoadsAndStores` 把 load 替成当前标量值，把 store 改成新标量定义。 |
+| promotion exit store | `src/ir/passes/LoopMemoryPromotion.cpp` | `InsertExitStores` 在循环出口把最终标量写回原地址。 |
+| LoopStrengthReduction 乘法递推 | `src/ir/passes/LoopStrengthReduction.cpp` | `ReduceLoopMultiplications` 识别 `iv * invariant_factor`，生成递推 phi，每轮加固定 step，替换循环内乘法。 |
+| LoopStrengthReduction 地址递推 | `src/ir/passes/LoopStrengthReduction.cpp` | `ReduceLoopAddressing` 识别 GEP 中单个 affine IV 下标，把每轮重新 GEP 改成 pointer phi 递推。 |
+| 地址递推 CSE | `src/ir/passes/LoopStrengthReduction.cpp` | 对同一 base/index/stride 的递推地址复用已生成 phi，避免同一循环内生成多个重复 pointer induction。 |
+| LoopUnswitch | `src/ir/passes/LoopUnswitch.cpp` | `MatchLoopUnswitch` 找 loop 内条件由 loop 外值决定的 condbr；克隆 loop，把 invariant 条件提到 preheader 分流。 |
+| LoopUnswitch guard | `src/ir/passes/LoopUnswitch.cpp` | 只处理小 loop、innermost、无 call/memset/alloca/unreachable、未带 synthetic tag 的安全循环。 |
+| LoopFission | `src/ir/passes/LoopFission.cpp` | 在同一个 canonical loop 内寻找可切分的 payload 指令 cut，把后半段构造成第二个 loop。 |
+| LoopFission 依赖检查 | `src/ir/passes/LoopFission.cpp`、`src/ir/passes/LoopMemoryUtils.h` | `HasScalarDependenceAcrossCut` 和 `HasMemoryDependenceAcrossCut` 保证切分前后没有错误的标量/内存依赖。 |
+| LoopUnroll | `src/ir/passes/LoopUnroll.cpp` | 识别 canonical counted loop，按 body 大小和内存操作选择 factor 2 或 4，构造 unrolled loop 加 tail loop。 |
+| LoopUnroll 安全检查 | `src/ir/passes/LoopUnroll.cpp` | 要求 innermost、无 call/memset/alloca、stride 与 compare 匹配、无危险 loop-carried memory dependence。 |
+| LoopRepeatReduction | `src/ir/passes/LoopRepeatReduction.cpp` | 对无副作用、只重复累加的 counted loop，把循环执行次数压缩为一次，并在 exit 用乘法放大 accumulator。 |
+| Loop pass 大小保护 | `src/ir/passes/PassManager.cpp` | 大函数减少 IR pass 迭代次数，后几轮关闭可能膨胀 CFG 的内联/循环变换，控制 `29_long_line`、`sl*` 类编译时间。 |
+
+## 数学 idiom 和后端专门 lowering
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| pow2 digit extract 识别 | `src/ir/passes/MathIdiomUtils.h` | `IsPow2DigitExtractShape` 识别 `while (i < pos) num /= C; return num % C` 且 `C` 为 2 的幂。 |
+| pow2 digit extract lowering | `src/mir/Lowering.cpp`、`src/mir/AsmPrinter.cpp` | Lowering 生成 `DigitExtractPow2` MIR；AsmPrinter 用移位和 mask 替代循环、除法和取模。 |
+| 递归模乘识别 | `src/mir/Lowering.cpp` | `IsRecursiveModMultiplyIdiom` 识别二分递归形式的 modular multiply，要求参数/取模常量形状稳定且无内存副作用。 |
+| 递归模乘 lowering | `src/mir/Lowering.cpp`、`src/mir/AsmPrinter.cpp` | Lowering 生成 `ModMul` MIR；AsmPrinter 生成直接计算序列，避免递归调用开销。 |
+| 递归模幂识别 | `src/mir/Lowering.cpp` | `IsRecursiveModPowerIdiom` 识别递归 exponentiation by squaring，内部调用已识别的模乘 helper。 |
+| 递归模幂 lowering | `src/mir/Lowering.cpp`、`src/mir/AsmPrinter.cpp` | Lowering 生成 `ModPow` MIR；AsmPrinter 输出迭代 loop，用 `tbz` 检查指数 bit，避免深递归。 |
+| min/max/clamp 小函数选择化 | `src/mir/Lowering.cpp` | `MatchTwoWayIntSelectFunction` 识别三块结构的两路返回小函数，Lowering 生成 `CSelect`。 |
+| CSelect 汇编输出 | `src/mir/AsmPrinter.cpp` | `CSelect` 打印为 compare + `csel`，减少小函数调用和短分支。 |
+| 浮点 min/max 小函数选择化 | `src/mir/Lowering.cpp`、`src/mir/AsmPrinter.cpp` | `MatchTwoWayFloatSelectFunction` 识别 `float` 两路返回 helper，Lowering 生成 `CSelect(F32)`，AsmPrinter 输出 `fcmp + fcsel`，避免内层循环 `bl max`。 |
+| Newton sqrt idiom 保护 | `src/ir/passes/MathIdiomUtils.h`、`src/ir/passes/Inline.cpp` | 识别特定 Newton sqrt 形状，避免被普通内联破坏；当前更多是保护和识别基础，不应夸大为完整数学库优化。 |
+
+## MIR 表示和机器无关后端优化
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| 新增 MIR opcode | `include/mir/MIR.h`、`src/mir/MIRInstr.cpp` | 增加 `MAdd`、`MSub`、`CSelect`、`ModMul`、`ModPow`、`DigitExtractPow2`、`BitTestMask`，使后端能保留高层机器 idiom。 |
+| MIR 指令副作用建模 | `src/mir/MIRInstr.cpp` | 为新增 opcode 标注 def/use、是否可删除、是否读写内存，保证 peephole 和 DCE 不误删。 |
+| pre-RA rematerialization | `src/mir/passes/SpillReduction.cpp` | `CaptureRematerializableDef` 识别立即数 copy 和简单 `Lea`；call 后对使用点重新生成廉价值，减少跨 call 活跃。 |
+| remat 地址重写 | `src/mir/passes/SpillReduction.cpp` | `RewriteMappedAddress` 会同步改写地址表达式中的 vreg，避免只替换普通操作数而漏掉访存地址。 |
+| MIR CFG 清理 | `src/mir/passes/CFGCleanup.cpp` | 跳转链压缩、条件跳转同目标化简、不可达块删除、线性块合并。 |
+| MIR 死指令删除 | `src/mir/passes/Peephole.cpp` | `RunDeadInstrElimination` 删除无副作用且 def vreg 无 use 的机器指令。 |
+
+## 寄存器分配优化
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| 图着色寄存器分配 | `src/mir/RegAlloc.cpp` | 构建冲突图，执行 simplify/coalesce/freeze/select-spill/coloring 流程。 |
+| GPR/FPR 分类别分配 | `src/mir/RegAlloc.cpp` | 对整数寄存器和浮点寄存器分别建图、分别着色，避免类型错误分配。 |
+| 扩展可分配寄存器集合 | `src/mir/RegAlloc.cpp` | GPR 使用 `x8, x13-x15, x19-x28`，FPR 使用 `v19-v31, v8-v15`，降低 spill 概率。 |
+| call-clobber 处理 | `src/mir/RegAlloc.cpp` | `live_across_call_` 标记跨 call 活跃的 vreg，着色时避免放入 caller-saved 寄存器。 |
+| callee-saved 记录 | `src/mir/RegAlloc.cpp`、`src/mir/FrameLowering.cpp` | 若分配到需要保存的物理寄存器，记录到函数 saved sets，FrameLowering 负责 prologue/epilogue 保存恢复。 |
+| copy coalescing | `src/mir/RegAlloc.cpp` | 根据 move list 尝试合并 copy 两端的 vreg，使用 George/Conservative 规则避免增加不可着色风险。 |
+| move 选择评分 | `src/mir/RegAlloc.cpp` | `PickBestMove` 优先处理收益更高、更可能安全合并的 move。 |
+| spill 成本模型 | `src/mir/RegAlloc.cpp` | 按 use/def 次数、循环权重、degree、rematerializable 特性估计 spill priority，尽量 spill 低收益值。 |
+| rematerializable spill 折扣 | `src/mir/RegAlloc.cpp` | 对立即数和简单地址类可重算值降低 spill 成本，因为 reload 可以替换成重新生成。 |
+
+## MIR peephole 和物理寄存器级优化
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| copy alias propagation | `src/mir/passes/Peephole.cpp` | 维护 `AliasMap`，把后续使用改写成 copy 源操作数，压缩 copy 链。 |
+| post-RA 自拷贝删除 | `src/mir/passes/Peephole.cpp` | `SimplifyCopy` 比较解析后的物理寄存器或 spill slot；目标和源相同则删除。 |
+| 地址操作数 alias 重写 | `src/mir/passes/Peephole.cpp` | `RewriteAddress` 同步重写 address base/index 寄存器，避免 copy 消除只处理普通 operand。 |
+| zext 简化 | `src/mir/passes/Peephole.cpp` | `SimplifyZExt` 删除已知布尔或无需扩展的零扩展。 |
+| 整数二元运算立即数折叠 | `src/mir/passes/Peephole.cpp` | `TryFoldIntegerBinaryImmediate` 把常量操作数折到机器立即数字段。 |
+| 整数代数简化 | `src/mir/passes/Peephole.cpp` | `SimplifyIntegerBinary` 处理 `+0`、`-0`、`*1`、`&0` 等机器层冗余。 |
+| 比较简化 | `src/mir/passes/Peephole.cpp` | `SimplifyICmp` 对可静态判断或冗余比较做替换。 |
+| 条件分支简化 | `src/mir/passes/Peephole.cpp` | `SimplifyCondBr` 把常量条件或退化条件跳转改成更简单形式。 |
+| MIR load forwarding | `src/mir/passes/Peephole.cpp` | `TryOptimizeMemoryInstruction` 对 exact address load 使用当前 `MemoryState` 的已知值替换。 |
+| MIR 死 store 删除 | `src/mir/passes/Peephole.cpp` | 对同一地址 pending store 被覆盖且未被观察的情况删除旧 store。 |
+| MIR 重复 store 删除 | `src/mir/passes/Peephole.cpp` | 同地址 store 同值时删除冗余 store。 |
+| MIR 精确内存失效 | `src/mir/passes/Peephole.cpp` | `InvalidateMemoryState` 使用地址范围、base object、call effect 判断哪些 memory state 失效。 |
+| BitTestMask 合并 | `src/mir/passes/Peephole.cpp` | `CombineBitTestMasks` 把 `and bit; icmp zero; zext; sub 0,zext` 合成 `BitTestMask`。 |
+| MAdd/MSub 合并 | `src/mir/passes/Peephole.cpp` | `CombineMultiplyAccumulate` 在 pre-RA 识别单 use 的 `mul` 被 `add/sub` 使用，替换为 `MAdd/MSub`。 |
+
+## AArch64 汇编 lowering 和输出优化
+
+| 优化 | 实现位置 | 怎么实现 |
+| --- | --- | --- |
+| `madd/msub` 输出 | `src/mir/AsmPrinter.cpp` | `MAdd/MSub` 直接打印为 AArch64 `madd`/`msub`，减少乘法加法两条指令。 |
+| 汇编层兜底 MAdd/MSub | `src/mir/AsmPrinter.cpp` | `TryEmitFusedMulAdd` 在打印阶段对相邻 `mul + add/sub` 再做一次融合。 |
+| 常数乘法 lowering | `src/mir/AsmPrinter.cpp` | `TryEmitMulByConstant` 处理 0、1、-1、2 的幂、`2^k-1`、少量 set bits 的常数，用 shift/add/sub 代替 `mul`。 |
+| 常数有符号除法 lowering | `src/mir/AsmPrinter.cpp` | `EmitSignedDivByConstant` 对 1、-1、2 的幂走快速路径；普通常数用 magic number 乘法和移位替代 `sdiv`。 |
+| 常数有符号取模 lowering | `src/mir/AsmPrinter.cpp` | `EmitSignedRemByConstant` 对 1、-1 返回 0；2 的幂用符号修正 mask；普通常数用 div 结果配合 `msub` 得余数。 |
+| add/sub 立即数选择 | `src/mir/AsmPrinter.cpp` | `IsAddSubImm` 判断 AArch64 可编码立即数，能编码时直接打印立即数形式，减少加载常数。 |
+| cbz/cbnz | `src/mir/AsmPrinter.cpp` | `EmitZeroTestBranch` 对和 0 比较的条件分支输出 `cbz/cbnz`。 |
+| tbz/tbnz | `src/mir/AsmPrinter.cpp` | `TryEmitFusedBitTestBranch` 识别 bit test 后接分支，输出 `tbz/tbnz`。 |
+| compare + branch 融合 | `src/mir/AsmPrinter.cpp` | `TryEmitFusedCompareBranch` 在打印 compare 后跟 condbr 时直接输出合适条件跳转，避免多余中间布尔值。 |
+| fallthrough 分支优化 | `src/mir/AsmPrinter.cpp` | 打印基本块时若目标正好是下一块，省略无条件跳转或调整条件分支形态。 |
+| frame 直接访存 | `src/mir/AsmPrinter.cpp` | `TryEmitFrameObjectAccess` 和 `TryEmitDirectMemoryAccess` 尝试直接用 `[sp/fp, #imm]` 访存，避免额外 `add`。 |
+| prologue/epilogue pair | `src/mir/AsmPrinter.cpp` | 保存恢复 `x29/x30`、callee-saved GPR/FPR 时优先用 `stp/ldp` 成对访存。 |
+| 相邻 load/store pair | `src/mir/AsmPrinter.cpp` | `TryEmitLoadStorePair` 对相邻同 base、连续 offset、类型兼容的 load/store 输出 `ldp/stp`。 |
+| 非相邻安全 pair 调度 | `src/mir/AsmPrinter.cpp` | `TryEmitScheduledLoadStorePair` 在中间指令不 clobber 相关寄存器且没有内存屏障时，把可配对访存调度成 `ldp/stp`。 |
+| spill/reload pair | `src/mir/AsmPrinter.cpp` | `TryMaterializeGprUsePair` 对两个 spilled GPR use 尝试用一次 `ldp` materialize。 |
+| BitTestMask 输出 | `src/mir/AsmPrinter.cpp` | `BitTestMask` 输出 `tst` + `csetm`，生成全 1 或 0 mask。 |
+| ModPow 输出 | `src/mir/AsmPrinter.cpp` | `ModPow` 输出迭代 exponentiation by squaring，用 `tbz` 检查指数 bit，并调用内联模乘序列。 |
+| ModMul 输出 | `src/mir/AsmPrinter.cpp` | `ModMul` 输出直接模乘序列，避免递归调用；当前仍可能使用除法求余，后续可继续做 Barrett/magic modulo。 |
+| DigitExtractPow2 输出 | `src/mir/AsmPrinter.cpp` | 对 2 的幂 base，用移位提取 digit，再用 mask 得到余数。 |
+
+## 已经有但不能夸大的部分
+
+| 项目 | 当前状态 | 答辩建议 |
+| --- | --- | --- |
+| 通用 NEON 向量化 | 当前代码未看到稳定通用向量化 pass，也未形成系统 `ld1/st1/fmla` lowering。 | 不要说“已完成 NEON 向量化”；可以说后续方向是 stencil/matmul 的保守向量化。 |
+| 完整 loop blocking | 当前有 unroll、fission、strength reduction、promotion，但没有通用矩阵 blocking pass。 | 不要说“实现了通用 blocking”；可以说已有地址递推和内存提升为 blocking 打基础。 |
+| 自动并行化 | `LoopMemoryUtils.h` 有 `IsLoopParallelizable` 一类依赖判断基础，但没有完整并行库 lowering。 | 不要说“已接入并行库”；可以说后续可接南开/华为并行库。 |
+| 完整 rematerialization | 当前有 cheap immediate/Lea remat 和 spill 成本折扣，但不是完整表达式级 remat。 | 可以说“实现了保守 rematerialization”。 |
+| 完整 alias analysis | 已有 exact key、escape、affine guard，但不是 LLVM 级别 AA。 | 可以说“实现了面向 SysY 数组/GEP 的保守别名分析”。 |
+
+## 按答辩收益排序的讲法
+
+| 优先级 | 应该重点讲的优化 | 为什么值得讲 |
+| --- | --- | --- |
+| 1 | `LoadStoreElim`、`LoopMemoryPromotion`、`LICM load hoist` | 这是当前性能收益最大的中端内存优化，能解释 h_performance 类长耗时样例。 |
+| 2 | `LoopStrengthReduction`、`LoopUnroll`、`LoopFission`、`LoopUnswitch` | 体现你不只是做 peephole，而是在 IR 层做结构化循环优化。 |
+| 3 | 函数内联、尾递归消除、math idiom lowering | 能解释递归、helper 函数、模幂模乘、min/max/clamp 的专项收益。 |
+| 4 | RegAlloc 扩寄存器、call-clobber、copy coalescing、spill reduction | 体现自研后端不是直接模板输出，考虑了真实寄存器压力。 |
+| 5 | AArch64 lowering：madd/msub、常数除法、ldp/stp、cbz/tbz/csel | 最贴近 ARM 赛道，容易被问“你针对 AArch64 做了什么”。 |
+| 6 | size guard 和测试开关 | 说明你考虑了编译时间、稳定性和回归定位，不只是堆优化。 |
+
+## 一个完整例子的解释模板
+
+以循环内反复读取同一地址为例，可以这样答：
+
+```text
+优化入口在 src/ir/passes/PassManager.cpp，LoadStoreElim 在 GVN 后运行。
+它的核心实现是 src/ir/passes/LoadStoreElim.cpp。
+先用 MemoryUtils.h 的 BuildExactAddressKey 给 load/store 建精确地址 key。
+然后做跨基本块 dataflow，维护每个地址当前可用的值和 pending store。
+如果 load 的地址在状态里有可用值，就做 store-to-load forwarding。
+如果一条 store 在被观察前被同地址 store 覆盖，就删掉死 store。
+对循环场景，还加了 preheader store forwarding 和三角循环 no-alias 判断。
+安全性依赖 escape analysis、call mod-ref 和 alias invalidation，所以不会跨未知 call 或可能 alias 的 store 错误转发。
+```
+
+以 AArch64 `madd` 为例，可以这样答：
+
+```text
+优化分两层。
+第一层在 src/mir/passes/Peephole.cpp 的 CombineMultiplyAccumulate，寄存器分配前发现 mul 的唯一使用是 add/sub，就生成 MAdd/MSub MIR。
+第二层在 src/mir/AsmPrinter.cpp，打印 MAdd/MSub 为 AArch64 madd/msub。
+如果 MIR 层错过相邻模式，AsmPrinter 还有 TryEmitFusedMulAdd 做兜底。
+这样能少一条指令，也能降低一个临时寄存器的压力。
+```
+
+以尾递归消除为例，可以这样答：
+
+```text
+优化在 src/ir/passes/TailRecursionElim.cpp。
+它只匹配调用自身且调用结果立即 return 的尾调用。
+实现时给每个函数参数在新入口处建立 phi，首次进入函数使用原始实参，尾调用处把递归实参作为 phi 的新 incoming，然后把 return 改成跳回函数头。
+这样递归调用被改写成循环，不再产生函数调用和栈帧增长。
+```
--- a/study_materials/10_learning_path.md
+++ b/study_materials/10_learning_path.md
@ -0,0 +1,172 @@
+# 10 学习路线和掌握清单
+
+这份路线专门面向“学过编译原理，但没有实际编码操作经验”的读者。目标不是看完所有代码，而是让你能在答辩和后续优化时快速定位问题、解释设计、判断一个优化是否安全。
+
+学习时要接受一个现实：
+
+```text
+一开始看不懂所有 C++ 写法是正常的。
+你首先要看懂控制流、数据结构关系和优化前后 IR/ASM 的变化。
+```
+
+## 第一阶段：跑通全链路
+
+目标：知道一个 `.sy` 文件从输入到汇编输出经过哪些阶段。
+
+必须掌握的命令：
+
+```bash
+cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build -j "$(nproc)"
+./build/bin/compiler --emit-ir test/test_case/functional/simple_add.sy
+./build/bin/compiler --emit-asm test/test_case/functional/simple_add.sy
+./scripts/verify_asm.sh test/test_case/functional/simple_add.sy test/test_result/function/asm --run
+```
+
+需要读的文件：
+
+| 顺序 | 文件 | 目标 |
+| --- | --- | --- |
+| 1 | `src/main.cpp` | 看清主流程如何选择 parse/IR/ASM 输出。 |
+| 2 | `src/frontend/AntlrDriver.cpp` | 理解 ANTLR 如何产出语法树。 |
+| 3 | `src/sem/Sema.cpp` | 理解语义分析如何生成符号和类型信息。 |
+| 4 | `src/irgen/IRGenDriver.cpp` | 理解 AST/Sema 信息如何进入 IRGen。 |
+| 5 | `src/ir/passes/PassManager.cpp` | 理解所有 IR 优化如何串起来。 |
+| 6 | `src/mir/Lowering.cpp` | 理解 IR 如何降到 MIR。 |
+| 7 | `src/mir/AsmPrinter.cpp` | 理解 MIR 如何打印成 AArch64 汇编。 |
+
+完成标准：
+
+```text
+你能从 main.cpp 口头解释：
+源码 -> 解析 -> 语义 -> IRGen -> IR Pass -> MIR Lowering -> MIR Pass（pre-RA）-> RegAlloc -> MIR Pass（post-RA）-> FrameLowering -> ASM。
+```
+
+## 第二阶段：理解 IR 核心
+
+目标：能读懂 `--emit-ir` 输出，并能把 IR 中的一条指令定位回 C++ 类。
+
+必须掌握的概念：
+
+| 概念 | 代码位置 | 你要会说什么 |
+| --- | --- | --- |
+| `Value` | `include/ir/IR.h`、`src/ir/Value.cpp` | 所有可被使用的 IR 对象都是 Value。 |
+| `User/Use` | `include/ir/IR.h`、`src/ir/Value.cpp` | 指令使用其他 Value，use-def 链用于替换和 DCE。 |
+| `Instruction` | `src/ir/Instruction.cpp` | load/store/gep/phi/call/branch 都是 Instruction 子类。 |
+| `BasicBlock` | `src/ir/BasicBlock.cpp` | 基本块维护指令列表、前驱和后继。 |
+| `Function` | `src/ir/Function.cpp` | 函数由基本块组成，是多数 pass 的处理单位。 |
+| `IRBuilder` | `src/ir/IRBuilder.cpp` | 创建指令，保证 parent/block/use-def 关系一致。 |
+
+建议练习：
+
+```text
+1. 对 simple_add.sy 输出 IR。
+2. 找到函数、基本块、ret 指令。
+3. 修改一个简单表达式样例，例如 a+b*c。
+4. 观察 IR 中 binary 指令和 use-def 关系。
+5. 找到对应 Instruction 类和打印逻辑。
+```
+
+## 第三阶段：理解分析框架
+
+目标：知道为什么优化 pass 不能只做文本替换，必须依赖支配、循环和 alias 分析。
+
+重点文件：
+
+| 文件 | 学习重点 |
+| --- | --- |
+| `src/ir/analysis/DominatorTree.cpp` | 支配关系、支配树、支配边界。 |
+| `src/ir/analysis/LoopInfo.cpp` | back edge、loop header、latch、preheader、exit。 |
+| `src/ir/passes/MemoryUtils.h` | exact address key、escape、alias、call mod-ref。 |
+| `src/ir/passes/LoopMemoryUtils.h` | affine、loop memory dependence、parallelizable 判断基础。 |
+
+判断自己是否掌握：
+
+```text
+给你一个 while 循环，你能指出：
+header 是哪个块，latch 是哪个块，preheader 是否存在，哪些 store 可能阻止 load LICM。
+```
+
+## 第四阶段：读懂中端优化
+
+推荐阅读顺序：
+
+| 顺序 | pass | 为什么先读它 |
+| --- | --- | --- |
+| 1 | `DCE.cpp` | 最简单，理解“删除指令必须看副作用和 use”。 |
+| 2 | `CFGSimplify.cpp` | 理解基本块和终结指令修改。 |
+| 3 | `ConstFold.cpp`、`ConstProp.cpp` | 理解 value 替换和常量处理。 |
+| 4 | `Mem2Reg.cpp` | 第一次接触 phi 和支配边界。 |
+| 5 | `GVN.cpp`、`CSE.cpp` | 理解表达式等价和跨块支配。 |
+| 6 | `LoadStoreElim.cpp` | 理解内存状态、alias 失效、数据流 meet。 |
+| 7 | `LICM.cpp` | 理解 loop + dominance + alias 的组合使用。 |
+| 8 | `LoopMemoryPromotion.cpp` | 理解更复杂的循环标量替换。 |
+| 9 | `LoopStrengthReduction.cpp` | 理解归纳变量和地址递推。 |
+| 10 | `LoopUnroll.cpp`、`LoopFission.cpp`、`LoopUnswitch.cpp` | 理解改 CFG 的循环变换。 |
+
+每读一个 pass 都回答四个问题：
+
+```text
+这个 pass 消除什么冗余？
+它依赖什么分析？
+它为什么安全？
+它可能让什么后续 pass 获益？
+```
+
+## 第五阶段：读懂后端
+
+推荐阅读顺序：
+
+| 顺序 | 文件 | 学习重点 |
+| --- | --- | --- |
+| 1 | `include/mir/MIR.h` | MIR 指令、操作数、地址表达式、虚拟寄存器。 |
+| 2 | `src/mir/Lowering.cpp` | IR 指令如何变成 MIR。 |
+| 3 | `src/mir/MIRInstr.cpp` | 每种 MIR 指令的 def/use 和副作用。 |
+| 4 | `src/mir/passes/Peephole.cpp` | 机器级 copy、算术、访存、madd 优化。 |
+| 5 | `src/mir/RegAlloc.cpp` | 活跃、冲突、coalescing、spill、coloring。 |
+| 6 | `src/mir/FrameLowering.cpp` | 栈帧和保存恢复。 |
+| 7 | `src/mir/AsmPrinter.cpp` | AArch64 指令打印和最终 peephole。 |
+
+完成标准：
+
+```text
+你能解释一个 IR add 如何经过 Lowering 变成 MIR Binary，再经过 RegAlloc 获得物理寄存器，最后在 AsmPrinter 输出 add/sub/mov 等汇编。
+```
+
+## 第六阶段：能独立做优化
+
+新增优化前必须做的三件事：
+
+| 步骤 | 目的 |
+| --- | --- |
+| 找样例和 IR/ASM 证据 | 避免凭感觉优化。 |
+| 写安全条件 | 明确什么时候不能优化。 |
+| 加禁用开关或局部 guard | 出错时可以快速回退定位。 |
+
+建议从这些小任务练手：
+
+| 难度 | 任务 | 位置 |
+| --- | --- | --- |
+| 简单 | 添加一种代数化简，例如 `x - x -> 0` | `src/ir/passes/ArithmeticSimplify.cpp` |
+| 简单 | 添加一种 MIR copy/imm peephole | `src/mir/passes/Peephole.cpp` |
+| 中等 | 添加一个 GVN 表达式 opcode | `src/ir/passes/GVN.cpp` |
+| 中等 | 扩展 load/store alias 精度 | `src/ir/passes/MemoryUtils.h` |
+| 较难 | 新增 loop pass 或增强 LoopMemoryPromotion | `src/ir/passes/Loop*.cpp` |
+| 较难 | 改寄存器分配 spill 策略 | `src/mir/RegAlloc.cpp` |
+
+## 答辩前自查
+
+你至少应该能不看资料回答：
+
+```text
+1. 编译器每个阶段的输入输出是什么？
+2. IR 的 Value/User/Use 是什么？
+3. Mem2Reg 为什么需要 phi？
+4. GVN 为什么需要支配树？
+5. LICM hoist load 为什么需要 alias/mod-ref？
+6. LoopMemoryPromotion 和 Mem2Reg 有什么区别？
+7. 后端为什么还需要 MIR peephole？
+8. 寄存器分配如何处理 caller-saved/callee-saved？
+9. 为什么某些优化要有 size guard？
+10. 一个性能回退应该怎么定位？
+```
--- a/study_materials/11_pass_writing_guide.md
+++ b/study_materials/11_pass_writing_guide.md
@ -0,0 +1,179 @@
+# 11 如何安全地写一个优化 Pass
+
+这份文档用于后续继续改编译器。它默认你没有太多工程编码经验，所以重点不是追求复杂技巧，而是建立安全修改流程。
+
+核心原则：
+
+```text
+优化不是“看到模式就替换”，而是“证明替换后语义不变”。
+先写最保守版本，确认正确，再逐步放宽条件。
+```
+
+如果你暂时还不能独立写 pass，也应该能读懂本文中的“安全问题”和“常见坑”，因为这些是答辩时解释正确性的关键。
+
+## 新 pass 的基本模板
+
+一个 IR pass 通常长这样：
+
+```cpp
+bool RunMyPassOnFunction(Function& function) {
+  bool changed = false;
+
+  for (auto& block_ptr : function.GetBasicBlocks()) {
+    BasicBlock* block = block_ptr.get();
+    auto& instructions = block->GetInstructions();
+
+    for (auto it = instructions.begin(); it != instructions.end();) {
+      Instruction* inst = it->get();
+
+      if (!CanOptimize(inst)) {
+        ++it;
+        continue;
+      }
+
+      Value* replacement = BuildReplacement(inst);
+      inst->ReplaceAllUsesWith(replacement);
+      it = instructions.erase(it);
+      changed = true;
+    }
+  }
+
+  return changed;
+}
+```
+
+实际项目里要优先使用现有工具：
+
+| 需求 | 优先使用 |
+| --- | --- |
+| 创建 IR 指令 | `IRBuilder` |
+| 替换所有使用 | `Value::ReplaceAllUsesWith` |
+| 删除无用指令 | 参考 `DCE.cpp`、`LoadStoreElim.cpp` 的 erase 写法 |
+| 简化 phi | `passutils::SimplifyPhiInst` |
+| 循环 preheader | `looputils::EnsurePreheader` |
+| 精确地址 key | `memutils::BuildExactAddressKey` |
+| loop affine | `loopmem::AnalyzeAffine` |
+
+## 写 pass 前先回答的安全问题
+
+| 问题 | 如果答案不确定怎么办 |
+| --- | --- |
+| 被删指令有没有副作用？ | 不删。参考 `Instruction::MayHaveSideEffects` 或已有副作用判断。 |
+| 替换值是否支配所有使用点？ | 不替换，或先用 DominatorTree 验证。 |
+| 是否改变了浮点 NaN、除零、溢出语义？ | 保守跳过。 |
+| load/store 是否可能 alias？ | 保守认为 alias。 |
+| call 是否可能读写内存？ | 保守失效所有相关 memory state。 |
+| 改 CFG 后 phi 是否正确更新？ | 必须逐个 predecessor/incoming 修复。 |
+| 循环变换是否保持 exit 行为？ | 必须处理 exit phi 和 tail loop。 |
+
+## 修改 IR 的常见坑
+
+| 坑 | 典型后果 | 规避方式 |
+| --- | --- | --- |
+| 删除指令前没有替换 use | use-def 悬空，打印或后续 pass 崩溃。 | 先 `ReplaceAllUsesWith`，再 erase。 |
+| 新指令没有插到正确 block | parent 为空或顺序错误。 | 用 `IRBuilder` 或参考同类 pass 插入方式。 |
+| 在遍历 vector/list 时 erase 后继续自增 | 跳过指令或迭代器失效。 | 用 erase 返回的新迭代器继续遍历，erase 之后不要再 `++it`。 |
+| 修改 CFG 后不更新 phi | 错答案或 verifier 类问题。 | 参考 `Inline.cpp`、`LoopFission.cpp`。 |
+| 把 load 当纯指令随便移动 | store/call 可能改变内存。 | 必须经过 alias/mod-ref 证明。 |
+| 循环展开没有 tail | 非整除次数会少执行。 | unrolled loop 后保留 original tail loop。 |
+| 内联递归函数 | 无限膨胀。 | callee 是 caller 或递归标记时跳过。 |
+
+## 修改 MIR 的常见坑
+
+| 坑 | 典型后果 | 规避方式 |
+| --- | --- | --- |
+| post-RA 后还新建虚拟寄存器 | 后续没有再分配，汇编无法打印。 | post-RA 只用已分配的物理寄存器或约定的 scratch 寄存器，不要新建 vreg。 |
+| peephole 破坏 flags | 条件分支结果错误。 | 不移动/删除会影响后续 compare/branch 的指令。 |
+| `ldp/stp` 合并跨过写同一寄存器的指令 | 读到错误值。 | 检查中间指令 def/use/clobber。 |
+| call 后继续假设 caller-saved 值有效 | 随机错答案。 | RegAlloc 和 peephole 都要尊重 call-clobber。 |
+| 删除 store 时没考虑 alias | 输出或内存结果错误。 | 只删除 exact address 且未被观察的 store。 |
+| 使用 scratch 寄存器冲突 | 覆盖程序值。 | 只在 AsmPrinter 局部展开中使用约定 scratch，并确认不跨指令保留。 |
+
+## 给现有 pipeline 添加 pass
+
+IR pass 添加步骤：
+
+```text
+1. 在 include/ir/PassManager.h 声明 RunXxx(Module&)。
+2. 在 src/ir/passes/ 新增 Xxx.cpp。
+3. 在 src/ir/passes/CMakeLists.txt 加入 Xxx.cpp。
+4. 在 src/ir/passes/PassManager.cpp 选择合适位置调用。
+5. 如有风险，增加 NUDTC_DISABLE_XXX 环境变量开关。
+```
+
+MIR pass 添加步骤：
+
+```text
+1. 在 include/mir/MIR.h 声明 RunXxx(MachineModule&)。
+2. 在 src/mir/passes/ 新增 Xxx.cpp。
+3. 在 src/mir/passes/CMakeLists.txt 加入 Xxx.cpp。
+4. 在 src/mir/passes/PassManager.cpp 加入 pre-RA 或 post-RA pipeline。
+5. 明确这个 pass 是 pre-RA 还是 post-RA。
+```
+
+pre-RA 和 post-RA 的区别：
+
+| 阶段 | 可以做什么 | 不能轻易做什么 |
+| --- | --- | --- |
+| pre-RA | 新建 vreg、改写表达式、合并计算、删除冗余。 | 不能依赖最终物理寄存器。 |
+| post-RA | 删除自拷贝、合并物理寄存器级冗余、局部调度。 | 不应新建未分配 vreg，不应破坏 ABI。 |
+
+## 一个安全的优化开发流程
+
+推荐流程：
+
+```text
+1. 选一个明确样例，保存优化前 IR/ASM。
+2. 写出希望消除的具体模式。
+3. 在 pass 中实现最保守版本。
+4. 只跑单样例正确性。
+5. 检查 IR/ASM 是否真的变化。
+6. 跑 3-5 个敏感样例。
+7. 再考虑放宽 guard。
+8. 重要提交前跑全量。
+```
+
+最小验证命令：
+
+```bash
+cmake --build build -j "$(nproc)"
+./scripts/analyze_case.sh test/test_case/h_performance/h-11-01.sy
+./scripts/verify_asm.sh test/test_case/h_functional/38_light2d.sy test/test_result/function/asm --run
+```
+
+## 如何判断一个优化该放在哪里
+
+| 优化类型 | 优先位置 | 原因 |
+| --- | --- | --- |
+| 常量、代数、简单表达式 | IR | IR 更抽象，容易跨后端复用。 |
+| load/store、alias、循环内存 | IR | IR 有 GEP、类型、循环和支配信息。 |
+| 内联、递归消除 | IR | 函数 CFG 和调用关系在 IR 层更清晰。 |
+| 寄存器压力、copy、spill | MIR pre-RA | 需要机器寄存器语义，但还能使用 vreg。 |
+| 自拷贝、物理寄存器冗余 | MIR post-RA | 只有分配后才能看出物理寄存器相同。 |
+| AArch64 指令形式 | AsmPrinter 或 MIR | 例如 `madd`、`csel`、`ldp/stp`、立即数编码。 |
+
+## 优化提交说明模板
+
+每次优化建议写成：
+
+```text
+优化名称：
+实现位置：
+触发模式：
+安全条件：
+主要收益样例：
+验证命令：
+已知限制：
+```
+
+示例：
+
+```text
+优化名称：loop-aware store-to-load forwarding
+实现位置：src/ir/passes/LoadStoreElim.cpp
+触发模式：preheader store 后，loop 内重复 load 同一 exact address
+安全条件：loop 内 store/call/memset 不会 clobber 该地址，三角 IV 访问可证明 no-alias
+主要收益样例：h-11-01
+验证命令：verify_asm.sh h-11-01.sy --run，外加 38_light2d 回归
+已知限制：只处理可构造 exact address key 的 GEP
+```
--- a/study_materials/12_debugging_playbook.md
+++ b/study_materials/12_debugging_playbook.md
@ -0,0 +1,239 @@
+# 12 调试和性能定位手册
+
+这份手册用于出错时快速定位问题。它面向不熟悉工程调试的读者，所以优先使用脚本、环境变量和 IR/ASM 对比，而不是一上来使用复杂 debugger。
+
+基本原则：
+
+```text
+先确认错误阶段。
+再缩小到某个 pass 或后端阶段。
+最后才改代码。
+```
+
+比赛阶段不要凭感觉改，先确定错误发生在哪个阶段。
+
+## 先区分错误类型
+
+| 现象 | 优先怀疑 |
+| --- | --- |
+| 编译器崩溃 | 前端、IRGen、pass 中悬空 use、空指针、CFG 修改错误。 |
+| 汇编失败 | AsmPrinter 输出非法指令、寄存器名错误、栈偏移编码错误。 |
+| 链接失败 | 运行时库、外部函数名、ABI 或脚本路径。 |
+| 运行超时 | 未优化递归/循环、死循环、性能回退。 |
+| 输出全 0 或明显错 | load/store 优化、全局初始化、寄存器分配、call-clobber。 |
+| 只在大样例错 | alias 过激、循环变换、stack offset、ldp/stp 调度。 |
+| 小样例全过但性能差 | 中端没命中、spill 过多、除法/访存/分支未优化。 |
+
+## 单样例定位流程
+
+推荐命令：
+
+```bash
+./scripts/analyze_case.sh test/test_case/h_performance/h-11-01.sy
+```
+
+如果脚本不可用，就手动拆：
+
+```bash
+./build/bin/compiler --emit-ir case.sy > /tmp/case.ir
+./build/bin/compiler --emit-asm case.sy > /tmp/case.s
+aarch64-linux-gnu-gcc /tmp/case.s sylib/sylib.c -static -o /tmp/case.out
+qemu-aarch64 /tmp/case.out < case.in
+```
+
+看三个文件：
+
+| 文件 | 看什么 |
+| --- | --- |
+| IR | 是否还有明显 load/store、call、mul/div、未内联小函数。 |
+| ASM | 是否有过多 `ldr/str`、`sdiv`、`bl`、spill/reload、分支链。 |
+| actual.out diff | 错误是某一行开始偏，还是整体全错。 |
+
+## 用环境变量二分 pass
+
+如果怀疑 IR pass：
+
+```bash
+NUDTC_DISABLE_MEM2REG=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+NUDTC_DISABLE_TAIL_RECURSION=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+NUDTC_DISABLE_CFG_INLINE=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+NUDTC_DISABLE_LOOP_MEM_PROMOTION=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+NUDTC_DISABLE_LOOP_UNSWITCH=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+```
+
+如果怀疑 MIR pass：
+
+```bash
+NUDTC_DISABLE_MIR_SPILL_REDUCTION=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+NUDTC_DISABLE_MIR_CFG_CLEANUP=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+NUDTC_DISABLE_MIR_PRECISE_MEMORY=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+NUDTC_DISABLE_MIR_MADD=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+```
+
+如果怀疑 AsmPrinter peephole：
+
+```bash
+NUDTC_DISABLE_ASM_MUL_CONST=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+NUDTC_DISABLE_ASM_MADD=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+NUDTC_DISABLE_ASM_FALLTHROUGH_OPT=1 ./scripts/verify_asm.sh case.sy test/test_result/function/asm --run
+```
+
+判断方法：
+
+```text
+关闭某个优化后样例恢复正确，说明 bug 大概率在该优化或它暴露出的后续优化里。
+关闭某个优化后仍错误，不代表它一定无关，继续二分相邻 pass。
+```
+
+## 正确性 bug 常见定位点
+
+### load/store 优化导致错
+
+检查文件：
+
+```text
+src/ir/passes/LoadStoreElim.cpp
+src/ir/passes/MemoryUtils.h
+src/mir/passes/Peephole.cpp
+```
+
+重点看：
+
+```text
+1. 是否把可能 alias 的两个地址当成 no-alias。
+2. 是否跨未知 call 保留了 memory state。
+3. 是否删除了会被后续 load 观察的 store。
+4. 是否 memset 后没有失效相关地址。
+5. 是否 exact address key 漏掉某个动态 index。
+```
+
+### 循环优化导致错
+
+检查文件：
+
+```text
+src/ir/passes/LICM.cpp
+src/ir/passes/LoopMemoryPromotion.cpp
+src/ir/passes/LoopStrengthReduction.cpp
+src/ir/passes/LoopUnroll.cpp
+src/ir/passes/LoopFission.cpp
+src/ir/passes/LoopUnswitch.cpp
+```
+
+重点看：
+
+```text
+1. preheader/header/latch/exit 是否识别正确。
+2. exit phi incoming 是否更新。
+3. 归纳变量 stride 和 compare 方向是否匹配。
+4. unroll 后 tail loop 是否覆盖剩余迭代。
+5. fission 前后是否存在标量或内存依赖。
+6. unswitch 是否复制了完整 loop blocks。
+```
+
+### 寄存器分配导致错
+
+检查文件：
+
+```text
+src/mir/RegAlloc.cpp
+src/mir/FrameLowering.cpp
+src/mir/AsmPrinter.cpp
+```
+
+重点看：
+
+```text
+1. 同时活跃的 vreg 是否被分到同一物理寄存器。
+2. 跨 call 活跃值是否误分配到 caller-saved。
+3. callee-saved 是否保存恢复。
+4. spill slot offset 是否正确。
+5. reload 是否插入在 use 前，store spill 是否插入在 def 后。
+```
+
+### AsmPrinter 导致错
+
+重点看：
+
+```text
+1. 立即数是否符合 AArch64 编码范围。
+2. `ldp/stp` offset 是否对齐且可编码。
+3. `w` 和 `x` 寄存器宽度是否混用错误。
+4. `cbz/tbz/csel` 条件是否反了。
+5. scratch 寄存器是否覆盖了仍需使用的值。
+```
+
+## 性能定位流程
+
+先看 speedup 表：
+
+```text
+loss = our_time - gcc_time
+```
+
+优先级：
+
+```text
+总损失秒数大的样例 > 比例难看但总时间很短的样例
+```
+
+然后检查 IR：
+
+| IR 现象 | 可能缺的优化 |
+| --- | --- |
+| 热循环内重复 load 同一地址 | LoadStoreElim、LICM load、LoopMemoryPromotion。 |
+| 热循环内大量 GEP/mul | LoopStrengthReduction、地址递推。 |
+| 小函数在循环内 call | Inline、CSelect、math idiom lowering。 |
+| 递归 call 很深 | TailRecursionElim、math idiom lowering。 |
+| `% 2^k` 还在 | ArithmeticSimplify、Asm 常数取模 lowering。 |
+
+再检查 ASM：
+
+| ASM 现象 | 可能缺的优化 |
+| --- | --- |
+| 大量 `sdiv` | 常数除法/取模 lowering、Barrett/magic modulo。 |
+| 大量 `ldr/str [sp]` | RegAlloc spill、SpillReduction、copy coalescing。 |
+| `mul` 后紧跟 `add/sub` | MAdd/MSub peephole 未命中。 |
+| 连续栈 load/store 没有 `ldp/stp` | pair 合并或调度未命中。 |
+| 短分支很多 | IfConversion、CSelect、cbz/tbz。 |
+| 函数调用很多 | Inline、tail recursion、math idiom。 |
+
+## 回归测试策略
+
+不要每改一次就跑全量，太慢。建议分层：
+
+| 改动类型 | 先跑 |
+| --- | --- |
+| 前端/IRGen | `simple_add`、数组初始化、作用域、const 数组。 |
+| load/store/alias | `38_light2d`、`h-11-01`、`h-4-*`。 |
+| 循环优化 | `matmul*`、`transpose*`、`spmv*`、`gameoflife*`。 |
+| 函数内联/尾递归 | `65_color`、`h-1-*`、`h-4-*`。 |
+| 后端 peephole | `register_alloc`、`many_params`、`nested_calls`、`h_performance` 小集合。 |
+| AsmPrinter | 至少跑一个整数、一个数组、一个浮点、一个函数调用样例。 |
+
+重要提交前：
+
+```bash
+./scripts/verify_asm_all_time.sh test/test_case
+```
+
+## 性能记录模板
+
+每轮优化后建议保存：
+
+```text
+日期：
+commit：
+测试命令：
+whole.log：
+timing.tsv：
+summary：
+build elapsed：
+validation elapsed：
+total elapsed：
+top 10 loss：
+top 10 gain：
+本轮改动：
+是否有正确性风险：
+下一步：
+```
--- a/study_materials/13_aarch64_backend_cheatsheet.md
+++ b/study_materials/13_aarch64_backend_cheatsheet.md
@ -0,0 +1,197 @@
+# 13 AArch64 后端速查
+
+这份资料面向 ARM 赛道答辩和后端优化。它不要求你一开始掌握完整 ARM 手册，只要求你能看懂本项目生成汇编中的常见指令，并知道哪些 AArch64 指令值得优化出来。
+
+阅读方法：
+
+```text
+先看寄存器和调用约定。
+再看 add/sub/mul/ldr/str/b/cmp 这些高频指令。
+最后看 madd/csel/ldp/stp/cbz/tbz 这些优化目标。
+```
+
+最低掌握目标：
+
+```text
+看到一段热点汇编，能指出主要成本来自除法、访存、分支、函数调用还是 spill。
+```
+
+## 寄存器约定
+
+| 寄存器 | 用途 |
+| --- | --- |
+| `x0-x7` / `w0-w7` | 整数参数和返回值。 |
+| `v0-v7` / `s0-s7` / `d0-d7` | 浮点参数和返回值。 |
+| `x8-x18` | 常见 caller-saved 临时寄存器；其中 `x8` 兼作间接返回值地址寄存器，`x16/x17` 是过程内调用 scratch（IP0/IP1），`x18` 平台保留时要谨慎。 |
+| `x19-x28` | callee-saved，函数用到要保存恢复。 |
+| `x29` | frame pointer。 |
+| `x30` | link register。 |
+| `sp` | stack pointer，通常 16 字节对齐。 |
+| `wN` | `xN` 的低 32 位视图，写 `wN` 会清零高 32 位。 |
+
+项目相关位置：
+
+```text
+src/mir/Register.cpp
+src/mir/RegAlloc.cpp
+src/mir/FrameLowering.cpp
+src/mir/AsmPrinter.cpp
+```
+
+## 调用约定要点
+
+函数调用时必须注意：
+
+| 项目 | 规则 |
+| --- | --- |
+| 整数返回值 | `x0/w0`。 |
+| 浮点返回值 | `v0/s0/d0`。 |
+| caller-saved | 调用者如果还需要这些寄存器的值，必须自己保存。 |
+| callee-saved | 被调用者使用后必须恢复。 |
+| 栈对齐 | 调用边界通常保持 16 字节对齐。 |
+
+这也是 `RegAlloc.cpp` 中 `live_across_call_` 重要的原因：跨 call 活跃的虚拟寄存器不能随便分到 caller-saved 物理寄存器。
+
+## 常见整数指令
+
+| 指令 | 含义 | 优化相关 |
+| --- | --- | --- |
+| `add dst, a, b/#imm` | 加法 | 立即数可编码时避免先 mov。 |
+| `sub dst, a, b/#imm` | 减法 | 常用于比较和地址偏移。 |
+| `mul dst, a, b` | 乘法低位 | 可和 add/sub 融成 `madd/msub`。 |
+| `madd dst, a, b, c` | `dst = a*b + c` | 本项目已有 MIR/ASM 融合。 |
+| `msub dst, a, b, c` | `dst = c - a*b` | 对 `%` 余数和乘减有用。 |
+| `sdiv dst, a, b` | 有符号除法 | 很慢，常数除法应 lowering。 |
+| `and/orr/eor` | 位运算 | `% 2^k`、mask、bit test 常用。 |
+| `lsl/lsr/asr` | 移位 | 常数乘除可替代 mul/div。 |
+
+## 分支和条件选择
+
+| 指令 | 含义 | 用法 |
+| --- | --- | --- |
+| `cmp a, b` | 设置条件码 | 后面接 `b.eq`、`csel` 等。 |
+| `b label` | 无条件跳转 | fallthrough 时可省略。 |
+| `b.eq label` | 条件跳转 | compare + branch。 |
+| `cbz x, label` | x 为 0 跳转 | 比 `cmp x,#0; b.eq` 更短。 |
+| `cbnz x, label` | x 非 0 跳转 | 同上。 |
+| `tbz x, #bit, label` | 某 bit 为 0 跳转 | bit test 分支优化。 |
+| `tbnz x, #bit, label` | 某 bit 非 0 跳转 | bit test 分支优化。 |
+| `csel dst, a, b, cond` | 条件选择 | min/max/clamp、小 if 转无分支。 |
+| `cset dst, cond` | 条件置 0/1 | boolean materialize。 |
+| `csetm dst, cond` | 条件置 0/-1 | 生成 mask。 |
+
+项目相关位置：
+
+```text
+src/ir/passes/IfConversion.cpp
+src/mir/Lowering.cpp
+src/mir/passes/Peephole.cpp
+src/mir/AsmPrinter.cpp
+```
+
+## 访存和寻址
+
+常见格式：
+
+```asm
+ldr w0, [x1]
+ldr w0, [x1, #16]
+str w0, [sp, #32]
+ldp x19, x20, [sp, #16]
+stp x19, x20, [sp, #16]
+```
+
+优化重点：
+
+| 优化 | 价值 |
+| --- | --- |
+| 直接 frame offset 访存 | 避免额外 `add` 计算地址。 |
+| `ldp/stp` | 两次相邻访存合成一条，减少指令数。 |
+| 地址递推 | 循环内避免每轮重新 `mul + add`。 |
+| load/store forwarding | 能在 IR 或 MIR 层删掉冗余访存。 |
+| spill/reload pair | 栈上连续 spill 可以用 `ldp/stp` 降低开销。 |
+
+项目相关位置：
+
+```text
+src/ir/passes/LoopStrengthReduction.cpp
+src/ir/passes/LoadStoreElim.cpp
+src/mir/AddressHoisting.cpp
+src/mir/passes/Peephole.cpp
+src/mir/AsmPrinter.cpp
+```
+
+## 浮点指令
+
+| 指令 | 含义 |
+| --- | --- |
+| `fadd` | 浮点加。 |
+| `fsub` | 浮点减。 |
+| `fmul` | 浮点乘。 |
+| `fdiv` | 浮点除。 |
+| `fcmp` | 浮点比较。 |
+| `fcsel` | 浮点条件选择。 |
+
+注意：
+
+```text
+浮点优化比整数更危险，因为 NaN、正负零、舍入误差会影响语义。
+除非 SysY/比赛规则明确允许 fast-math，否则不要随便重排浮点表达式。
+```
+
+## 立即数编码
+
+AArch64 不是所有立即数都能直接编码。
+
+常见可直接编码：
+
+| 指令类型 | 特点 |
+| --- | --- |
+| add/sub immediate | 12 位无符号立即数，可选再左移 12 位。 |
+| logical immediate | 有特定 bitmask 编码规则，不是任意数。 |
+| mov immediate | 可能需要 `movz/movk` 多条构造。 |
+
+项目里相关逻辑在：
+
+```text
+src/mir/AsmPrinter.cpp
+```
+
+答辩时可以说：
+
+```text
+后端在 AsmPrinter 中尝试使用可编码立即数，不能编码时再 materialize 常数，避免无谓 mov。
+```
+
+## 常数除法为什么要优化
+
+`sdiv` 通常比 add/mul/shift 慢得多。对常数除法：
+
+```text
+x / 8    -> 带符号修正的 shift
+x % 8    -> 带符号修正的 mask
+x / C    -> magic number multiply + shift
+x % C    -> x - (x / C) * C，常用 msub
+```
+
+项目相关位置：
+
+```text
+src/mir/AsmPrinter.cpp
+EmitSignedDivByConstant
+EmitSignedRemByConstant
+ComputeSignedDivMagic
+```
+
+## ARM 赛道最应该讲的后端优化
+
+优先讲：
+
+| 优化 | 为什么适合 ARM 赛道 |
+| --- | --- |
+| `madd/msub` | AArch64 有专门融合乘加指令。 |
+| `csel` | ARM 条件选择强，适合小分支。 |
+| `cbz/cbnz/tbz/tbnz` | ARM 有紧凑分支指令。 |
+| `ldp/stp` | AArch64 成对访存对栈和数组访问很重要。 |
+| 常数除法/取模 lowering | 避免慢 `sdiv`。 |
+| caller/callee saved 处理 | 体现 ABI 正确性。 |
--- a/study_materials/14_case_study_playbook.md
+++ b/study_materials/14_case_study_playbook.md
@ -0,0 +1,291 @@
+# 14 热点样例分析模板
+
+这份文档用于把“某个样例慢”转化成“应该看哪段 IR/ASM，应该改哪个 pass”。对没有实际编码经验的读者，它更像一套问诊流程。
+
+不要直接问“我要写什么优化”。先问：
+
+```text
+IR 里有没有重复计算？
+IR 里有没有没消掉的 call/load/store/div？
+ASM 里有没有 sdiv、过多 ldr/str、过多 bl、过多 spill？
+现有哪个 pass 理论上应该处理它？
+```
+
+最低掌握目标：
+
+```text
+能拿到一个慢样例后，写出一页分析报告，而不是直接盲改代码。
+```
+
+## 通用分析步骤
+
+```text
+1. 从 timing.tsv 找到 total loss 最大的样例。
+2. 用 analyze_case.sh 保存该样例 IR 和 ASM。
+3. 先看 IR 是否存在高层冗余。
+4. 再看 ASM 是否存在后端冗余。
+5. 找到一个可泛化模式，而不是按样例名特判。
+6. 实现最保守版本。
+7. 跑该样例和敏感回归样例。
+```
+
+## `h-11-01` 类三角循环
+
+典型现象：
+
+```text
+内层循环反复读取一个在 preheader 已经写入或计算过的地址。
+同时循环里还写相似数组区域，例如 B[j][k]，容易让普通 alias 分析保守失败。
+```
+
+IR 中重点找：
+
+```text
+load B[i][k]
+store B[j][k]
+j = i + 1 ...
+```
+
+对应优化：
+
+| 问题 | 优化位置 |
+| --- | --- |
+| 循环内重复 load | `src/ir/passes/LoadStoreElim.cpp` |
+| load 不敢 hoist | `src/ir/passes/LICM.cpp`、`LoopMemoryUtils.h` |
+| 地址计算重复 | `src/ir/passes/LoopStrengthReduction.cpp` |
+| scalar 内存反复读写 | `src/ir/passes/LoopMemoryPromotion.cpp` |
+
+答辩讲法：
+
+```text
+这不是样例特判，而是识别 preheader store、loop load、三角 IV no-alias 的通用模式。
+```
+
+## `h-4-*` 类 min/max/clamp
+
+典型现象：
+
+```text
+大量小函数或小分支实现 min/max/clamp。
+如果没有内联或选择化，会产生很多 call 或 branch。
+```
+
+IR 中重点找：
+
+```text
+call min/max helper
+if cond return a else return b
+```
+
+ASM 中重点找：
+
+```text
+bl helper
+cmp ...
+b.xx ...
+```
+
+对应优化：
+
+| 问题 | 优化位置 |
+| --- | --- |
+| 小函数 call | `src/ir/passes/Inline.cpp` |
+| 两路返回函数 | `src/mir/Lowering.cpp` 的 `MatchTwoWayIntSelectFunction` |
+| 小分支 | `src/ir/passes/IfConversion.cpp` |
+| ARM 条件选择 | `src/mir/AsmPrinter.cpp` 输出 `csel` |
+
+## `crypto-*` 类模乘/模幂
+
+典型现象：
+
+```text
+递归模乘、递归模幂、常数取模、位测试。
+如果没优化，会有大量 call、sdiv、rem。
+```
+
+IR 中重点找：
+
+```text
+recursive call
+% constant
+/ constant
+if (exp & 1)
+```
+
+ASM 中重点找：
+
+```text
+bl modmul
+bl modpow
+sdiv
+msub
+tbz/tbnz 是否出现
+```
+
+对应优化：
+
+| 问题 | 优化位置 |
+| --- | --- |
+| 递归模乘 | `src/mir/Lowering.cpp`、`src/mir/AsmPrinter.cpp` 的 `ModMul` |
+| 递归模幂 | `src/mir/Lowering.cpp`、`src/mir/AsmPrinter.cpp` 的 `ModPow` |
+| `% 2^k` | `src/ir/passes/ArithmeticSimplify.cpp`、`src/mir/AsmPrinter.cpp` |
+| bit test branch | `src/mir/AsmPrinter.cpp` 的 `tbz/tbnz` |
+
+后续仍值得做：
+
+```text
+Barrett reduction 或更完整 magic modulo，减少 ModMul 中的 sdiv。
+```
+
+## `matmul*` / `01_mm*` 类矩阵乘
+
+典型现象：
+
+```text
+三重循环，地址计算多，访存局部性强。
+如果没有地址递推和 promotion，会有大量 GEP/mul/load/store。
+```
+
+IR 中重点找：
+
+```text
+i * N + k
+k * N + j
+sum load/store
+```
+
+ASM 中重点找：
+
+```text
+循环内 mul 是否多
+ldr/str 是否多
+是否有 madd
+```
+
+对应优化：
+
+| 问题 | 优化位置 |
+| --- | --- |
+| 循环内地址乘法 | `src/ir/passes/LoopStrengthReduction.cpp` |
+| sum 反复 load/store | `src/ir/passes/LoopMemoryPromotion.cpp` |
+| 乘加 | `src/mir/passes/Peephole.cpp`、`src/mir/AsmPrinter.cpp` |
+| 冗余 load | `src/ir/passes/LoadStoreElim.cpp` |
+
+当前不要夸大：
+
+```text
+项目当前没有完整通用 loop blocking 或 NEON matmul kernel。
+```
+
+## `gameoflife-*` 类 stencil
+
+典型现象：
+
+```text
+二维邻域访问，每个 cell 读多个相邻位置。
+性能瓶颈通常是内存带宽、重复地址计算、边界分支。
+```
+
+IR/ASM 中重点找：
+
+```text
+重复读取相邻行
+重复计算 row * width
+大量边界 if
+```
+
+已有关联优化：
+
+| 问题 | 当前可命中的优化 |
+| --- | --- |
+| row base 重复计算 | LoopStrengthReduction、AddressHoisting |
+| load 冗余 | LoadStoreElim、LICM load |
+| 小分支 | IfConversion、cbz/cbnz |
+
+后续方向：
+
+```text
+行缓存、stencil 专项复用、保守 NEON 向量化。
+```
+
+## `29_long_line` / `sl*` 类编译时间样例
+
+典型现象：
+
+```text
+源码巨大或 CFG/表达式很大，运行时间不一定最差，但编译时间容易被多轮 pass 放大。
+```
+
+看点：
+
+```text
+build elapsed 是否高
+analyze_case 生成 IR 是否非常大
+PassManager 是否跑满 8 轮
+```
+
+对应优化：
+
+| 问题 | 优化位置 |
+| --- | --- |
+| pass 轮数过多 | `src/ir/passes/PassManager.cpp` size guard |
+| CFG 膨胀 | 关闭后几轮内联/loop transform |
+| 无收益重复跑 | 根据 changed 和大小阈值提前停止 |
+
+## `38_light2d` 类正确性敏感样例
+
+典型现象：
+
+```text
+输出是二维图像，某个内存优化错了会出现整片 0、整列 255 或边界错误。
+```
+
+优先怀疑：
+
+```text
+load/store forwarding
+memset/global init
+alias 判断
+loop transformation
+```
+
+建议用途：
+
+```text
+每次改内存优化、循环优化、AsmPrinter 访存合并后，都用它做 smoke test。
+```
+
+## `65_color` / `h-1-*` 类递归样例
+
+典型现象：
+
+```text
+递归调用深，函数调用成本高，甚至可能栈压力大。
+```
+
+对应优化：
+
+| 问题 | 优化位置 |
+| --- | --- |
+| 尾递归 | `src/ir/passes/TailRecursionElim.cpp` |
+| 小函数递归 helper | `src/ir/passes/Inline.cpp`，但要防止递归内联 |
+| call-clobber spill | `src/mir/RegAlloc.cpp`、`SpillReduction.cpp` |
+
+## 分析报告模板
+
+建议每个重点样例写一页：
+
+```text
+样例：
+当前 our time：
+GCC time：
+loss：
+IR 主要问题：
+ASM 主要问题：
+已经命中的优化：
+没有命中的优化：
+计划修改文件：
+安全条件：
+验证样例：
+结果：
+```
--- a/study_materials/15_glossary.md
+++ b/study_materials/15_glossary.md
@ -0,0 +1,103 @@
+# 15 编译器术语速查
+
+这份文件用于答辩前快速复习术语。它面向已经学过概念但不熟悉工程代码的读者，所以每个术语都尽量对应到项目位置或代码对象。
+
+使用方式：
+
+```text
+遇到一个代码文件看不懂时，先查它涉及的术语。
+遇到一个答辩问题答不准时，先查术语定义，再回到对应文件。
+```
+
+能把这些词讲清楚，基本就能解释大部分代码。
+
+## 前端和语义
+
+| 术语 | 含义 | 项目位置 |
+| --- | --- | --- |
+| Lexer | 把源码拆成 token。 | ANTLR 生成代码、`SysY.g4` |
+| Parser | 把 token 组织成语法树。 | `src/frontend/AntlrDriver.cpp` |
+| AST/Parse Tree | 源码结构树。 | ANTLR parse tree |
+| Sema | 语义分析，检查类型、作用域、常量等。 | `src/sem/Sema.cpp` |
+| Symbol Table | 符号表，记录变量/函数/作用域信息。 | `include/sem/SymbolTable.h` |
+| ConstEval | 编译期常量表达式求值。 | `src/sem/ConstEval.cpp` |
+
+## IR 基础
+
+| 术语 | 含义 | 项目位置 |
+| --- | --- | --- |
+| IR | 中间表示，连接前端和后端。 | `include/ir/IR.h` |
+| SSA | 静态单赋值：每个变量只被定义一次，控制流合流处用 phi 选择来源值。 | Mem2Reg 后的 IR |
+| Value | 可被使用的值。 | `src/ir/Value.cpp` |
+| Use-Def | 值和使用者之间的关系。 | `src/ir/Value.cpp` |
+| BasicBlock | 单入口、单出口的一段顺序指令，只能从第一条进入、从最后一条终结指令离开。 | `src/ir/BasicBlock.cpp` |
+| CFG | 控制流图，节点是基本块，边是跳转。 | `Function` blocks |
+| Phi | SSA 合流指令，根据前驱选择值。 | `PhiInst` |
+| GEP | 计算数组/指针元素地址。 | `GetElementPtrInst` |
+
+## 分析
+
+| 术语 | 含义 | 为什么重要 |
+| --- | --- | --- |
+| Dominator | A 支配 B 表示从函数入口到 B 的每条路径都必须经过 A。 | GVN、LICM、Mem2Reg 都依赖。 |
+| Dominator Tree | 支配关系形成的树。 | 沿树传播 available expression。 |
+| Dominance Frontier | 支配边界。 | Mem2Reg 插 phi。 |
+| Loop Header | 循环入口块。 | loop pass 的核心。 |
+| Latch | 回跳到 header 的块。 | 识别归纳变量和循环次数。 |
+| Preheader | 循环外唯一跳入 header 的前驱块。 | LICM 外提位置。 |
+| Exit Block | 循环退出到达的块。 | promotion 写回、unroll tail。 |
+| Alias Analysis | 判断两个地址是否可能同一内存。 | load/store 优化安全性核心。 |
+| Mod-Ref | 判断 call 是否修改或读取某内存。 | LICM/load forwarding 不能跨未知 call。 |
+| Escape Analysis | 判断局部地址是否逃逸。 | 未逃逸 alloca 可更激进优化。 |
+
+## 中端优化
+
+| 术语 | 含义 |
+| --- | --- |
+| Mem2Reg | 把局部标量 alloca 的 load/store 提升成 SSA。 |
+| ConstProp | 常量传播。 |
+| ConstFold | 常量折叠。 |
+| CSE | 公共子表达式消除，通常偏局部。 |
+| GVN | 全局值编号，跨块消除等价表达式。 |
+| DCE | 死代码删除。 |
+| CFGSimplify | 控制流简化。 |
+| LICM | 循环不变代码外提。 |
+| LoopMemoryPromotion | 循环内存访问提升成标量。 |
+| Strength Reduction | 把昂贵运算变成便宜递推，例如乘法地址变加法。 |
+| Loop Unroll | 循环展开，减少分支和暴露 ILP。 |
+| Loop Fission | 循环拆分，把一个循环体拆成多个循环，改善局部性或把相互依赖的语句隔离开。 |
+| Loop Unswitch | 把循环内不变条件提到循环外。 |
+| If Conversion | 把小分支变成条件选择或 mask 运算。 |
+
+## 后端
+
+| 术语 | 含义 | 项目位置 |
+| --- | --- | --- |
+| MIR | 机器相关中间表示。 | `include/mir/MIR.h` |
+| Lowering | IR 降到 MIR。 | `src/mir/Lowering.cpp` |
+| Virtual Register | 寄存器分配前使用的临时寄存器，数量不受限制。 | MIR vreg |
+| Physical Register | 真实 AArch64 寄存器。 | `src/mir/Register.cpp` |
+| Register Allocation | 把 vreg 映射到 preg。 | `src/mir/RegAlloc.cpp` |
+| Spill | 寄存器不够时把值放到栈上。 | `RegAlloc.cpp` |
+| Reload | 从 spill slot 重新加载值。 | `RegAlloc.cpp`、`AsmPrinter.cpp` |
+| Copy Coalescing | 合并 copy 两端寄存器，减少 mov。 | `RegAlloc.cpp`、`Peephole.cpp` |
+| Rematerialization | 不 reload，直接重新计算廉价值。 | `SpillReduction.cpp` |
+| Frame Lowering | 生成栈帧、保存恢复寄存器。 | `src/mir/FrameLowering.cpp` |
+| Peephole | 局部小窗口机器级优化。 | `src/mir/passes/Peephole.cpp` |
+
+## AArch64 指令
+
+| 指令 | 含义 |
+| --- | --- |
+| `add/sub` | 加减。 |
+| `mul` | 乘法。 |
+| `madd/msub` | 融合乘加/乘减。 |
+| `sdiv` | 有符号除法。 |
+| `ldr/str` | 加载/存储。 |
+| `ldp/stp` | 成对加载/存储。 |
+| `cmp` | 比较并设置条件码。 |
+| `b.xx` | 条件跳转。 |
+| `cbz/cbnz` | 零测试跳转。 |
+| `tbz/tbnz` | bit 测试跳转。 |
+| `csel` | 条件选择。 |
+| `cset/csetm` | 条件生成 0/1 或 0/-1。 |
--- a/study_materials/16_code_reading_for_non_coders.md
+++ b/study_materials/16_code_reading_for_non_coders.md
@ -0,0 +1,267 @@
+# 16 给缺少编码经验读者的代码阅读指南
+
+这份资料专门面向下面这种情况：
+
+```text
+你学过编译原理，知道理论概念；
+但你没有实际参与过大型 C++ 编译器项目；
+看到大量 .h/.cpp、类、指针、迭代器、智能指针时容易失去主线。
+```
+
+目标不是让你马上成为 C++ 工程师，而是让你能有效读懂这个编译器，并在答辩中准确解释代码。
+
+## 先建立正确心态
+
+不要这样读：
+
+```text
+从第一个文件第一行开始逐行读。
+遇到一个 C++ 语法点就停下来查很久。
+试图一次记住所有类和函数。
+```
+
+应该这样读：
+
+```text
+先找入口。
+再看数据从哪里来到哪里去。
+再找核心数据结构。
+最后只读和当前问题相关的函数。
+```
+
+一个编译器项目不是小说，不需要顺序阅读。它更像地图，应该按问题检索。
+
+## 你需要的最低 C++ 知识
+
+不需要先系统学完整 C++，但下面这些必须能看懂。
+
+| C++ 写法 | 你只需要理解 |
+| --- | --- |
+| `class` / `struct` | 一组数据和函数，表示一个编译器对象。 |
+| 继承 | `Instruction` 下面有 `LoadInst`、`StoreInst` 等具体指令。 |
+| 指针 `T*` | 指向某个对象，不拥有它。 |
+| `std::unique_ptr<T>` | 独占拥有对象，常用于基本块、指令的生命周期管理。 |
+| `std::shared_ptr<T>` | 多处共享对象，项目中常用于类型。 |
+| `std::vector<T>` | 动态数组，常用于 basic blocks、operands。 |
+| `std::unordered_map<K,V>` | 哈希表，常用于 value numbering、memory state。 |
+| `auto` | 让编译器推导类型，读代码时要看右边表达式。 |
+| `dynamic_cast` 或项目里的 `dyncast` | 判断某个基类对象实际是不是某个子类。 |
+| `enum class` | 一组枚举值，例如 opcode、寄存器类别。 |
+
+遇到复杂模板时不要慌，先看变量名和用途。
+
+例如：
+
+```cpp
+std::unordered_map<memutils::AddressKey, AvailableValue,
+                   memutils::AddressKeyHash> available_values;
+```
+
+你只需要先读成：
+
+```text
+这是一个 map：地址 key -> 当前可用的内存值。
+```
+
+## 读 `.h` 和 `.cpp` 的方法
+
+头文件 `.h` 通常回答：
+
+```text
+有哪些类？
+每个类有什么字段？
+对外提供哪些函数？
+```
+
+实现文件 `.cpp` 通常回答：
+
+```text
+这些函数具体怎么做？
+遍历什么数据？
+修改什么对象？
+返回 changed 还是 void？
+```
+
+建议顺序：
+
+```text
+先读 include/ir/IR.h 的类名和关系。
+再读 src/ir/IRPrinter.cpp 看它们怎么输出。
+最后读某个 pass 的 .cpp 看它怎么修改 IR。
+```
+
+## 如何顺着调用链读代码
+
+以 IR 优化为例：
+
+```text
+src/main.cpp
+  -> ir::RunIRPassPipeline
+  -> src/ir/passes/PassManager.cpp
+  -> RunGVN / RunLoadStoreElim / RunLICM ...
+  -> 某个具体 pass 文件
+```
+
+以后端为例：
+
+```text
+src/main.cpp
+  -> mir::LowerToMIR
+  -> RunMIRPreRegAllocPassPipeline
+  -> RunRegAlloc
+  -> RunMIRPostRegAllocPassPipeline
+  -> RunFrameLowering
+  -> PrintAsm
+```
+
+读代码时永远先问：
+
+```text
+我现在在哪个阶段？
+这个函数是被谁调用的？
+它调用了谁？
+它修改的是 IR、MIR 还是纯文本汇编？
+```
+
+## 如何读一个 pass
+
+一个优化 pass 通常由四部分组成。
+
+| 部分 | 你要找什么 |
+| --- | --- |
+| 匹配函数 | 什么模式可以优化。 |
+| 安全检查 | 什么情况下必须跳过。 |
+| 改写逻辑 | 如何替换 value、删除指令或改 CFG。 |
+| 入口函数 | `RunXxx(Module&)` 如何遍历函数和基本块。 |
+
+以 `LoadStoreElim.cpp` 为例，阅读顺序应该是：
+
+```text
+先看 RunLoadStoreElim。
+再看它对每个 Function 做什么。
+再看 MemoryState 表示什么。
+再看 load/store 分别如何处理。
+最后看 alias/call/memset 如何让状态失效。
+```
+
+不要一开始就钻进最深的 helper。
+
+## 如何把理论概念对应到代码
+
+| 理论概念 | 代码里通常是什么 |
+| --- | --- |
+| 表达式 | `Instruction*` 或 `Value*`。 |
+| 变量定义 | `Instruction`、`Argument`、`Constant`。 |
+| 基本块 | `BasicBlock*`。 |
+| CFG 边 | terminator 的 successors、block 的 predecessors。 |
+| 数据流状态 | map、set，通常叫 `state`、`available`、`live`。 |
+| transfer function | 遍历指令时更新 state 的代码。 |
+| meet | 多个 predecessor 的状态合并。 |
+| alias 分析 | `AddressKey`、`MayAlias`、`EscapeSummary`。 |
+| 循环不变量 | 定义在 loop 外，或操作数都 invariant。 |
+| 活跃变量 | use/def 反向传播出的 live set。 |
+| spill | vreg 被分配到 stack slot。 |
+
+## 如何看 IR 输出
+
+先用最小样例：
+
+```bash
+./build/bin/compiler --emit-ir test/test_case/functional/simple_add.sy
+```
+
+读 IR 时按顺序看：
+
+```text
+函数名和参数。
+基本块标签。
+phi。
+load/store/gep。
+call。
+terminator：br/condbr/ret。
+```
+
+看到不懂的指令，回到：
+
+```text
+include/ir/IR.h
+src/ir/Instruction.cpp
+src/ir/IRPrinter.cpp
+```
+
+## 如何看 MIR 和汇编
+
+先生成汇编：
+
+```bash
+./build/bin/compiler --emit-asm test/test_case/functional/simple_add.sy
+```
+
+读汇编时不要逐条翻译。先看：
+
+```text
+有没有函数 label。
+有没有 prologue/epilogue。
+参数从哪个寄存器来。
+返回值放到哪个寄存器。
+循环里最多的指令是什么。
+有没有 sdiv、ldr/str、bl、madd、csel、ldp/stp。
+```
+
+如果你看不懂 ARM 指令，先查 `13_aarch64_backend_cheatsheet.md`。
+
+## 如何判断自己是否真的懂了一个文件
+
+不要用“我看完了”作为标准。用下面问题检查：
+
+```text
+这个文件属于哪个编译阶段？
+它主要处理什么对象？
+它的入口函数是什么？
+它最重要的数据结构是什么？
+它什么时候会拒绝优化？
+它可能造成什么 bug？
+```
+
+例如你读完 `LoopUnroll.cpp`，应该能回答：
+
+```text
+它处理 IR loop。
+入口是 RunLoopUnroll。
+它只处理 counted canonical loop。
+它需要 factor 和 tail loop。
+它遇到 call/memset/危险依赖会跳过。
+它最容易错在 exit phi、循环次数、tail 处理。
+```
+
+## 代码新手最容易犯的阅读错误
+
+| 错误 | 结果 | 更好的做法 |
+| --- | --- | --- |
+| 只看语法，不看数据流 | 看了很多行但不知道目的。 | 画出输入、输出、修改对象。 |
+| 只看一个函数，不看调用者 | 不知道为什么这样写。 | 先从入口调用链读下来。 |
+| 看到 helper 就跳进去 | 迷失在细节里。 | 先读主函数，再回头读 helper。 |
+| 以为注释就是全部 | 容易误解实际条件。 | 结合代码里的 guard 和返回值。 |
+| 看到优化名就套教材 | 忽略工程保守性。 | 看它实际支持哪些模式。 |
+| 只读 C++，不看 IR/ASM 输出 | 不知道优化效果。 | 每个优化都配一个样例输出。 |
+
+## 建议的每日学习节奏
+
+如果你有一周准备答辩，可以这样安排：
+
+| 天数 | 任务 |
+| --- | --- |
+| 第 1 天 | 跑通构建和 simple_add，读 README、16、01。 |
+| 第 2 天 | 读 IR 核心，输出 3 个小样例 IR。 |
+| 第 3 天 | 读 PassManager、DCE、CFGSimplify、ConstFold。 |
+| 第 4 天 | 读 Mem2Reg、GVN、LoadStoreElim。 |
+| 第 5 天 | 读 LICM、LoopMemoryPromotion、LoopStrengthReduction。 |
+| 第 6 天 | 读 Lowering、RegAlloc、AsmPrinter、AArch64 速查。 |
+| 第 7 天 | 按 07 和 09 准备答辩问答，挑 2 个热点样例做完整分析。 |
+
+如果时间更短，至少完成：
+
+```text
+README -> 16 -> 01 -> 04 的 pipeline -> 09 的优化总清单 -> 07 的问答。
+```
+
--- a/study_materials/17_code_companion.md
+++ b/study_materials/17_code_companion.md
@ -0,0 +1,934 @@
+# 17 源码伴读：从主流程到优化实现
+
+本文是代码版学习资料。写法固定为：
+
+```text
+先说明这段代码解决什么问题。
+再贴出项目里的真实代码片段。
+最后解释读代码时要看什么。
+```
+
+如果你只学过编译原理、没有实际写过编译器，建议不要从类定义逐行背诵。更有效的方法是按“编译阶段”读：
+
+```text
+main.cpp
+  -> IR pass pipeline
+  -> IR 数据结构和分析
+  -> 重点优化 pass
+  -> MIR lowering
+  -> MIR pass pipeline
+  -> RegAlloc / FrameLowering / AsmPrinter
+  -> 测试脚本
+```
+
+## 1. 编译器主流程
+
+说明：`src/main.cpp` 是整个编译器的总控。它先解析 SysY 源码，再做语义分析，然后根据命令行选项输出 IR 或 AArch64 汇编。
+
+关键代码：
+
+```cpp
+auto sema = RunSema(*comp_unit);
+
+std::unique_ptr<ir::Module> asm_module;
+if (opts.emit_asm) {
+  asm_module = GenerateIR(*comp_unit, sema);
+  ir::RunIRPassPipeline(*asm_module);
+}
+
+if (opts.emit_asm) {
+  auto machine_module = mir::LowerToMIR(*asm_module);
+  mir::RunMIRPreRegAllocPassPipeline(*machine_module);
+  mir::RunRegAlloc(*machine_module);
+  mir::RunMIRPostRegAllocPassPipeline(*machine_module);
+  mir::RunFrameLowering(*machine_module);
+  mir::PrintAsm(*machine_module, std::cout);
+}
+```
+
+读法：
+
+```text
+GenerateIR 之前属于前端和语义分析。
+RunIRPassPipeline 属于中端优化。
+LowerToMIR 之后进入后端。
+RunRegAlloc 是寄存器分配。
+RunFrameLowering 负责栈帧和被调用者保存寄存器。
+PrintAsm 负责输出 AArch64 汇编。
+```
+
+答辩时可以说：
+
+```text
+我们的编译器不是直接从 AST 打印汇编，而是经过自研 IR、中端优化、MIR lowering、MIR 优化、寄存器分配、栈帧生成和汇编输出。
+```
+
+## 2. IR pass 总 pipeline
+
+说明：`src/ir/passes/PassManager.cpp` 决定中端优化顺序。优化不是只跑一次，而是迭代运行，直到没有变化或达到轮数限制。
+
+关键代码：
+
+```cpp
+RunMem2Reg(module);
+if (run_tail_recursion) {
+  RunTailRecursionElim(module);
+}
+
+const auto initial_shape = AnalyzePipelineShape(module);
+const bool huge_cfg =
+    !disable_size_guard &&
+    (initial_shape.blocks > 1000 || initial_shape.instructions > 7000);
+const bool large_cfg =
+    !disable_size_guard &&
+    (huge_cfg || initial_shape.blocks > 300 || initial_shape.instructions > 2500);
+const int max_iterations = huge_cfg ? 3 : (large_cfg ? 5 : 8);
+
+for (int iteration = 0; iteration < max_iterations; ++iteration) {
+  bool changed = false;
+  const bool run_growth_passes = !large_cfg || iteration < 2;
+  const bool run_loop_passes =
+      initial_shape.may_have_loop && (!large_cfg || iteration < 2);
+
+  if (run_cfg_inline && run_growth_passes) {
+    changed |= RunFunctionInlining(module);
+  }
+  changed |= RunInterproceduralConstProp(module);
+  changed |= RunArithmeticSimplify(module);
+  changed |= RunConstProp(module);
+  changed |= RunConstFold(module);
+  changed |= RunGVN(module);
+  changed |= RunLoadStoreElim(module);
+  changed |= RunCSE(module);
+  changed |= RunIfConversion(module);
+  changed |= RunDCE(module);
+  changed |= RunCFGSimplify(module);
+  if (run_loop_passes) {
+    changed |= RunLICM(module);
+  }
+  if (run_loop_passes && run_loop_mem_promotion) {
+    changed |= RunLoopMemoryPromotion(module);
+  }
+  if (!changed) {
+    break;
+  }
+}
+```
+
+读法：
+
+```text
+Mem2Reg 要先跑，因为很多优化都依赖 SSA-like 值。
+ConstProp/ConstFold/ArithmeticSimplify 会制造更多可删除代码。
+GVN/CSE 消除重复计算。
+LoadStoreElim 处理内存冗余。
+DCE/CFGSimplify 清理优化后留下的死代码和空 CFG。
+循环优化只在模块里可能有 loop 时跑；large_cfg 时只在前两轮迭代跑。
+large_cfg/huge_cfg 是编译时间保护，避免大函数反复膨胀。
+```
+
+## 3. IR pass 声明表
+
+说明：`include/ir/PassManager.h` 是 IR pass 对外入口。你想新增 pass，通常先在这里声明，再在 `PassManager.cpp` 接入。
+
+关键代码：
+
+```cpp
+void RunMem2Reg(Module& module);
+bool RunConstFold(Module& module);
+bool RunConstProp(Module& module);
+bool RunFunctionInlining(Module& module);
+bool RunTailRecursionElim(Module& module);
+bool RunInterproceduralConstProp(Module& module);
+bool RunArithmeticSimplify(Module& module);
+bool RunCSE(Module& module);
+bool RunGVN(Module& module);
+bool RunLoadStoreElim(Module& module);
+bool RunDCE(Module& module);
+bool RunCFGSimplify(Module& module);
+bool RunLICM(Module& module);
+bool RunLoopMemoryPromotion(Module& module);
+bool RunLoopUnswitch(Module& module);
+bool RunLoopStrengthReduction(Module& module);
+bool RunLoopUnroll(Module& module);
+bool RunLoopFission(Module& module);
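+bool RunLoopRepeatReduction(Module& module);
+bool RunLoopParallelize(Module& module);
+bool RunLoopVectorize(Module& module);
+bool RunIfConversion(Module& module);
+void VerifyIR(const Module& module);
+void RunIRPassPipeline(Module& module);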
+```
+
+读法：
+
+```text
+返回 bool 的 pass 表示“是否改变 IR”。
+返回 void 的 pass 通常是必须执行或没有变化反馈。
+PassManager 用 changed 控制是否继续迭代。
+```
+
+## 4. 跨块 Load/Store Elimination
+
+说明：`src/ir/passes/LoadStoreElim.cpp` 做 IR 级内存优化。它不是只看一个基本块，而是先做数据流分析，计算每个块入口和出口可用的内存状态。
+
+关键代码：
+
+```cpp
+std::unordered_map<BasicBlock*, MemoryState> in_states;
+std::unordered_map<BasicBlock*, MemoryState> out_states;
+
+bool dataflow_changed = true;
+while (dataflow_changed) {
+  dataflow_changed = false;
+  for (auto* block : reachable_blocks) {
+    MemoryState in_state;
+    if (block != function.GetEntryBlock()) {
+      std::vector<MemoryState*> predecessors;
+      for (auto* pred : block->GetPredecessors()) {
+        auto it = out_states.find(pred);
+        if (it != out_states.end()) {
+          predecessors.push_back(&it->second);
+        }
+      }
+      in_state = MeetMemoryStates(predecessors);
+    }
+
+    auto out_state = SimulateBlock(escapes, block, in_state);
+    if (in_it == in_states.end() || !SameMemoryState(in_it->second, in_state)) {
+      in_states[block] = in_state;
+      dataflow_changed = true;
+    }
+    if (out_it == out_states.end() || !SameMemoryState(out_it->second, out_state)) {
+      out_states[block] = std::move(out_state);
+      dataflow_changed = true;
+    }
+  }
+}
+
+changed |= OptimizeLoopPreheaderStoreForwarding(function, escapes);
+for (auto* block : reachable_blocks) {
+  changed |= OptimizeBlock(escapes, block, in_states[block]);
+}
+```
+
+读法：
+
+```text
+MemoryState 保存“某个地址当前已知是什么值”以及“是否存在尚未被观察的 pending store”。
+MeetMemoryStates 是控制流合流点的交汇操作，只有所有前驱都一致的信息才能保留。
+OptimizeBlock 用入口状态做 store-to-load forwarding、重复 store 删除和死 store 删除。
+片段省略了 in_it / out_it 的定义；从用法看，它们应是 in_states / out_states 中查找当前 block 的迭代器。
+```
+
+为什么安全：
+
+```text
+它依赖 MemoryUtils 的 alias/mod-ref 判断。
+只在确认两个地址相同或不冲突时转发。
+遇到可能读写内存的 call/memset 会失效相关状态。
+```
+
+## 5. LICM load 外提的安全判断
+
+说明：`LICM` 不只外提普通循环不变算术，也会尝试外提循环不变 load。难点是证明循环内没有 store/call 会修改这个地址。
+
+关键代码：
+
+```cpp
+inline bool IsSafeInvariantLoadToHoist(const Loop& loop, LoadInst* load,
+                                       PhiInst* iv, int iv_stride,
+                                       const std::vector<MemoryAccessInfo>& accesses,
+                                       const memutils::EscapeSummary* escapes = nullptr) {
+  auto ptr = AnalyzePointer(load->GetPtr(), iv, loop, load->GetType()->GetSize(), escapes);
+  if (!ptr.invariant_address) {
+    return false;
+  }
+  if (ptr.readonly_root) {
+    return true;
+  }
+
+  for (auto* block : loop.block_list) {
+    for (const auto& inst_ptr : block->GetInstructions()) {
+      auto* inst = inst_ptr.get();
+      if (inst == load) {
+        continue;
+      }
+      if (auto* call = dyncast<CallInst>(inst)) {
+        if (CallMayWritePointer(call->GetCallee(), ptr)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  for (const auto& access : accesses) {
+    if (access.inst == load || !access.is_write) {
+      continue;
+    }
+    if (MayAliasSameIteration(ptr, access.ptr)) {
+      return false;
+    }
+    if (HasCrossIterationDependence(ptr, access.ptr, iv_stride)) {
+      return false;
+    }
+  }
+  return true;
+}
+```
+
+读法：
+
+```text
+invariant_address 表示地址本身不随迭代变化。
+readonly_root 表示根对象只读，可以直接外提。
+CallMayWritePointer 防止未知调用修改该地址。
+MayAliasSameIteration 防止同一次迭代内 store 覆盖 load。
+HasCrossIterationDependence 防止跨迭代读写依赖。
+```
+
+## 6. LoopMemoryPromotion
+
+说明：`LoopMemoryPromotion` 把循环里反复 load/store 的标量内存提升成 SSA 标量。它类似 Mem2Reg，但范围是循环内部。
+
+关键代码：
+
+```cpp
+bool RunLoopMemoryPromotionOnFunction(Function& function) {
+  if (function.IsExternal() || function.GetEntryBlock() == nullptr) {
+    return false;
+  }
+  if (!ShouldAnalyzeFunction(function)) {
+    return false;
+  }
+
+  bool changed = false;
+  while (true) {
+    DominatorTree dom_tree(function);
+    LoopInfo loop_info(function, dom_tree);
+
+    bool cfg_changed = false;
+    for (auto* loop : loop_info.GetLoopsInPostOrder()) {
+      auto* old_preheader = loop->preheader;
+      auto* preheader = looputils::EnsurePreheader(function, *loop);
+      if (preheader != old_preheader) {
+        changed = true;
+        cfg_changed = true;
+        break;
+      }
+    }
+    if (cfg_changed) {
+      continue;
+    }
+
+    auto dom_info = BuildDominatorInfo(function);
+    bool local_changed = false;
+    for (auto* loop : loop_info.GetLoopsInPostOrder()) {
+      local_changed |= PromoteLoopMemory(function, *loop, dom_info);
+    }
+    changed |= local_changed;
+    if (!local_changed) {
+      break;
+    }
+  }
+
+  return changed;
+}
+```
+
+读法：
+
+```text
+先确保循环有 preheader，因为 promotion 需要在循环入口准备初始值。
+每次 CFG 改变后重新构建 DominatorTree 和 LoopInfo。
+按 post-order 处理内层循环，避免外层先改导致内层结构失效。
+PromoteLoopMemory 负责候选收集、phi 插入、load/store 重写和 exit store。
+```
+
+## 7. 函数内联
+
+说明：`src/ir/passes/Inline.cpp` 支持单基本块和小型多基本块函数内联。内联可以减少 call 开销，也能让常量传播、GVN、LoadStoreElim 看见更多上下文。
+
+关键代码：
+
+```cpp
+bool RunFunctionInlining(Module& module) {
+  std::unordered_map<Function*, InlineCandidateInfo> callee_info;
+  for (const auto& function_ptr : module.GetFunctions()) {
+    if (function_ptr) {
+      callee_info.emplace(function_ptr.get(), AnalyzeInlineCandidate(*function_ptr));
+    }
+  }
+
+  const auto call_counts = CountDirectCalls(module);
+  bool changed = false;
+  for (const auto& function_ptr : module.GetFunctions()) {
+    if (function_ptr) {
+      changed |= RunFunctionInliningOnFunction(*function_ptr, callee_info, call_counts);
+    }
+  }
+  return changed;
+}
+```
+
+读法：
+
+```text
+AnalyzeInlineCandidate 先给每个函数建成本模型。
+CountDirectCalls 统计调用次数，避免把热点大函数盲目膨胀。
+RunFunctionInliningOnFunction 遍历 call site，根据成本模型决定是否内联。
+```
+
+## 8. 尾递归消除
+
+说明：尾递归消除把“函数最后一步调用自己”改成循环，减少递归 call 开销和栈压力。
+
+关键代码：
+
+```cpp
+bool RunTailRecursionElim(Module& module) {
+  bool changed = false;
+  for (const auto& function_ptr : module.GetFunctions()) {
+    if (function_ptr) {
+      changed |= RunOnFunction(*function_ptr);
+    }
+  }
+  if (changed) {
+    RecomputeRecursiveFlags(module);
+  }
+  return changed;
+}
+```
+
+读法：
+
+```text
+RunOnFunction 负责匹配 tail-recursive call。
+改写后参数会变成入口 phi 的 incoming。
+尾调用被替换成跳回入口块。
+RecomputeRecursiveFlags 很重要，否则后续 pass 还会误以为函数递归。
+```
+
+## 9. LoopStrengthReduction
+
+说明：循环强度削弱把循环内重复乘法替换成递推加法。典型例子是 `i * stride`：当 `i` 每轮加 1 时，每轮只需要在上轮结果基础上加 `stride`。
+
+关键代码：
+
+```cpp
+bool ReduceLoopMultiplications(Function& function, const Loop& loop,
+                               BasicBlock* preheader) {
+  if (!preheader || loop.latches.size() != 1) {
+    return false;
+  }
+
+  std::vector<InductionVarInfo> induction_vars;
+  for (const auto& inst_ptr : loop.header->GetInstructions()) {
+    auto* phi = dyncast<PhiInst>(inst_ptr.get());
+    if (!phi) {
+      break;
+    }
+    InductionVarInfo info;
+    if (MatchSimpleInductionVariable(loop, preheader, phi, info)) {
+      induction_vars.push_back(info);
+    }
+  }
+
+  for (const auto& iv : induction_vars) {
+    std::vector<std::pair<Instruction*, Value*>> candidates;
+    for (auto* block : loop.block_list) {
+      for (const auto& inst_ptr : block->GetInstructions()) {
+        auto* inst = inst_ptr.get();
+        Value* factor = nullptr;
+        if (IsMulCandidate(loop, inst, iv.phi, factor)) {
+          candidates.push_back({inst, factor});
+        }
+      }
+    }
+    // 后续为每个 factor 建递推 phi，并替换原乘法。
+  }
+}
+```
+
+读法：
+
+```text
+MatchSimpleInductionVariable 找标准 induction variable。
+IsMulCandidate 找循环里的 iv * factor。
+CreateReducedPhi 生成递推变量（片段中未展示）。
+inst->ReplaceAllUsesWith(replacement) 用递推值替换原乘法（片段中未展示）。
+```
+
+## 10. MIR 数据结构
+
+说明：MIR 是后端中间表示。它比 IR 更接近机器指令，但还保留虚拟寄存器、抽象地址和高级机器 idiom。
+
+关键代码：
+
+```cpp
+enum class ValueType { Void, I1, I32, F32, Ptr, I32x4, F32x4 };
+enum class RegClass { GPR, FPR };
+enum class AddrBaseKind { None, FrameObject, Global, VReg };
+enum class OperandKind { Invalid, VReg, Imm, Block, Symbol };
+
+class MachineInstr {
+ public:
+  enum class Opcode {
+    Arg,
+    Copy,
+    Load,
+    Store,
+    Lea,
+    Add,
+    Sub,
+    Mul,
+    MAdd,
+    MSub,
+    Div,
+    Rem,
+    ModMul,
+    ModPow,
+    DigitExtractPow2,
+    BitTestMask,
+    And,
+    Or,
+    Xor,
+    Shl,
+    AShr,
+    LShr,
+    FAdd,
+    FSub,
+    FMul,
+    FDiv,
+    FSqrt,
+    FNeg,
+    ICmp,
+    FCmp,
+    CSelect,
+    ZExt,
+    ItoF,
+    FtoI,
+    Br,
+    CondBr,
+    Call,
+    Ret,
+    Memset,
+    Unreachable,
+  };
+};
+```
+
+读法：
+
+```text
+MAdd/MSub 保留乘加融合机会。
+ModMul/ModPow/DigitExtractPow2 是数学 idiom lowering 后的专门机器操作。
+CSelect 表示条件选择，可输出整数 csel 或浮点 fcsel。
+AddressExpr 通过 AddrBaseKind 把 FrameObject、Global、VReg 三种 base 统一描述，方便地址优化。
+```
+
+## 11. MIR pass pipeline
+
+说明：MIR pass 分 pre-RA 和 post-RA。pre-RA 面向虚拟寄存器，post-RA 面向物理寄存器和 spill slot。
+
+关键代码：
+
+```cpp
+void RunMIRPreRegAllocPassPipeline(MachineModule& module) {
+  const bool run_spill_reduction =
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_MIR_SPILL_REDUCTION");
+  const bool run_cfg_cleanup =
+      utils::IsEnabledUnlessEnvFlag("NUDTC_DISABLE_MIR_CFG_CLEANUP");
+
+  if (run_spill_reduction) {
+    RunSpillReduction(module);
+  }
+  RunAddressHoisting(module);
+
+  constexpr int kMaxIterations = 4;
+  for (int iteration = 0; iteration < kMaxIterations; ++iteration) {
+    bool changed = false;
+    changed |= RunPeephole(module);
+    if (run_cfg_cleanup) {
+      changed |= RunCFGCleanup(module);
+    }
+    if (!changed) {
+      break;
+    }
+  }
+}
+
+void RunMIRPostRegAllocPassPipeline(MachineModule& module) {
+  constexpr int kMaxIterations = 2;
+  for (int iteration = 0; iteration < kMaxIterations; ++iteration) {
+    bool changed = false;
+    changed |= RunPeephole(module);
+    changed |= RunCFGCleanup(module);
+    if (!changed) {
+      break;
+    }
+  }
+}
+```
+
+读法：
+
+```text
+SpillReduction 要在寄存器分配前降低活跃范围压力。
+AddressHoisting 提前物化高频地址。
+Peephole 既在 pre-RA 跑，也在 post-RA 跑。
+post-RA Peephole 能删除物理寄存器层面的自拷贝和冗余访存。
+```
+
+## 12. AddressHoisting 正确性 guard
+
+说明：地址提升可以减少重复地址计算，但必须保证被提升的 base 在所有使用点之前定义。最近修复过的问题就是“复用非入口块里的旧 `lea` 可能不支配所有使用点”。
+
+关键代码：
+
+```cpp
+std::unordered_map<std::string, int> global_base_vregs;
+for (const auto& [symbol, count] : global_use_counts) {
+  if (count >= 2) {
+    global_base_vregs.emplace(symbol, -1);
+  }
+}
+
+// Existing LEA instructions may be inside non-entry blocks and do not
+// necessarily dominate all rewritten users. Always create a fresh base in
+// the entry block for function-wide hoisting.
+
+auto& entry_block = *function->GetBlocks().front();
+auto& entry_insts = entry_block.GetInstructions();
+std::size_t insert_pos = FindEntryInsertPos(entry_block);
+
+for (auto& [symbol, base_vreg] : global_base_vregs) {
+  base_vreg = function->NewVReg(ValueType::Ptr);
+  MachineInstr lea(MachineInstr::Opcode::Lea, {MachineOperand::VReg(base_vreg)});
+  AddressExpr address;
+  address.base_kind = AddrBaseKind::Global;
+  address.symbol = symbol;
+  lea.SetAddress(std::move(address));
+  entry_insts.insert(entry_insts.begin() + static_cast<std::ptrdiff_t>(insert_pos),
+                     std::move(lea));
+  ++insert_pos;
+}
+```
+
+读法：
+
+```text
+function-wide hoisting 必须插入到 entry block。
+不能随便复用某个已有 Lea，因为它可能只在某条分支上执行。
+块内 hoisting 可以插在当前 block 开头，因为它只服务当前 block。
+```
+
+## 13. MAdd/MSub 融合和函数范围 use-count
+
+说明：`mul + add/sub` 可以融合成 AArch64 `madd/msub`。但如果 `mul` 结果还被其他块使用，删除 `mul` 会破坏程序。当前实现会同时检查块内 use-count 和函数范围 use-count。
+
+关键代码：
+
+```cpp
+std::unordered_map<int, int> CountFunctionUsesWithReplacement(
+    const MachineFunction& function, const MachineBasicBlock& replaced_block,
+    const std::vector<MachineInstr>& replacement) {
+  std::unordered_map<int, int> counts;
+  auto accumulate = [&](const std::vector<MachineInstr>& instructions) {
+    for (const auto& inst : instructions) {
+      for (int use : inst.GetUses()) {
+        ++counts[use];
+      }
+    }
+  };
+
+  for (const auto& block : function.GetBlocks()) {
+    if (block.get() == &replaced_block) {
+      accumulate(replacement);
+    } else if (block) {
+      accumulate(block->GetInstructions());
+    }
+  }
+  return counts;
+}
+
+bool MatchMulOperand(const std::unordered_map<int, std::size_t>& def_index,
+                     const std::unordered_map<int, int>& use_counts,
+                     const std::unordered_map<int, int>& function_use_counts,
+                     const std::vector<bool>& removed,
+                     const MachineOperand& operand,
+                     const std::vector<MachineInstr>& instructions,
+                     std::size_t user_index,
+                     std::size_t* mul_index) {
+  const int vreg = operand.GetVReg();
+  auto use_it = use_counts.find(vreg);
+  if (use_it == use_counts.end() || use_it->second != 1) {
+    return false;
+  }
+  auto function_use_it = function_use_counts.find(vreg);
+  if (function_use_it == function_use_counts.end() ||
+      function_use_it->second != 1) {
+    return false;
+  }
+  // 只有确认函数范围单 use，才允许删掉 mul。
+}
+```
+
+读法：
+
+```text
+块内单 use 不等于函数内单 use。
+跨块 use 是很多后端 peephole 的典型坑。
+这个 guard 保留了安全融合，同时避免误删 producer。
+```
+
+## 14. 整数和浮点 CSelect
+
+说明：`CSelect` 让小型 `min/max/clamp/select` 函数不再发生真实调用。整数输出 `cmp + csel`，浮点输出 `fcmp + fcsel`。
+
+新增的浮点匹配代码：
+
+```cpp
+bool MatchTwoWayFloatSelectFunction(const ir::Function& function,
+                                    FloatSelectCallShape* shape) {
+  if (shape == nullptr || function.IsExternal() || function.GetReturnType() == nullptr ||
+      !function.GetReturnType()->IsFloat() || function.GetBlocks().size() != 3 ||
+      function.IsRecursive()) {
+    return false;
+  }
+
+  auto* entry = function.GetEntryBlock();
+  auto* cmp = ir::dyncast<ir::BinaryInst>(entry->GetInstructions()[0].get());
+  auto* branch = ir::dyncast<ir::CondBrInst>(entry->GetInstructions()[1].get());
+  if (cmp == nullptr || branch == nullptr || branch->GetCondition() != cmp ||
+      !IsFloatCompareOpcode(cmp->GetOpcode()) ||
+      !IsMappedF32Value(function, cmp->GetLhs()) ||
+      !IsMappedF32Value(function, cmp->GetRhs())) {
+    return false;
+  }
+
+  ir::Value* true_value = nullptr;
+  ir::Value* false_value = nullptr;
+  if (!MatchSingleReturnValue(branch->GetThenBlock(), &true_value) ||
+      !MatchSingleReturnValue(branch->GetElseBlock(), &false_value) ||
+      !IsMappedF32Value(function, true_value) ||
+      !IsMappedF32Value(function, false_value)) {
+    return false;
+  }
+
+  shape->cond = LowerFloatCond(cmp->GetOpcode());
+  shape->cmp_lhs = cmp->GetLhs();
+  shape->cmp_rhs = cmp->GetRhs();
+  shape->true_value = true_value;
+  shape->false_value = false_value;
+  return true;
+}
+```
+
+对应的汇编输出：
+
+```cpp
+case MachineInstr::Opcode::CSelect: {
+  const int vreg = inst.GetOperands()[0].GetVReg();
+  const auto result_type = function.GetVRegInfo(vreg).type;
+  if (IsFPR(result_type)) {
+    const auto def = PrepareFprDef(function, vreg, 16);
+    const auto true_value =
+        MaterializeFprUse(function, inst.GetOperands()[1], 17, 9, os);
+    const auto false_value =
+        MaterializeFprUse(function, inst.GetOperands()[2], 18, 9, os);
+    const auto cmp_lhs =
+        MaterializeFprUse(function, inst.GetOperands()[3], 19, 10, os);
+    const auto cmp_rhs =
+        MaterializeFprUse(function, inst.GetOperands()[4], 20, 11, os);
+    os << "  fcmp " << cmp_lhs << ", " << cmp_rhs << "\n";
+    os << "  fcsel " << def.reg_name << ", " << true_value << ", "
+       << false_value << ", " << GetIntCondMnemonic(inst.GetCondCode()) << "\n";
+    FinalizeDef(function, vreg, def, os);
+    break;
+  }
+  // 整数路径输出 cmp + csel。
+}
+```
+
+读法：
+
+```text
+MatchTwoWayFloatSelectFunction 只接受三块结构：entry compare+branch、then return、else return。
+返回值和比较操作数必须是参数或常量，避免复杂副作用。
+AsmPrinter 根据 result_type 决定输出 csel 还是 fcsel。
+这个优化命中 h-13-01 里的 float max，也适用于其他 float min/max helper。
+```
+
+## 15. MIR peephole 主循环
+
+说明：`RunPeephole` 做局部机器优化，也带有简单跨块内存数据流。它会在每个 block 上运行 `RunPeepholeOnBlock`。
+
+关键代码：
+
+```cpp
+bool RunPeephole(MachineModule& module) {
+  bool changed = false;
+  for (auto& function : module.GetFunctions()) {
+    bool function_changed = false;
+    const auto cfg = BuildCFG(*function);
+    std::vector<MemoryMap> in_states(function->GetBlocks().size());
+    std::vector<MemoryMap> out_states(function->GetBlocks().size());
+
+    bool dataflow_changed = true;
+    while (dataflow_changed) {
+      dataflow_changed = false;
+      for (std::size_t i = 0; i < function->GetBlocks().size(); ++i) {
+        MemoryMap in_state;
+        if (i != 0) {
+          std::vector<const MemoryMap*> predecessors;
+          for (int pred : cfg.predecessors[i]) {
+            predecessors.push_back(&out_states[static_cast<std::size_t>(pred)]);
+          }
+          in_state = MeetMemoryStates(predecessors);
+        }
+
+        auto out_state =
+            SimulateBlockMemory(module, *function->GetBlocks()[i], in_state);
+        // 更新 in/out state，直到不再变化。
+      }
+    }
+
+    for (std::size_t i = 0; i < function->GetBlocks().size(); ++i) {
+      function_changed |=
+          RunPeepholeOnBlock(module, *function, *function->GetBlocks()[i], in_states[i]);
+    }
+    if (!HasAssignedAllocations(*function)) {
+      function_changed |= RunDeadInstrElimination(*function);
+    }
+    changed |= function_changed;
+  }
+  return changed;
+}
+```
+
+读法：
+
+```text
+pre-RA 阶段 HasAssignedAllocations 为 false，可以删虚拟寄存器死定义。
+post-RA 阶段已有物理寄存器和 spill slot，peephole 主要删除 copy、冗余 load/store 和跳转。
+内存状态必须经过 CFG meet，不能只做单块线性扫描。
+```
+
+## 16. AArch64 常数乘法 lowering
+
+说明：常数乘法不一定要输出 `mul`。对 0、1、-1、2 的幂、`2^k - 1` 等常数，可以用更便宜的 shift/add/sub。
+
+关键代码：
+
+```cpp
+bool TryEmitMulByConstant(const MachineFunction& function, const MachineOperand& value,
+                          std::int64_t multiplier, const char* dst,
+                          std::ostream& os) {
+  const auto src = MaterializeGprUse(function, value, ValueType::I32, 10, os);
+  if (multiplier == 0) {
+    EmitMoveImm(os, dst, 0);
+    return true;
+  }
+  if (multiplier == 1) {
+    EmitCopy(os, dst, src.c_str(), false);
+    return true;
+  }
+  if (multiplier == -1) {
+    os << "  neg " << dst << ", " << src << "\n";
+    return true;
+  }
+
+  const bool negative = multiplier < 0;
+  const std::uint64_t abs_multiplier =
+      negative ? static_cast<std::uint64_t>(-multiplier)
+               : static_cast<std::uint64_t>(multiplier);
+
+  if (IsPowerOfTwo(static_cast<std::int64_t>(abs_multiplier))) {
+    EmitShiftedCopy(os, dst, src.c_str(),
+                    Log2(static_cast<std::int64_t>(abs_multiplier)));
+    if (negative) {
+      os << "  neg " << dst << ", " << dst << "\n";
+    }
+    return true;
+  }
+}
+```
+
+读法：
+
+```text
+这个优化在汇编打印阶段做，因为此时能直接选择 AArch64 指令形式。
+若 dst 和 src 相同，需要先保存 stable_src，避免覆盖输入。
+常数乘法 lowering 对 crypto、hash、矩阵下标计算都有帮助。
+```
+
+## 17. 测试脚本如何验证正确性
+
+说明：`scripts/verify_asm.sh` 做单样例全链路验证：源码到汇编、汇编到 ELF、qemu 运行，最后与 `.out` 比较。
+
+关键代码：
+
+```bash
+compiler=""
+for candidate in "$REPO_ROOT/build_lab3/bin/compiler" \
+                 "$REPO_ROOT/build_lab2/bin/compiler" \
+                 "$REPO_ROOT/build/bin/compiler"; do
+  if [[ -x "$candidate" ]]; then
+    compiler="$candidate"
+    break
+  fi
+done
+
+_compile_start_ns=$(now_ns)
+"$compiler" --emit-asm "$input" > "$asm_file"
+aarch64-linux-gnu-gcc "$asm_file" "$REPO_ROOT/sylib/sylib.c" -O2 -o "$exe"
+_compile_ns=$(($(now_ns) - _compile_start_ns))
+
+if [[ -f "$stdin_file" ]]; then
+  timeout "$timeout_sec" qemu-aarch64 -L /usr/aarch64-linux-gnu "$exe" \
+    < "$stdin_file" > "$stdout_file"
+else
+  timeout "$timeout_sec" qemu-aarch64 -L /usr/aarch64-linux-gnu "$exe" \
+    > "$stdout_file"
+fi
+```
+
+读法：
+
+```text
+verify_asm.sh 会优先使用 build_lab3/bin/compiler。
+compile_ns 包含我方 compiler 生成汇编，以及 aarch64-linux-gnu-gcc 以 -O2 编译 sylib.c、汇编并链接的时间。
+run_ns 才是生成程序在 qemu 下的运行时间。
+性能分析不要把 compile_ns 误认为生成代码运行慢。
+```
+
+## 18. 修改 pass 时的最低安全流程
+
+说明：每次修改优化 pass 都应该按这个顺序验证，避免“单个热点样例变快但整体错”。
+
+推荐流程：
+
+```text
+1. 用 analyze_case.sh 保存 IR/ASM，确认优化是否真的命中。
+2. 单跑被修改 pass 影响最大的样例。
+3. 单跑历史敏感样例，例如 38_light2d、82_long_func、26_scope4、28_side_effect2。
+4. 如果涉及循环/内存优化，至少跑一个 h_performance 长样例。
+5. 最后再跑 lab3_build_test.sh 全量。
+```
+
+常用命令：
+
+```bash
+cmake --build build_lab3 -j "$(nproc)"
+
+./scripts/verify_asm.sh test/test_case/h_functional/38_light2d.sy /tmp/check --run
+./scripts/verify_asm.sh test/test_case/functional/82_long_func.sy /tmp/check --run
+./scripts/verify_asm.sh test/test_case/h_functional/26_scope4.sy /tmp/check --run
+./scripts/verify_asm.sh test/test_case/h_functional/28_side_effect2.sy /tmp/check --run
+
+./scripts/analyze_case.sh test/test_case/h_performance/h-13-01.sy
+```
+
+答辩时可以强调：
+
+```text
+我们不是靠样例特判，而是通过 alias 分析、支配关系、use-count、call-clobber 等 guard 保证优化安全。
+每个新优化都可以用环境变量或单样例脚本定位和回归。
+```
--- a/study_materials/18_current_optimization_targets.md
+++ b/study_materials/18_current_optimization_targets.md
@ -0,0 +1,415 @@
+# 18 当前优化空间和本轮修改记录
+
+本文回答两个问题：
+
+```text
+当前编译器还有哪些值得优化的地方？
+本轮实际做了什么优化，代码在哪里？
+```
+
+数据依据是最近一次完整测试：
+
+```text
+日志目录：output/logs/lab3/lab3_20260513_181606
+结果：214 PASS / 0 FAIL / total 214
+validation elapsed: 592.18423s
+timing.tsv 记录的程序运行总时间约 394.66s
+```
+
+注意：`validation elapsed` 包含每个样例的汇编、链接和 qemu 运行；`timing.tsv` 里的 `our_ns` 更接近“生成程序运行时间”。
+
+## 1. 当前最值得优化的热点
+
+说明：最新 timing 中，真正值得优先关注的是总耗时长且落后 GCC 的样例。极小样例即使 speedup 难看，总收益也很小。
+
+代表性结果：
+
+```text
+h-11-01       our 59.20s, gcc 52.58s, speedup 0.89x
+h-14-01       our 31.78s, gcc 26.20s, speedup 0.82x
+h-1-01        our 22.84s, gcc 20.48s, speedup 0.90x
+h-12-01       our 18.75s, gcc 15.69s, speedup 0.84x
+h-1-03        our 11.00s, gcc  9.67s, speedup 0.88x
+h-13-01       our  9.79s, gcc  7.77s, speedup 0.79x
+h-9-02        our  5.73s, gcc  3.43s, speedup 0.60x
+crypto-*      our 大约 0.62s~0.81s，仍慢于 GCC
+gameoflife-*  our 大约 7s~8s，略慢于 GCC
+```
+
+解读：
+
+```text
+h-9-* 的源码包含非尾递归 fib，通用优化空间主要是递归函数内联/特化/数学识别，但不能做样例名特判。
+crypto-* 主要吃常数除法、取模、位运算、函数内联和寄存器分配。
+gameoflife 是 stencil 模式，长期方向是行缓存、局部数组提升和保守 NEON。
+h-13-01 有 float max helper，之前仍保留 bl max，本轮已做通用浮点 select lowering。
+```
+
+## 2. 本轮已做优化：浮点两路返回函数选择化
+
+说明：项目之前已经能识别整数 `min/max/clamp` 这种两路返回小函数，并 lowering 成 `cmp + csel`。但浮点版本没有处理，导致 `float max(float,float)` 在内层循环里仍然是真实函数调用。
+
+源代码例子：
+
+```c
+float max(float a, float b) {
+    if (a > b) {
+        return a;
+    } else {
+        return b;
+    }
+}
+```
+
+优化前汇编中存在：
+
+```asm
+bl max
+```
+
+优化后汇编中变为：
+
+```asm
+fcmp s19, s20
+fcsel s19, s19, s20, gt
+```
+
+实现位置：
+
+```text
+src/mir/Lowering.cpp
+src/mir/AsmPrinter.cpp
+```
+
+## 3. 浮点选择化的匹配代码
+
+说明：只匹配非常保守的函数形状，避免错误优化。要求：
+
+```text
+函数返回 float。
+函数不是 external，也不是 recursive。
+函数刚好 3 个基本块。
+entry 块只有 fcmp 和 condbr。
+then/else 块各自只有一个 return。
+比较操作数和返回值只能是参数或浮点常量。
+```
+
+关键代码：
+
+```cpp
+bool MatchTwoWayFloatSelectFunction(const ir::Function& function,
+                                    FloatSelectCallShape* shape) {
+  if (shape == nullptr || function.IsExternal() || function.GetReturnType() == nullptr ||
+      !function.GetReturnType()->IsFloat() || function.GetBlocks().size() != 3 ||
+      function.IsRecursive()) {
+    return false;
+  }
+
+  auto* entry = function.GetEntryBlock();
+  if (entry == nullptr || entry->GetInstructions().size() != 2) {
+    return false;
+  }
+  auto* cmp = ir::dyncast<ir::BinaryInst>(entry->GetInstructions()[0].get());
+  auto* branch = ir::dyncast<ir::CondBrInst>(entry->GetInstructions()[1].get());
+  if (cmp == nullptr || branch == nullptr || branch->GetCondition() != cmp ||
+      !IsFloatCompareOpcode(cmp->GetOpcode()) ||
+      !IsMappedF32Value(function, cmp->GetLhs()) ||
+      !IsMappedF32Value(function, cmp->GetRhs())) {
+    return false;
+  }
+
+  ir::Value* true_value = nullptr;
+  ir::Value* false_value = nullptr;
+  if (!MatchSingleReturnValue(branch->GetThenBlock(), &true_value) ||
+      !MatchSingleReturnValue(branch->GetElseBlock(), &false_value) ||
+      !IsMappedF32Value(function, true_value) ||
+      !IsMappedF32Value(function, false_value)) {
+    return false;
+  }
+
+  shape->cond = LowerFloatCond(cmp->GetOpcode());
+  shape->cmp_lhs = cmp->GetLhs();
+  shape->cmp_rhs = cmp->GetRhs();
+  shape->true_value = true_value;
+  shape->false_value = false_value;
+  return true;
+}
+```
+
+为什么不是样例特判：
+
+```text
+代码没有判断函数名 max，也没有判断 h-13-01。
+它只看函数 CFG 和 IR 指令形状。
+任何 SysY 程序里符合该形状的 float min/max/select helper 都会命中。
+```
+
+## 4. 浮点选择化的 lowering 代码
+
+说明：在 lower call 时，如果 callee 符合 `MatchTwoWayFloatSelectFunction`，就不生成 `Call` MIR，而是生成 `CSelect` MIR。
+
+关键代码：
+
+```cpp
+FloatSelectCallShape float_select_shape;
+if (callee != nullptr && call->GetType() != nullptr && call->GetType()->IsFloat() &&
+    MatchTwoWayFloatSelectFunction(*callee, &float_select_shape)) {
+  auto lowered = NewVRegValue(ValueType::F32);
+  MachineInstr instr(
+      MachineInstr::Opcode::CSelect,
+      {MachineOperand::VReg(lowered.index),
+       ResolveMappedCallOperand(call, float_select_shape.true_value, inline_values),
+       ResolveMappedCallOperand(call, float_select_shape.false_value, inline_values),
+       ResolveMappedCallOperand(call, float_select_shape.cmp_lhs, inline_values),
+       ResolveMappedCallOperand(call, float_select_shape.cmp_rhs, inline_values)});
+  instr.SetCondCode(float_select_shape.cond);
+  current_block_->Append(std::move(instr));
+  values_[call] = lowered;
+  return true;
+}
+```
+
+读法：
+
+```text
+NewVRegValue(ValueType::F32) 创建浮点结果虚拟寄存器。
+ResolveMappedCallOperand 把 callee 的形参映射成 call site 的实参。
+CSelect 的五个操作数是：dst、true_value、false_value、cmp_lhs、cmp_rhs。
+CondCode 保存比较条件，例如 GT 对应 a > b。
+```
+
+## 5. 浮点选择化的 AArch64 输出
+
+说明：`CSelect` 现在根据结果类型分两条路径。整数结果输出 `cmp + csel`；浮点结果输出 `fcmp + fcsel`。
+
+关键代码：
+
+```cpp
+case MachineInstr::Opcode::CSelect: {
+  const int vreg = inst.GetOperands()[0].GetVReg();
+  const auto result_type = function.GetVRegInfo(vreg).type;
+  if (IsFPR(result_type)) {
+    const auto def = PrepareFprDef(function, vreg, 16);
+    const auto true_value =
+        MaterializeFprUse(function, inst.GetOperands()[1], 17, 9, os);
+    const auto false_value =
+        MaterializeFprUse(function, inst.GetOperands()[2], 18, 9, os);
+    const auto cmp_lhs =
+        MaterializeFprUse(function, inst.GetOperands()[3], 19, 10, os);
+    const auto cmp_rhs =
+        MaterializeFprUse(function, inst.GetOperands()[4], 20, 11, os);
+    os << "  fcmp " << cmp_lhs << ", " << cmp_rhs << "\n";
+    os << "  fcsel " << def.reg_name << ", " << true_value << ", "
+       << false_value << ", " << GetIntCondMnemonic(inst.GetCondCode()) << "\n";
+    FinalizeDef(function, vreg, def, os);
+    break;
+  }
+  // 整数路径继续输出 cmp + csel。
+}
+```
+
+安全点：
+
+```text
+PrepareFprDef 会处理目标虚拟寄存器是物理寄存器还是 spill slot。
+MaterializeFprUse 会处理实参来自物理寄存器、spill slot 或浮点常量。
+FinalizeDef 会在结果被 spill 时写回栈槽。
+```
+
+## 6. 本轮验证
+
+说明：本轮没有跑全量测试，只做了针对性验证和热点样例验证。
+
+执行过的验证：
+
+```bash
+cmake --build build_lab3 -j "$(nproc)"
+
+./scripts/verify_asm.sh test/test_case/h_performance/h-13-01.sy /tmp/fcsel_h13 --run
+./scripts/verify_asm.sh test/test_case/h_functional/39_fp_params.sy /tmp/fcsel_fp --run
+./scripts/verify_asm.sh test/test_case/h_functional/28_side_effect2.sy /tmp/fcsel_side --run
+```
+
+结果：
+
+```text
+h-13-01: PASS
+39_fp_params: PASS
+28_side_effect2: PASS
+```
+
+汇编命中确认：
+
+```text
+优化后 h-13-01.s 中出现：
+  fcmp s19, s20
+  fcsel s19, s19, s20, gt
+
+优化后没有继续出现：
+  bl max
+```
+
+性能观察：
+
+```text
+在当前机器上用 build/bin/compiler 和 build_lab3/bin/compiler 对比 h-13-01：
+旧版本大约 15.23s / 14.81s
+新版本大约 15.00s / 14.70s
+
+单次 qemu 性能波动较大，但该优化确实删除了内层循环里的函数调用。
+```
+
+## 7. 还值得继续做的优化
+
+### 7.1 h-9 系列：递归和除模
+
+说明：`h-9-02` 主要慢在递归 `fib` 和大量 `%`、`/` 运算。这类优化风险较高，不能写成样例名特判。
+
+源码形状：
+
+```c
+int fib(int c,int n,int d){
+    if(n == 0 || n==1){
+        return ((c+1)/2 + (d*2)%3);
+    }
+    return fib(c+1, n-1, (d+1)/2) +
+           fib((c-2)/2, n-2, (d-3)%2);
+}
+```
+
+可做方向：
+
+```text
+递归函数参数常量/范围特化。
+小递归函数局部展开，但要有深度和代码大小 guard。
+更强常数除法/取模 lowering，尤其负数取模的快速路径。
+```
+
+相关代码入口：
+
+```text
+src/ir/passes/Inline.cpp
+src/ir/passes/InterproceduralConstProp.cpp
+src/mir/AsmPrinter.cpp
+```
+
+### 7.2 crypto 系列：位运算和常数取模
+
+说明：`crypto-*` 中有大量 `x * 2^k + x % 2^k`、常数除法、常数取模、helper 函数。
+
+代表源码：
+
+```c
+int rotl5(int x) {
+  return x * 32 + x % 32;
+}
+
+int get_random() {
+  state = state + (state * 8192);
+  state = state + (state / 131072);
+  state = state + (state * 32);
+  return state;
+}
+```
+
+可做方向：
+
+```text
+识别更多 signed power-of-two remainder idiom。
+如果能证明非负，x % 2^k 可以转成 x & (2^k - 1)。
+识别 rotate-like idiom 时必须非常谨慎，因为 SysY 的负数 % 结果不是 unsigned low bits。
+```
+
+相关代码入口：
+
+```text
+src/ir/passes/ArithmeticSimplify.cpp
+src/mir/AsmPrinter.cpp
+src/ir/passes/ValueRangeAnalysis.cpp（如果后续新增）
+```
+
+### 7.3 gameoflife：stencil 行缓存
+
+说明：`gameoflife` 是典型 stencil，瓶颈通常是邻域访存。当前优化主要是标量和局部访存，缺少专门 stencil 行缓存。
+
+可做方向：
+
+```text
+识别二维数组相邻行访问。
+把上一行/当前行/下一行的局部值缓存成标量。
+保守处理边界和输出数组 alias。
+后续可以尝试 NEON，但要先确保标量行缓存正确。
+```
+
+相关代码入口：
+
+```text
+src/ir/passes/LoopMemoryUtils.h
+src/ir/passes/LICM.cpp
+src/ir/passes/LoopMemoryPromotion.cpp
+src/ir/passes/LoopUnroll.cpp
+```
+
+### 7.4 h-13/h-14：浮点循环和卷积
+
+说明：本轮已经补了 float max 的 `fcsel`，但卷积类样例仍有大量浮点乘加和二维数组访存。
+
+可做方向：
+
+```text
+浮点 FMul + FAdd 融合成 FMA，如果目标允许使用 fmadd。
+更强二维数组地址递推，减少每次 A[i+k][j+l] 的地址重算。
+小 kernel 循环 unroll，尤其 ks 固定为 15 时要谨慎控制代码大小。
+```
+
+相关代码入口：
+
+```text
+src/ir/passes/LoopStrengthReduction.cpp
+src/ir/passes/LoopUnroll.cpp
+src/mir/passes/Peephole.cpp
+src/mir/AsmPrinter.cpp
+```
+
+### 7.5 编译时间：避免无收益重复迭代
+
+说明：`validation elapsed` 偏高不全是我方 compiler 慢，脚本会为每个样例都重新编译并链接 `sylib.c`。但大函数上的 IR pass 仍应继续加 guard。
+
+当前已有 guard：
+
+```cpp
+const bool huge_cfg =
+    !disable_size_guard &&
+    (initial_shape.blocks > 1000 || initial_shape.instructions > 7000);
+const bool large_cfg =
+    !disable_size_guard &&
+    (huge_cfg || initial_shape.blocks > 300 || initial_shape.instructions > 2500);
+const int max_iterations = huge_cfg ? 3 : (large_cfg ? 5 : 8);
+```
+
+可做方向：
+
+```text
+给个别 loop pass 增加 per-function instruction growth cap。
+记录每轮 pass 是否持续无收益，动态关闭高成本 pass。
+测试脚本层面可以缓存 sylib.o，但这属于测评脚本优化，不属于生成代码质量优化。
+```
+
+## 8. 下一轮建议优先级
+
+建议顺序：
+
+```text
+1. FMul + FAdd -> FMA / fmadd，优先服务 h-13/h-14 浮点卷积。
+2. crypto 的 power-of-two rem/div idiom，需要先做非负证明或 value range。
+3. gameoflife stencil 行缓存，先做标量版本，不急着上 NEON。
+4. h-9 递归优化，只做通用递归展开/特化，不做样例名特判。
+5. RegAlloc spill 质量继续优化，重点看 h-11/h-14 是否有异常 spill/reload。
+```
+
+答辩时建议说法：
+
+```text
+当前优化重点已经从“能跑通后端”转向“热点样例针对性但非特判的通用优化”。
+本轮新增的 float select lowering 是对已有 int select lowering 的类型扩展，属于通用后端 lowering，不依赖样例名。
+```
--- a/study_materials/README.md
+++ b/study_materials/README.md
@ -0,0 +1,131 @@
+# NUDT Compiler 学习资料总览
+
+这套资料现在按下面的读者背景组织：
+
+```text
+你学过编译原理，知道词法、语法、语义、IR、优化、寄存器分配这些概念；
+但你没有完整参与过一个编译器工程的编码实现，也不一定熟悉大型 C++ 项目的阅读方式。
+```
+
+因此阅读目标不是马上独立重写代码，而是分三步：
+
+```text
+第一步：能从文件和函数名判断代码属于编译器哪个阶段。
+第二步：能把教材概念对应到本项目的类、函数和 pass。
+第三步：能在答辩或验收中解释“这个优化在哪里、为什么安全、怎么验证”。
+```
+
+不要一开始逐行读代码。正确方式是先看主流程，再看数据结构，再看一个最小样例如何穿过 pipeline，最后再读优化细节。
+
+## 推荐阅读顺序
+
+1. `00_prerequisites.md`
+   - 补齐编译原理、SSA、数据流分析、循环优化、寄存器分配、AArch64 的前置知识。
+
+2. `01_project_architecture.md`
+   - 建立整个项目的端到端结构：前端、语义、IR、中端、MIR、后端、测试脚本。
+
+3. `02_frontend_sema_irgen.md`
+   - 理解词法语法、语义检查、符号表、常量求值、IR 生成。
+
+4. `03_ir_core_and_analyses.md`
+   - 理解 IR 基础设施、Use-Def、CFG、DominatorTree、LoopInfo。
+
+5. `04_ir_optimization_passes.md`
+   - 理解当前中端优化，包括 Mem2Reg、GVN、LICM、内联、循环优化、Load/Store 消除。
+
+6. `05_backend_mir_and_codegen.md`
+   - 理解 MIR、指令选择、寄存器分配、栈帧、AArch64 汇编输出和后端 peephole。
+
+7. `06_testing_and_performance.md`
+   - 理解如何验证正确性、如何建立 GCC baseline、如何读 timing/speedup 表。
+
+8. `07_defense_qa.md`
+   - 答辩高频问题、建议回答方式、容易被追问的风险点。
+
+9. `08_file_index.md`
+   - 逐目录、逐文件索引，用来快速定位代码。
+
+10. `09_optimization_inventory.md`
+    - 所有当前已实现优化的总清单，逐项说明实现位置、触发条件和实现方式；答辩前应重点阅读。
+
+11. `10_learning_path.md`
+    - 从全链路、IR、分析、优化、后端到独立改 pass 的学习路线和掌握清单。
+
+12. `11_pass_writing_guide.md`
+    - 安全编写 IR/MIR pass 的流程、常见坑、pipeline 接入方式和优化提交模板。
+
+13. `12_debugging_playbook.md`
+    - 正确性和性能问题的定位流程，包含环境变量二分、敏感样例和回归策略。
+
+14. `13_aarch64_backend_cheatsheet.md`
+    - ARM/AArch64 后端速查，包括寄存器约定、常见指令、访存、分支和常数除法。
+
+15. `14_case_study_playbook.md`
+    - 针对 `h-11`、`h-4`、`crypto`、`matmul`、`gameoflife` 等热点样例的分析模板。
+
+16. `15_glossary.md`
+    - 编译器术语速查，覆盖前端、IR、分析、中端优化、后端和 AArch64 指令。
+
+17. `16_code_reading_for_non_coders.md`
+    - 给缺少工程编码经验的读者准备的代码阅读方法，包含 C++ 项目最低限度知识、如何顺着调用链读代码、如何看懂 pass。
+
+18. `17_code_companion.md`
+    - 源码伴读文档。按主流程、IR pass、循环优化、MIR pass、后端输出、测试脚本逐段贴真实代码并解释。
+
+19. `18_current_optimization_targets.md`
+    - 当前性能热点、本轮新增优化、验证结果和后续优化优先级。适合比赛优化复盘和答辩准备。
+
+## 如果你没实际写过大型 C++ 项目
+
+建议采用这个顺序：
+
+```text
+README.md
+16_code_reading_for_non_coders.md
+17_code_companion.md
+10_learning_path.md
+01_project_architecture.md
+08_file_index.md
+03_ir_core_and_analyses.md
+04_ir_optimization_passes.md
+05_backend_mir_and_codegen.md
+09_optimization_inventory.md
+18_current_optimization_targets.md
+07_defense_qa.md
+```
+
+每次读代码只回答四个问题：
+
+```text
+这个文件属于哪个阶段？
+这个函数的输入是什么？
+这个函数的输出或副作用是什么？
+它为什么不会破坏程序语义？
+```
+
+## 一句话架构
+
+当前编译器主流程是：
+
+```text
+SysY 源码
+  -> ANTLR 语法树
+  -> 语义分析和符号表
+  -> 自研 SSA-like IR
+  -> IR 中端优化流水线
+  -> MIR/AArch64 指令选择
+  -> MIR 优化、寄存器分配、栈帧生成
+  -> AArch64 汇编
+  -> gcc 链接 sylib 后用 qemu-aarch64 验证
+```
+
+## 答辩时最应该讲清楚的点
+
+- 代码不是只完成前端，而是有完整的自研 IR、中端优化、MIR 后端和 AArch64 汇编输出。
+- 正确性验证依赖 `verify_ir.sh`、`verify_asm.sh`、`verify_asm_all_time.sh` 等脚本。
+- 性能评估使用 GCC baseline，并以样例运行时间和 speedup 表定位优化方向。
+- 主要优化收益来自中端循环和内存优化、函数内联、尾递归消除、后端 peephole、访存和分支优化。
+- 当前仍可继续优化的方向包括更强别名分析、循环变换、后端寄存器分配和更系统的 AArch64 指令调度。
+- 具体优化清单见 `09_optimization_inventory.md`，其中也标明了哪些方向当前不能夸大为已完成。
+- 如果希望把文字说明和对应源码对照着读，优先读 `17_code_companion.md`，它是当前最详细的源码伴读版本。
--- a/sylib/sylib.c
+++ b/sylib/sylib.c
@ -1,8 +1,145 @@
 #include "sylib.h"

+#include <pthread.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>

+#if defined(__aarch64__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
+
+#ifndef NUDTC_PARALLEL_THREADS
+#define NUDTC_PARALLEL_THREADS 4  /* compile-time default for the requested thread count */
+#endif
+
+#ifndef NUDTC_PARALLEL_MAX_THREADS
+#define NUDTC_PARALLEL_MAX_THREADS 8  /* upper bound on the thread count; also sizes the task table */
+#endif
+
+#if NUDTC_PARALLEL_THREADS < 1
+#undef NUDTC_PARALLEL_THREADS
+#define NUDTC_PARALLEL_THREADS 1  /* clamp a non-positive override to a single thread */
+#endif
+
+#if NUDTC_PARALLEL_MAX_THREADS < NUDTC_PARALLEL_THREADS
+#undef NUDTC_PARALLEL_MAX_THREADS
+#define NUDTC_PARALLEL_MAX_THREADS NUDTC_PARALLEL_THREADS  /* keep the maximum at least as large as the default */
+#endif
+
+#ifndef NUDTC_PARALLEL_MIN_TRIP
+#define NUDTC_PARALLEL_MIN_TRIP 8192  /* default trip count below which a loop stays on the calling thread */
+#endif
+
+#ifndef NUDTC_PARALLEL_MIN_CHUNK
+#define NUDTC_PARALLEL_MIN_CHUNK 2048  /* default minimum iterations per thread before the thread count is reduced */
+#endif
+
+#define NUDTC_PARALLEL_WORKERS                                                \
+  (NUDTC_PARALLEL_MAX_THREADS > 1 ? NUDTC_PARALLEL_MAX_THREADS - 1 : 1)  /* capacity of the worker handle array; never zero */
+
+typedef void (*nudtc_parallel_body_i32)(int, int);  /* loop-body callback, invoked as body(begin, end) */
+
+struct nudtc_parallel_task {  /* one [begin, end) slice handed to a single thread */
+  nudtc_parallel_body_i32 body;
+  int begin;
+  int end;
+};
+
+static void nudtc_parallel_empty_body(int begin, int end) {  /* no-op body given to workers that receive no slice */
+  (void)begin;
+  (void)end;
+}
+
+static pthread_t nudtc_parallel_threads[NUDTC_PARALLEL_WORKERS];  /* handles of the workers created so far */
+static struct nudtc_parallel_task nudtc_parallel_tasks[NUDTC_PARALLEL_MAX_THREADS];  /* indexed by tid; slot 0 belongs to the dispatching thread */
+static pthread_mutex_t nudtc_parallel_lock = PTHREAD_MUTEX_INITIALIZER;  /* guards the task table and the counters below */
+static pthread_cond_t nudtc_parallel_start = PTHREAD_COND_INITIALIZER;  /* broadcast when a new generation is published */
+static pthread_cond_t nudtc_parallel_done = PTHREAD_COND_INITIALIZER;  /* signalled when nudtc_parallel_active drops to 0 */
+static int nudtc_parallel_initialized = 0;
+static int nudtc_parallel_worker_count = 0;  /* number of workers successfully created */
+static int nudtc_parallel_generation = 0;  /* incremented once per dispatch */
+static int nudtc_parallel_active = 0;  /* workers that have not yet finished the current generation */
+static __thread int nudtc_parallel_depth = 0;  /* thread-local; non-zero while this thread runs a slice, so nested calls run serially */
+
+/*
+ * Reads environment variable `name` as a base-10 integer.
+ *
+ * Returns the parsed value when the variable is set, non-empty, starts with
+ * at least one convertible digit and lies within [min_value, max_value];
+ * returns `fallback` in every other case.
+ */
+static int nudtc_parallel_read_int_env(const char* name, int fallback, int min_value,
+                                       int max_value) {
+  const char* text = getenv(name);
+  if (text != NULL && text[0] != '\0') {
+    char* stop = NULL;
+    const long parsed = strtol(text, &stop, 10);
+    /* stop == text means strtol consumed nothing, i.e. no number was found. */
+    if (stop != text && parsed >= min_value && parsed <= max_value) {
+      return (int)parsed;
+    }
+  }
+  return fallback;
+}
+
+/*
+ * Thread count requested for parallel loops: the NUDTC_PARALLEL_THREADS
+ * environment variable when it parses to a value in
+ * [1, NUDTC_PARALLEL_MAX_THREADS], otherwise the compile-time default.
+ */
+static int nudtc_parallel_requested_threads(void) {
+  const int compiled_default = NUDTC_PARALLEL_THREADS;
+  const int upper_bound = NUDTC_PARALLEL_MAX_THREADS;
+  return nudtc_parallel_read_int_env("NUDTC_PARALLEL_THREADS", compiled_default,
+                                     1, upper_bound);
+}
+
+/*
+ * Minimum trip count taken from the NUDTC_PARALLEL_MIN_TRIP environment
+ * variable when it parses to a value in [1, 2^30], otherwise the
+ * compile-time default.
+ */
+static int nudtc_parallel_min_trip(void) {
+  const int compiled_default = NUDTC_PARALLEL_MIN_TRIP;
+  const int upper_bound = 1 << 30;
+  return nudtc_parallel_read_int_env("NUDTC_PARALLEL_MIN_TRIP", compiled_default,
+                                     1, upper_bound);
+}
+
+/*
+ * Minimum per-thread chunk size taken from the NUDTC_PARALLEL_MIN_CHUNK
+ * environment variable when it parses to a value in [1, 2^30], otherwise the
+ * compile-time default.
+ */
+static int nudtc_parallel_min_chunk(void) {
+  const int compiled_default = NUDTC_PARALLEL_MIN_CHUNK;
+  const int upper_bound = 1 << 30;
+  return nudtc_parallel_read_int_env("NUDTC_PARALLEL_MIN_CHUNK", compiled_default,
+                                     1, upper_bound);
+}
+
+/*
+ * Generation each worker treats as "already handled" when it starts, indexed
+ * by tid.  nudtc_parallel_pool_init() writes the slot under
+ * nudtc_parallel_lock before creating the worker.  Without it, a worker
+ * created after the first dispatch would see a non-zero generation, wake
+ * immediately, run a task slot that was never filled for it and decrement
+ * nudtc_parallel_active for a generation it was not counted in.
+ */
+static int nudtc_parallel_start_generation[NUDTC_PARALLEL_MAX_THREADS];
+
+/*
+ * Worker loop for the pool thread whose tid is passed through `arg`.
+ *
+ * Each round waits until nudtc_parallel_generation differs from the last
+ * generation this worker handled, copies its own task slot while holding the
+ * lock, runs the slice outside the lock, then decrements
+ * nudtc_parallel_active and signals nudtc_parallel_done when it reaches 0.
+ * The loop never exits.
+ */
+static void* nudtc_parallel_pool_worker(void* arg) {
+  const int tid = (int)(long)arg;
+  /* Written before pthread_create() for this tid, so it is safe to read here. */
+  int seen_generation = nudtc_parallel_start_generation[tid];
+
+  for (;;) {
+    pthread_mutex_lock(&nudtc_parallel_lock);
+    while (nudtc_parallel_generation == seen_generation) {
+      pthread_cond_wait(&nudtc_parallel_start, &nudtc_parallel_lock);
+    }
+    seen_generation = nudtc_parallel_generation;
+    /* Copy the slot so the slice can run without holding the lock. */
+    const struct nudtc_parallel_task task = nudtc_parallel_tasks[tid];
+    pthread_mutex_unlock(&nudtc_parallel_lock);
+
+    /* Mark this thread as inside a slice so nested parallel calls run serially. */
+    nudtc_parallel_depth = 1;
+    task.body(task.begin, task.end);
+    nudtc_parallel_depth = 0;
+
+    pthread_mutex_lock(&nudtc_parallel_lock);
+    --nudtc_parallel_active;
+    if (nudtc_parallel_active == 0) {
+      pthread_cond_signal(&nudtc_parallel_done);
+    }
+    pthread_mutex_unlock(&nudtc_parallel_lock);
+  }
+  return NULL;
+}
+
+/*
+ * Grows the pool so that up to `requested_threads` threads (the caller plus
+ * workers) are available, never exceeding NUDTC_PARALLEL_MAX_THREADS.
+ *
+ * Workers get tids starting at 1.  Creation stops at the first
+ * pthread_create() failure; a later call tries again from the same tid.
+ * The whole function runs under nudtc_parallel_lock.
+ */
+static void nudtc_parallel_pool_init(int requested_threads) {
+  pthread_mutex_lock(&nudtc_parallel_lock);
+  nudtc_parallel_initialized = 1;
+  for (int tid = nudtc_parallel_worker_count + 1; tid < requested_threads; ++tid) {
+    if (tid >= NUDTC_PARALLEL_MAX_THREADS) {
+      break;
+    }
+    /*
+     * The generation cannot change while the lock is held, so the first
+     * dispatch this worker reacts to is the first one that counts it.
+     */
+    nudtc_parallel_start_generation[tid] = nudtc_parallel_generation;
+    pthread_t thread;
+    if (pthread_create(&thread, NULL, nudtc_parallel_pool_worker,
+                       (void*)(long)tid) != 0) {
+      break;
+    }
+    pthread_detach(thread);
+    nudtc_parallel_threads[nudtc_parallel_worker_count] = thread;
+    ++nudtc_parallel_worker_count;
+  }
+  pthread_mutex_unlock(&nudtc_parallel_lock);
+}
+
 static int read_char_normalized(void) {
  int ch = getchar();
  if (ch == '\r') {
@ -78,4 +215,257 @@ void putfarray(int n, const float a[]) {

 void starttime(void) {}

-void stoptime(void) {}
+void stoptime(void) {}
+
+/*
+ * Returns 1 when the two byte ranges of n * elem_size bytes starting at `lhs`
+ * and `rhs` do not overlap, 0 when they do.
+ *
+ * n <= 0 counts as disjoint.  A range whose end address wraps around is
+ * reported as overlapping.
+ */
+static int nudtc_ranges_disjoint(const void* lhs, const void* rhs, int n,
+                                 int elem_size) {
+  if (n <= 0) {
+    return 1;
+  }
+  const uintptr_t span = (uintptr_t)(unsigned int)n * (uintptr_t)elem_size;
+  const uintptr_t first_lo = (uintptr_t)lhs;
+  const uintptr_t second_lo = (uintptr_t)rhs;
+  const uintptr_t first_hi = first_lo + span;
+  const uintptr_t second_hi = second_lo + span;
+  /* An end below its start means the address computation wrapped. */
+  if (first_hi < first_lo || second_hi < second_lo) {
+    return 0;
+  }
+  if (first_hi <= second_lo) {
+    return 1;
+  }
+  return second_hi <= first_lo ? 1 : 0;
+}
+
+/*
+ * Returns non-zero when `dst` may be written four lanes at a time: each
+ * source must either be exactly `dst` or not overlap the n-element
+ * destination range at all.
+ */
+static int nudtc_can_vector_binary(const void* dst, const void* lhs,
+                                   const void* rhs, int n, int elem_size) {
+  const int lhs_ok = dst == lhs || nudtc_ranges_disjoint(dst, lhs, n, elem_size);
+  if (!lhs_ok) {
+    return 0;
+  }
+  return dst == rhs || nudtc_ranges_disjoint(dst, rhs, n, elem_size);
+}
+
+/*
+ * Element-wise dst[i] = lhs[i] + rhs[i] for i in [0, n).
+ *
+ * Does nothing when any pointer is NULL or n <= 0.  On AArch64/NEON builds,
+ * four elements are processed per step when nudtc_can_vector_binary()
+ * accepts the operands; the remaining elements (or all of them otherwise)
+ * go through the scalar loop.  The scalar loop adds in unsigned arithmetic
+ * so overflow wraps like the NEON lanes instead of being signed-overflow
+ * undefined behaviour.
+ */
+void __nudtc_neon_i32_add(int dst[], const int lhs[], const int rhs[], int n) {
+  if (dst == NULL || lhs == NULL || rhs == NULL || n <= 0) {
+    return;
+  }
+  int i = 0;
+#if defined(__aarch64__) || defined(__ARM_NEON)
+  if (nudtc_can_vector_binary(dst, lhs, rhs, n, 4)) {
+    /* n - i >= 4 cannot overflow, unlike i + 4 <= n for n near INT_MAX. */
+    for (; n - i >= 4; i += 4) {
+      const int32x4_t a = vld1q_s32(lhs + i);
+      const int32x4_t b = vld1q_s32(rhs + i);
+      vst1q_s32(dst + i, vaddq_s32(a, b));
+    }
+  }
+#endif
+  for (; i < n; ++i) {
+    dst[i] = (int)((unsigned int)lhs[i] + (unsigned int)rhs[i]);
+  }
+}
+
+/*
+ * Element-wise dst[i] = lhs[i] - rhs[i] for i in [0, n).
+ *
+ * Does nothing when any pointer is NULL or n <= 0.  On AArch64/NEON builds,
+ * four elements are processed per step when nudtc_can_vector_binary()
+ * accepts the operands; the remaining elements (or all of them otherwise)
+ * go through the scalar loop.  The scalar loop subtracts in unsigned
+ * arithmetic so overflow wraps like the NEON lanes instead of being
+ * signed-overflow undefined behaviour.
+ */
+void __nudtc_neon_i32_sub(int dst[], const int lhs[], const int rhs[], int n) {
+  if (dst == NULL || lhs == NULL || rhs == NULL || n <= 0) {
+    return;
+  }
+  int i = 0;
+#if defined(__aarch64__) || defined(__ARM_NEON)
+  if (nudtc_can_vector_binary(dst, lhs, rhs, n, 4)) {
+    /* n - i >= 4 cannot overflow, unlike i + 4 <= n for n near INT_MAX. */
+    for (; n - i >= 4; i += 4) {
+      const int32x4_t a = vld1q_s32(lhs + i);
+      const int32x4_t b = vld1q_s32(rhs + i);
+      vst1q_s32(dst + i, vsubq_s32(a, b));
+    }
+  }
+#endif
+  for (; i < n; ++i) {
+    dst[i] = (int)((unsigned int)lhs[i] - (unsigned int)rhs[i]);
+  }
+}
+
+/*
+ * Element-wise dst[i] = lhs[i] * rhs[i] for i in [0, n).
+ *
+ * Does nothing when any pointer is NULL or n <= 0.  On AArch64/NEON builds,
+ * four elements are processed per step when nudtc_can_vector_binary()
+ * accepts the operands; the remaining elements (or all of them otherwise)
+ * go through the scalar loop.  The scalar loop multiplies in unsigned
+ * arithmetic so the low 32 bits are kept, like the NEON lanes, instead of
+ * relying on signed-overflow undefined behaviour.
+ */
+void __nudtc_neon_i32_mul(int dst[], const int lhs[], const int rhs[], int n) {
+  if (dst == NULL || lhs == NULL || rhs == NULL || n <= 0) {
+    return;
+  }
+  int i = 0;
+#if defined(__aarch64__) || defined(__ARM_NEON)
+  if (nudtc_can_vector_binary(dst, lhs, rhs, n, 4)) {
+    /* n - i >= 4 cannot overflow, unlike i + 4 <= n for n near INT_MAX. */
+    for (; n - i >= 4; i += 4) {
+      const int32x4_t a = vld1q_s32(lhs + i);
+      const int32x4_t b = vld1q_s32(rhs + i);
+      vst1q_s32(dst + i, vmulq_s32(a, b));
+    }
+  }
+#endif
+  for (; i < n; ++i) {
+    dst[i] = (int)((unsigned int)lhs[i] * (unsigned int)rhs[i]);
+  }
+}
+
+/*
+ * Element-wise dst[i] = lhs[i] + rhs[i] over n floats.
+ *
+ * Returns immediately for a NULL pointer or n <= 0.  NEON builds handle
+ * groups of four when nudtc_can_vector_binary() allows it; whatever is left
+ * is finished one element at a time.
+ */
+void __nudtc_neon_f32_add(float dst[], const float lhs[], const float rhs[], int n) {
+  if (!dst || !lhs || !rhs || n <= 0) {
+    return;
+  }
+  int idx = 0;
+#if defined(__aarch64__) || defined(__ARM_NEON)
+  if (nudtc_can_vector_binary(dst, lhs, rhs, n, 4)) {
+    while (idx + 4 <= n) {
+      vst1q_f32(dst + idx, vaddq_f32(vld1q_f32(lhs + idx), vld1q_f32(rhs + idx)));
+      idx += 4;
+    }
+  }
+#endif
+  while (idx < n) {
+    dst[idx] = lhs[idx] + rhs[idx];
+    ++idx;
+  }
+}
+
+/*
+ * Element-wise dst[i] = lhs[i] - rhs[i] over n floats.
+ *
+ * Returns immediately for a NULL pointer or n <= 0.  NEON builds handle
+ * groups of four when nudtc_can_vector_binary() allows it; whatever is left
+ * is finished one element at a time.
+ */
+void __nudtc_neon_f32_sub(float dst[], const float lhs[], const float rhs[], int n) {
+  if (!dst || !lhs || !rhs || n <= 0) {
+    return;
+  }
+  int idx = 0;
+#if defined(__aarch64__) || defined(__ARM_NEON)
+  if (nudtc_can_vector_binary(dst, lhs, rhs, n, 4)) {
+    while (idx + 4 <= n) {
+      vst1q_f32(dst + idx, vsubq_f32(vld1q_f32(lhs + idx), vld1q_f32(rhs + idx)));
+      idx += 4;
+    }
+  }
+#endif
+  while (idx < n) {
+    dst[idx] = lhs[idx] - rhs[idx];
+    ++idx;
+  }
+}
+
+/*
+ * Element-wise dst[i] = lhs[i] * rhs[i] over n floats.
+ *
+ * Returns immediately for a NULL pointer or n <= 0.  NEON builds handle
+ * groups of four when nudtc_can_vector_binary() allows it; whatever is left
+ * is finished one element at a time.
+ */
+void __nudtc_neon_f32_mul(float dst[], const float lhs[], const float rhs[], int n) {
+  if (!dst || !lhs || !rhs || n <= 0) {
+    return;
+  }
+  int idx = 0;
+#if defined(__aarch64__) || defined(__ARM_NEON)
+  if (nudtc_can_vector_binary(dst, lhs, rhs, n, 4)) {
+    while (idx + 4 <= n) {
+      vst1q_f32(dst + idx, vmulq_f32(vld1q_f32(lhs + idx), vld1q_f32(rhs + idx)));
+      idx += 4;
+    }
+  }
+#endif
+  while (idx < n) {
+    dst[idx] = lhs[idx] * rhs[idx];
+    ++idx;
+  }
+}
+
+/*
+ * Stores `value` into dst[0..n).  Returns immediately when dst is NULL or
+ * n <= 0.  NEON builds write four elements per store first; the scalar loop
+ * finishes the rest.
+ */
+void __nudtc_neon_i32_fill(int dst[], int value, int n) {
+  if (!dst || n <= 0) {
+    return;
+  }
+  int idx = 0;
+#if defined(__aarch64__) || defined(__ARM_NEON)
+  const int32x4_t splat = vdupq_n_s32(value);
+  while (idx + 4 <= n) {
+    vst1q_s32(dst + idx, splat);
+    idx += 4;
+  }
+#endif
+  while (idx < n) {
+    dst[idx] = value;
+    ++idx;
+  }
+}
+
+/*
+ * Stores `value` into dst[0..n).  Returns immediately when dst is NULL or
+ * n <= 0.  NEON builds write four elements per store first; the scalar loop
+ * finishes the rest.
+ */
+void __nudtc_neon_f32_fill(float dst[], float value, int n) {
+  if (!dst || n <= 0) {
+    return;
+  }
+  int idx = 0;
+#if defined(__aarch64__) || defined(__ARM_NEON)
+  const float32x4_t splat = vdupq_n_f32(value);
+  while (idx + 4 <= n) {
+    vst1q_f32(dst + idx, splat);
+    idx += 4;
+  }
+#endif
+  while (idx < n) {
+    dst[idx] = value;
+    ++idx;
+  }
+}
+/*
+ * Runs `body` over the half-open range [begin, end), splitting the range
+ * across the thread pool when the loop is large enough.
+ *
+ * body(b, e) is called with consecutive sub-ranges that together cover
+ * [begin, end); the call returns only after nudtc_parallel_active has
+ * dropped back to 0, i.e. after every worker finished its slice.
+ *
+ * The whole range runs on the calling thread when the trip count is below
+ * nudtc_parallel_min_trip(), when the call is made from inside another
+ * slice, when fewer than two threads are requested or available, or when
+ * the minimum-chunk limit leaves fewer than two threads.
+ *
+ * NOTE(review): a NULL body, step != 1 or end <= begin returns without
+ * calling body at all; callers presumably only pass step == 1 -- confirm
+ * against the code generator.
+ */
+void __nudtc_parallel_for_i32(int begin, int end, int step,
+                              void (*body)(int, int)) {
+  if (body == NULL || step != 1 || end <= begin) {
+    return;
+  }
+
+  const int trip_count = end - begin;
+  if (trip_count < nudtc_parallel_min_trip()) {  /* too small to be worth dispatching */
+    body(begin, end);
+    return;
+  }
+
+  if (nudtc_parallel_depth != 0) {  /* already inside a slice: run nested loops serially */
+    body(begin, end);
+    return;
+  }
+
+  const int requested_threads = nudtc_parallel_requested_threads();
+  if (requested_threads < 2) {
+    body(begin, end);
+    return;
+  }
+
+  nudtc_parallel_pool_init(requested_threads);
+
+  int thread_count = nudtc_parallel_worker_count + 1;  /* created workers plus the calling thread */
+  if (thread_count > requested_threads) {
+    thread_count = requested_threads;
+  }
+  if (thread_count < 2) {  /* no worker could be created */
+    body(begin, end);
+    return;
+  }
+  if (thread_count > trip_count) {
+    thread_count = trip_count;
+  }
+  const int min_chunk = nudtc_parallel_min_chunk();
+  if (trip_count / thread_count < min_chunk) {  /* chunks would be too small: use fewer threads */
+    thread_count = (trip_count + min_chunk - 1) / min_chunk;
+    if (thread_count < 1) {
+      thread_count = 1;
+    }
+    if (thread_count > nudtc_parallel_worker_count + 1) {
+      thread_count = nudtc_parallel_worker_count + 1;
+    }
+  }
+  if (thread_count < 2) {
+    body(begin, end);
+    return;
+  }
+
+  pthread_mutex_lock(&nudtc_parallel_lock);
+  int chunk_begin = begin;
+  for (int tid = 0; tid < thread_count; ++tid) {  /* split what is left evenly, rounding up, among the remaining threads */
+    const int remaining = end - chunk_begin;
+    const int chunks_left = thread_count - tid;
+    const int chunk_size = (remaining + chunks_left - 1) / chunks_left;
+    nudtc_parallel_tasks[tid].body = body;
+    nudtc_parallel_tasks[tid].begin = chunk_begin;
+    nudtc_parallel_tasks[tid].end = chunk_begin + chunk_size;
+    chunk_begin = nudtc_parallel_tasks[tid].end;
+  }
+  for (int tid = thread_count; tid <= nudtc_parallel_worker_count; ++tid) {  /* unused workers still wake up, so give them a no-op */
+    nudtc_parallel_tasks[tid].body = nudtc_parallel_empty_body;
+    nudtc_parallel_tasks[tid].begin = begin;
+    nudtc_parallel_tasks[tid].end = begin;
+  }
+
+  const struct nudtc_parallel_task main_task = nudtc_parallel_tasks[0];
+  nudtc_parallel_active = nudtc_parallel_worker_count;  /* every worker decrements this once per generation */
+  ++nudtc_parallel_generation;  /* publishing a new generation is what releases the workers */
+  pthread_cond_broadcast(&nudtc_parallel_start);
+  pthread_mutex_unlock(&nudtc_parallel_lock);
+
+  nudtc_parallel_depth = 1;  /* the calling thread runs slice 0 itself */
+  main_task.body(main_task.begin, main_task.end);
+  nudtc_parallel_depth = 0;
+
+  pthread_mutex_lock(&nudtc_parallel_lock);
+  while (nudtc_parallel_active > 0) {  /* wait until the last worker signals completion */
+    pthread_cond_wait(&nudtc_parallel_done, &nudtc_parallel_lock);
+  }
+  pthread_mutex_unlock(&nudtc_parallel_lock);
+}
--- a/sylib/sylib.h
+++ b/sylib/sylib.h
@ -13,5 +13,15 @@ void putarray(int n, const int a[]);
 void putfarray(int n, const float a[]);
 void starttime(void);
 void stoptime(void);
+void __nudtc_parallel_for_i32(int begin, int end, int step,
+                              void (*body)(int, int));
+void __nudtc_neon_i32_add(int dst[], const int lhs[], const int rhs[], int n);
+void __nudtc_neon_i32_sub(int dst[], const int lhs[], const int rhs[], int n);
+void __nudtc_neon_i32_mul(int dst[], const int lhs[], const int rhs[], int n);
+void __nudtc_neon_f32_add(float dst[], const float lhs[], const float rhs[], int n);
+void __nudtc_neon_f32_sub(float dst[], const float lhs[], const float rhs[], int n);
+void __nudtc_neon_f32_mul(float dst[], const float lhs[], const float rhs[], int n);
+void __nudtc_neon_i32_fill(int dst[], int value, int n);
+void __nudtc_neon_f32_fill(float dst[], float value, int n);

-#endif
+#endif
Author	SHA1	Message	Date
tangttangtang	93ff6fad02	并行优化分支	1 week ago
tangttangtang	cbf1e6ba83	并行分支	1 week ago