diff --git a/include/mir/MIR.h b/include/mir/MIR.h index 72c6a08..3100efc 100644 --- a/include/mir/MIR.h +++ b/include/mir/MIR.h @@ -22,8 +22,10 @@ MIRContext& DefaultContext(); enum class PhysReg { W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, + W19, W20, W21, W22, W23, W24, X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X29, X30, SP, + X19, X20, X21, X22, X23, X24, S0, S1, S2, S3, S4, S5, S6, S7, // 单精度浮点寄存器 S8, S9, S10 }; @@ -236,6 +238,7 @@ class MachineModule { std::unique_ptr LowerToMIR(const ir::Module& module); void RunPeephole(MachineFunction& function); void RunRegAlloc(MachineFunction& function); +void RunLoopSlotPromotion(MachineFunction& function); void RunFrameLowering(MachineFunction& function); void PrintAsm(const MachineModule& module, std::ostream& os); diff --git a/src/ir/passes/CSE.cpp b/src/ir/passes/CSE.cpp index b6d4b9b..642e974 100644 --- a/src/ir/passes/CSE.cpp +++ b/src/ir/passes/CSE.cpp @@ -5,11 +5,18 @@ // 算法:在每个基本块内,使用哈希表记录已出现的表达式。 // 当遇到相同操作码 + 相同操作数的指令时,复用之前的结果。 // 这是局部 CSE(Local CSE),只在基本块内消除。 +// +// 对 Load 采用保守内存值编号:同一基本块内相同指针、且中间没有 +// 可能别名的 store/call 时才复用;同时支持 store 后紧跟同指针 load +// 的局部转发。 #include "ir/IR.h" +#include +#include #include #include +#include #include namespace ir { @@ -80,6 +87,243 @@ ExprKey MakeKey(Instruction* inst) { return key; } +bool IsDistinctLocalOrGlobalObject(Value* lhs, Value* rhs) { + if (lhs == rhs) return false; + const bool lhs_known = dynamic_cast(lhs) != nullptr || + dynamic_cast(lhs) != nullptr; + const bool rhs_known = dynamic_cast(rhs) != nullptr || + dynamic_cast(rhs) != nullptr; + return lhs_known && rhs_known; +} + +struct AffineExpr { + int64_t constant = 0; + std::vector> terms; + + bool operator==(const AffineExpr& other) const { + return constant == other.constant && terms == other.terms; + } +}; + +void Normalize(AffineExpr* expr) { + std::sort(expr->terms.begin(), expr->terms.end(), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + std::vector> normalized; + for (const auto& [value, coeff] : expr->terms) { + if (coeff == 0) continue; + if (!normalized.empty() && normalized.back().first == value) { + normalized.back().second += coeff; + if (normalized.back().second == 0) { + normalized.pop_back(); + } + } else { + normalized.push_back({value, coeff}); + } + } + expr->terms = std::move(normalized); +} + +bool ScaleAffine(const AffineExpr& input, int64_t scale, AffineExpr* out) { + out->constant = input.constant * scale; + out->terms.clear(); + out->terms.reserve(input.terms.size()); + for (const auto& [value, coeff] : input.terms) { + out->terms.push_back({value, coeff * scale}); + } + Normalize(out); + return true; +} + +bool BuildAffineExprImpl(Value* value, AffineExpr* out, + std::unordered_set& visiting, int depth) { + if (depth > 64) { + return false; + } + if (auto* constant = dynamic_cast(value)) { + out->constant = constant->GetValue(); + out->terms.clear(); + return true; + } + + auto* bin = dynamic_cast(value); + if (!bin) { + out->constant = 0; + out->terms = {{value, 1}}; + return true; + } + + if (!visiting.insert(value).second) { + return false; + } + + AffineExpr lhs; + AffineExpr rhs; + bool ok = false; + switch (bin->GetOpcode()) { + case Opcode::Add: + case Opcode::Sub: + if (!BuildAffineExprImpl(bin->GetLhs(), &lhs, visiting, depth + 1) || + !BuildAffineExprImpl(bin->GetRhs(), &rhs, visiting, depth + 1)) { + break; + } + out->constant = lhs.constant + + (bin->GetOpcode() == Opcode::Add ? rhs.constant + : -rhs.constant); + out->terms = lhs.terms; + for (const auto& [term, coeff] : rhs.terms) { + out->terms.push_back( + {term, bin->GetOpcode() == Opcode::Add ? coeff : -coeff}); + } + Normalize(out); + ok = true; + break; + case Opcode::Mul: { + auto* lhs_const = dynamic_cast(bin->GetLhs()); + auto* rhs_const = dynamic_cast(bin->GetRhs()); + if (lhs_const && + BuildAffineExprImpl(bin->GetRhs(), &rhs, visiting, depth + 1)) { + ok = ScaleAffine(rhs, lhs_const->GetValue(), out); + break; + } + if (rhs_const && + BuildAffineExprImpl(bin->GetLhs(), &lhs, visiting, depth + 1)) { + ok = ScaleAffine(lhs, rhs_const->GetValue(), out); + break; + } + break; + } + default: + out->constant = 0; + out->terms = {{value, 1}}; + ok = true; + break; + } + visiting.erase(value); + return ok; +} + +bool BuildAffineExpr(Value* value, AffineExpr* out) { + std::unordered_set visiting; + return BuildAffineExprImpl(value, out, visiting, 0); +} + +struct MemoryKey { + bool affine = false; + Value* exact = nullptr; + Value* base = nullptr; + AffineExpr index; + + bool operator==(const MemoryKey& other) const { + if (affine != other.affine) return false; + if (!affine) return exact == other.exact; + return base == other.base && index == other.index; + } +}; + +struct MemoryKeyHash { + size_t operator()(const MemoryKey& key) const { + if (!key.affine) { + return std::hash()(key.exact); + } + size_t h = std::hash()(key.base); + h ^= std::hash()(key.index.constant) + 0x9e3779b9 + (h << 6) + + (h >> 2); + for (const auto& [value, coeff] : key.index.terms) { + h ^= std::hash()(value) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash()(coeff) + 0x9e3779b9 + (h << 6) + (h >> 2); + } + return h; + } +}; + +bool BuildMemoryKey(Value* ptr, MemoryKey* key) { + if (auto* gep = dynamic_cast(ptr)) { + MemoryKey base_key; + if (!BuildMemoryKey(gep->GetBase(), &base_key)) { + return false; + } + if (!base_key.affine) { + key->affine = false; + key->exact = ptr; + return true; + } + AffineExpr index; + if (!BuildAffineExpr(gep->GetIndex(), &index)) { + key->affine = false; + key->exact = ptr; + return true; + } + key->affine = true; + key->exact = nullptr; + key->base = base_key.base; + key->index = base_key.index; + key->index.constant += index.constant; + key->index.terms.insert(key->index.terms.end(), index.terms.begin(), + index.terms.end()); + Normalize(&key->index); + return true; + } + + key->affine = true; + key->exact = nullptr; + key->base = ptr; + key->index = {}; + return true; +} + +bool SameAffineSlope(const AffineExpr& lhs, const AffineExpr& rhs) { + return lhs.terms == rhs.terms; +} + +bool MayAlias(const MemoryKey& lhs, const MemoryKey& rhs) { + if (lhs == rhs) return true; + if (lhs.affine && rhs.affine) { + if (lhs.base != rhs.base) { + return !IsDistinctLocalOrGlobalObject(lhs.base, rhs.base); + } + if (SameAffineSlope(lhs.index, rhs.index) && + lhs.index.constant != rhs.index.constant) { + return false; + } + return true; + } + return true; +} + +void ClearMemoryState( + std::unordered_map& load_values, + std::unordered_map& store_values) { + load_values.clear(); + store_values.clear(); +} + +void InvalidateMayAliasMemory( + std::unordered_map& load_values, + std::unordered_map& store_values, + const MemoryKey& store_key) { + for (auto it = load_values.begin(); it != load_values.end();) { + MemoryKey load_key; + BuildMemoryKey(it->first, &load_key); + if (MayAlias(load_key, store_key)) { + it = load_values.erase(it); + } else { + ++it; + } + } + + for (auto it = store_values.begin(); it != store_values.end();) { + MemoryKey prior_store_key; + BuildMemoryKey(it->first, &prior_store_key); + if (MayAlias(prior_store_key, store_key)) { + it = store_values.erase(it); + } else { + ++it; + } + } +} + } // namespace bool RunCSE(Function& func) { @@ -91,11 +335,42 @@ bool RunCSE(Function& func) { if (!bb) continue; std::unordered_map expr_map; + std::unordered_map load_values; + std::unordered_map store_values; std::vector to_remove; for (const auto& inst_ptr : bb->GetInstructions()) { auto* inst = inst_ptr.get(); + if (inst->GetOpcode() == Opcode::Call) { + ClearMemoryState(load_values, store_values); + } else if (auto* store = dynamic_cast(inst)) { + MemoryKey store_key; + BuildMemoryKey(store->GetPtr(), &store_key); + InvalidateMayAliasMemory(load_values, store_values, store_key); + store_values[store->GetPtr()] = store->GetValue(); + } + + if (auto* load = dynamic_cast(inst)) { + auto it = store_values.find(load->GetPtr()); + if (it != store_values.end() && it->second && + it->second->GetType()->GetKind() == load->GetType()->GetKind()) { + load->ReplaceAllUsesWith(it->second); + to_remove.push_back(load); + changed = true; + continue; + } + auto load_it = load_values.find(load->GetPtr()); + if (load_it != load_values.end()) { + load->ReplaceAllUsesWith(load_it->second); + to_remove.push_back(load); + changed = true; + } else { + load_values[load->GetPtr()] = load; + } + continue; + } + if (!IsCSECandidate(inst)) continue; ExprKey key = MakeKey(inst); diff --git a/src/main.cpp b/src/main.cpp index 160c06b..2cd6a2a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -159,6 +159,7 @@ int main(int argc, char** argv) { for (const auto& func_ptr : machine_module->GetFunctions()) { mir::RunPeephole(*func_ptr); mir::RunRegAlloc(*func_ptr); + mir::RunLoopSlotPromotion(*func_ptr); mir::RunFrameLowering(*func_ptr); mir::RunPeephole(*func_ptr); } diff --git a/src/mir/CMakeLists.txt b/src/mir/CMakeLists.txt index 0b0996b..0b74bf6 100644 --- a/src/mir/CMakeLists.txt +++ b/src/mir/CMakeLists.txt @@ -6,6 +6,7 @@ add_library(mir_core STATIC Register.cpp Lowering.cpp RegAlloc.cpp + LoopSlotPromotion.cpp FrameLowering.cpp AsmPrinter.cpp ) diff --git a/src/mir/FrameLowering.cpp b/src/mir/FrameLowering.cpp index 2a901a0..e01fab5 100644 --- a/src/mir/FrameLowering.cpp +++ b/src/mir/FrameLowering.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include "utils/Log.h" @@ -15,6 +16,12 @@ int AlignTo(int value, int align) { // 获取 W 寄存器对应的 X 寄存器 PhysReg WRegToXReg(PhysReg w) { + if (w == PhysReg::W19) return PhysReg::X19; + if (w == PhysReg::W20) return PhysReg::X20; + if (w == PhysReg::W21) return PhysReg::X21; + if (w == PhysReg::W22) return PhysReg::X22; + if (w == PhysReg::W23) return PhysReg::X23; + if (w == PhysReg::W24) return PhysReg::X24; int idx = static_cast(w) - static_cast(PhysReg::W0); if (idx >= 0 && idx <= 11) { return static_cast(static_cast(PhysReg::X0) + idx); @@ -22,12 +29,32 @@ PhysReg WRegToXReg(PhysReg w) { return w; } +std::unordered_set CollectUsedFrameSlots(const MachineFunction& function) { + std::unordered_set used; + for (const auto& bb_ptr : function.GetBlocks()) { + for (const auto& inst : bb_ptr->GetInstructions()) { + for (const auto& op : inst.GetOperands()) { + if (op.IsFrameIndex()) { + used.insert(op.GetFrameIndex()); + } + } + } + } + return used; +} + } // namespace void RunFrameLowering(MachineFunction& function) { + const auto used_frame_slots = CollectUsedFrameSlots(function); + // 计算栈槽偏移 int cursor = 0; for (const auto& slot : function.GetFrameSlots()) { + if (!used_frame_slots.count(slot.index)) { + function.GetFrameSlot(slot.index).offset = 0; + continue; + } cursor += slot.size; function.GetFrameSlot(slot.index).offset = -cursor; } @@ -38,7 +65,10 @@ void RunFrameLowering(MachineFunction& function) { for (size_t i = 0; i < callee_saved.size(); ++i) { PhysReg save_reg = callee_saved[i]; PhysReg x_reg = save_reg; - if (save_reg >= PhysReg::W0 && save_reg <= PhysReg::W11) { + if ((save_reg >= PhysReg::W0 && save_reg <= PhysReg::W11) || + save_reg == PhysReg::W19 || save_reg == PhysReg::W20 || + save_reg == PhysReg::W21 || save_reg == PhysReg::W22 || + save_reg == PhysReg::W23 || save_reg == PhysReg::W24) { x_reg = WRegToXReg(save_reg); } // 浮点 callee-saved 直接用 s 寄存器保存(4字节) diff --git a/src/mir/LoopSlotPromotion.cpp b/src/mir/LoopSlotPromotion.cpp new file mode 100644 index 0000000..a6134c2 --- /dev/null +++ b/src/mir/LoopSlotPromotion.cpp @@ -0,0 +1,623 @@ +#include "mir/MIR.h" + +#include +#include +#include +#include +#include +#include + +namespace mir { +namespace { + +bool IsControlTransfer(const MachineInstr& inst) { + switch (inst.GetOpcode()) { + case Opcode::B: + case Opcode::Bcond: + case Opcode::FBcond: + case Opcode::Cbnz: + case Opcode::Cbz: + case Opcode::Ret: + return true; + default: + return false; + } +} + +std::optional GetLoadSlot(const MachineInstr& inst) { + const auto& ops = inst.GetOperands(); + if (inst.GetOpcode() != Opcode::LoadStack || ops.size() < 2 || + !ops[1].IsFrameIndex()) { + return std::nullopt; + } + return ops[1].GetFrameIndex(); +} + +std::optional GetStoreSlot(const MachineInstr& inst) { + const auto& ops = inst.GetOperands(); + if (inst.GetOpcode() != Opcode::StoreStack || ops.size() < 2 || + !ops[1].IsFrameIndex()) { + return std::nullopt; + } + return ops[1].GetFrameIndex(); +} + +bool IsOpaqueSlotUse(const MachineInstr& inst, int* slot) { + const auto& ops = inst.GetOperands(); + switch (inst.GetOpcode()) { + case Opcode::LoadStackOffset: + case Opcode::StoreStackOffset: + case Opcode::LoadStackAddr: + if (ops.size() >= 2 && ops[1].IsFrameIndex()) { + *slot = ops[1].GetFrameIndex(); + return true; + } + return false; + default: + return false; + } +} + +bool SameReg(PhysReg lhs, PhysReg rhs) { + return lhs == rhs; +} + +bool IsPromotableWReg(PhysReg reg) { + if (reg >= PhysReg::W0 && reg <= PhysReg::W11) return true; + return reg == PhysReg::W19 || reg == PhysReg::W20 || reg == PhysReg::W21 || + reg == PhysReg::W22 || reg == PhysReg::W23 || reg == PhysReg::W24; +} + +bool IsPromotableXReg(PhysReg reg) { + if (reg >= PhysReg::X0 && reg <= PhysReg::X11) return true; + return reg == PhysReg::X19 || reg == PhysReg::X20 || reg == PhysReg::X21 || + reg == PhysReg::X22 || reg == PhysReg::X23 || reg == PhysReg::X24; +} + +bool IsPromotableSReg(PhysReg reg) { + return reg >= PhysReg::S0 && reg <= PhysReg::S10; +} + +size_t FirstTerminatorIndex(const std::vector& insts) { + for (size_t i = 0; i < insts.size(); ++i) { + if (IsControlTransfer(insts[i])) return i; + } + return insts.size(); +} + +void InsertBeforeTerminators(std::vector& insts, + const std::vector& inserted) { + const size_t pos = FirstTerminatorIndex(insts); + insts.insert(insts.begin() + static_cast(pos), inserted.begin(), + inserted.end()); +} + +struct SlotUseInfo { + enum class RegKind { Unknown, W, X, S, Invalid }; + + int slot = -1; + int loads = 0; + int stores = 0; + int body_loads = 0; + int body_stores = 0; + int after_call_uses = 0; + RegKind reg_kind = RegKind::Unknown; + std::unordered_set use_blocks; +}; + +struct SlotPick { + int slot = -1; + SlotUseInfo::RegKind reg_kind = SlotUseInfo::RegKind::Unknown; + bool write_back = true; +}; + +struct LoopCandidate { + size_t header = 0; + size_t latch = 0; + int score = 0; + std::vector slots; + std::unordered_set blocks; +}; + +struct Promotion { + int slot = -1; + PhysReg reg = PhysReg::W19; + SlotUseInfo::RegKind reg_kind = SlotUseInfo::RegKind::Unknown; + bool write_back = true; +}; + +SlotUseInfo::RegKind ClassifyPromotableReg(PhysReg reg) { + if (IsPromotableWReg(reg)) return SlotUseInfo::RegKind::W; + if (IsPromotableXReg(reg)) return SlotUseInfo::RegKind::X; + if (IsPromotableSReg(reg)) return SlotUseInfo::RegKind::S; + return SlotUseInfo::RegKind::Invalid; +} + +void NoteSlotRegUse(SlotUseInfo& info, PhysReg reg) { + SlotUseInfo::RegKind use_kind = ClassifyPromotableReg(reg); + if (use_kind == SlotUseInfo::RegKind::Invalid || + (info.reg_kind != SlotUseInfo::RegKind::Unknown && + info.reg_kind != use_kind)) { + info.reg_kind = SlotUseInfo::RegKind::Invalid; + return; + } + info.reg_kind = use_kind; +} + +int SlotScore(const SlotUseInfo& info) { + int score = (info.body_loads + info.body_stores) * 4 + info.loads + + info.stores; + if (info.stores == 0) { + score += 80 + info.body_loads * 6; + } + if (info.body_loads > 0 && info.body_stores > 0) { + score += info.use_blocks.size() > 1 ? 140 : 20; + } + if (info.use_blocks.size() > 1) { + score += static_cast(info.use_blocks.size() - 1) * 24; + } + if (info.reg_kind == SlotUseInfo::RegKind::S && info.after_call_uses > 0) { + score += 180 + info.after_call_uses * 8; + } + return score; +} + +PhysReg GprForIndex(SlotUseInfo::RegKind kind, size_t index) { + static const std::vector w_regs = {PhysReg::W19, PhysReg::W20, + PhysReg::W21, PhysReg::W22, + PhysReg::W23, PhysReg::W24}; + static const std::vector x_regs = {PhysReg::X19, PhysReg::X20, + PhysReg::X21, PhysReg::X22, + PhysReg::X23, PhysReg::X24}; + if (kind == SlotUseInfo::RegKind::X) return x_regs[index]; + return w_regs[index]; +} + +std::vector GetSuccessors( + const MachineFunction& function, size_t block_index, + const std::unordered_map& block_index_by_name) { + const auto& blocks = function.GetBlocks(); + const auto& insts = blocks[block_index]->GetInstructions(); + std::vector succs; + for (const auto& inst : insts) { + const auto& ops = inst.GetOperands(); + switch (inst.GetOpcode()) { + case Opcode::B: + case Opcode::Bcond: + case Opcode::FBcond: + if (!ops.empty() && ops[0].IsSymbol()) { + auto it = block_index_by_name.find(ops[0].GetSymbol()); + if (it != block_index_by_name.end()) succs.push_back(it->second); + } + break; + case Opcode::Cbnz: + case Opcode::Cbz: + if (ops.size() > 1 && ops[1].IsSymbol()) { + auto it = block_index_by_name.find(ops[1].GetSymbol()); + if (it != block_index_by_name.end()) succs.push_back(it->second); + } + break; + default: + break; + } + } + if (!insts.empty()) { + Opcode last = insts.back().GetOpcode(); + if (last != Opcode::B && last != Opcode::Ret && + block_index + 1 < blocks.size()) { + succs.push_back(block_index + 1); + } + } + std::sort(succs.begin(), succs.end()); + succs.erase(std::unique(succs.begin(), succs.end()), succs.end()); + return succs; +} + +bool InLoop(const LoopCandidate& loop, size_t index) { + return loop.blocks.count(index) != 0; +} + +std::vector SortedLoopBlocks(const LoopCandidate& loop) { + std::vector blocks(loop.blocks.begin(), loop.blocks.end()); + std::sort(blocks.begin(), blocks.end()); + return blocks; +} + +std::vector> BuildSuccessors( + const MachineFunction& function, + const std::unordered_map& block_index_by_name) { + std::vector> succs(function.GetBlocks().size()); + for (size_t i = 0; i < succs.size(); ++i) { + succs[i] = GetSuccessors(function, i, block_index_by_name); + } + return succs; +} + +std::vector> BuildPredecessors( + const std::vector>& succs) { + std::vector> preds(succs.size()); + for (size_t i = 0; i < succs.size(); ++i) { + for (size_t succ : succs[i]) { + preds[succ].push_back(i); + } + } + for (auto& pred_list : preds) { + std::sort(pred_list.begin(), pred_list.end()); + pred_list.erase(std::unique(pred_list.begin(), pred_list.end()), + pred_list.end()); + } + return preds; +} + +std::vector> ComputeDominators( + size_t block_count, const std::vector>& preds) { + std::vector> doms(block_count); + if (block_count == 0) return doms; + + doms[0].insert(0); + for (size_t i = 1; i < block_count; ++i) { + for (size_t j = 0; j < block_count; ++j) doms[i].insert(j); + } + + bool changed = true; + while (changed) { + changed = false; + for (size_t block = 1; block < block_count; ++block) { + std::set next; + bool first_pred = true; + for (size_t pred : preds[block]) { + if (first_pred) { + next = doms[pred]; + first_pred = false; + continue; + } + std::set intersection; + std::set_intersection(next.begin(), next.end(), doms[pred].begin(), + doms[pred].end(), + std::inserter(intersection, + intersection.begin())); + next = std::move(intersection); + } + next.insert(block); + if (next != doms[block]) { + doms[block] = std::move(next); + changed = true; + } + } + } + return doms; +} + +std::unordered_set BuildNaturalLoop( + size_t header, size_t latch, + const std::vector>& preds) { + std::unordered_set loop_blocks; + std::vector worklist; + loop_blocks.insert(header); + loop_blocks.insert(latch); + worklist.push_back(latch); + + while (!worklist.empty()) { + size_t block = worklist.back(); + worklist.pop_back(); + for (size_t pred : preds[block]) { + if (loop_blocks.insert(pred).second && pred != header) { + worklist.push_back(pred); + } + } + } + return loop_blocks; +} + +bool HasSingleEntry(size_t header, const std::unordered_set& loop_blocks, + const std::vector>& preds) { + for (size_t block : loop_blocks) { + if (block == header) continue; + for (size_t pred : preds[block]) { + if (loop_blocks.count(pred) == 0) return false; + } + } + return true; +} + +std::vector FindLoopCandidates(MachineFunction& function) { + const auto& blocks = function.GetBlocks(); + std::unordered_map block_index_by_name; + for (size_t i = 0; i < blocks.size(); ++i) { + block_index_by_name[blocks[i]->GetName()] = i; + } + + std::unordered_set opaque_slots; + for (const auto& bb : blocks) { + for (const auto& inst : bb->GetInstructions()) { + int slot = -1; + if (IsOpaqueSlotUse(inst, &slot)) opaque_slots.insert(slot); + } + } + + auto succs = BuildSuccessors(function, block_index_by_name); + auto preds = BuildPredecessors(succs); + auto doms = ComputeDominators(blocks.size(), preds); + + std::vector candidates; + for (size_t latch = 0; latch < blocks.size(); ++latch) { + for (size_t header : succs[latch]) { + if (header == latch) continue; + if (header >= doms.size() || doms[latch].count(header) == 0) continue; + + auto loop_blocks = BuildNaturalLoop(header, latch, preds); + if (loop_blocks.size() > 24) continue; + if (!HasSingleEntry(header, loop_blocks, preds)) continue; + + std::unordered_map slot_info; + for (size_t bi : loop_blocks) { + bool seen_call = false; + for (const auto& cur : blocks[bi]->GetInstructions()) { + if (cur.GetOpcode() == Opcode::Bl) { + seen_call = true; + } + if (auto slot = GetLoadSlot(cur); + slot.has_value() && !opaque_slots.count(*slot)) { + auto& info = slot_info[*slot]; + info.slot = *slot; + const auto& ops = cur.GetOperands(); + if (ops.empty() || !ops[0].IsReg()) { + info.reg_kind = SlotUseInfo::RegKind::Invalid; + } else { + NoteSlotRegUse(info, ops[0].GetReg()); + } + ++info.loads; + info.use_blocks.insert(bi); + if (seen_call) ++info.after_call_uses; + if (bi != header) ++info.body_loads; + } + if (auto slot = GetStoreSlot(cur); + slot.has_value() && !opaque_slots.count(*slot)) { + auto& info = slot_info[*slot]; + info.slot = *slot; + const auto& ops = cur.GetOperands(); + if (ops.empty() || !ops[0].IsReg()) { + info.reg_kind = SlotUseInfo::RegKind::Invalid; + } else { + NoteSlotRegUse(info, ops[0].GetReg()); + } + ++info.stores; + info.use_blocks.insert(bi); + if (seen_call) ++info.after_call_uses; + if (bi != header) ++info.body_stores; + } + } + } + + std::vector ranked; + for (const auto& [slot, info] : slot_info) { + if (info.reg_kind == SlotUseInfo::RegKind::Invalid || + info.reg_kind == SlotUseInfo::RegKind::Unknown) { + continue; + } + const int slot_size = function.GetFrameSlot(slot).size; + if (info.reg_kind == SlotUseInfo::RegKind::X) { + if (slot_size != 8) continue; + } else if (slot_size != 4) { + continue; + } + if (info.loads == 0) continue; + if (info.stores == 0 && info.loads < 2) continue; + if (info.stores > 0 && info.loads + info.stores < 2) continue; + ranked.push_back(info); + } + std::sort(ranked.begin(), ranked.end(), + [](const SlotUseInfo& lhs, const SlotUseInfo& rhs) { + int lhs_score = SlotScore(lhs); + int rhs_score = SlotScore(rhs); + if (lhs_score != rhs_score) return lhs_score > rhs_score; + return lhs.slot < rhs.slot; + }); + if (ranked.empty()) continue; + + LoopCandidate cand; + cand.header = header; + cand.latch = latch; + cand.blocks = std::move(loop_blocks); + int gpr_slots = 0; + int s_slots = 0; + constexpr int kMaxGprSlots = 6; + constexpr int kMaxSSlots = 3; + for (const auto& info : ranked) { + if (info.reg_kind == SlotUseInfo::RegKind::W || + info.reg_kind == SlotUseInfo::RegKind::X) { + if (gpr_slots >= kMaxGprSlots) continue; + ++gpr_slots; + } else if (info.reg_kind == SlotUseInfo::RegKind::S) { + if (s_slots >= kMaxSSlots) continue; + ++s_slots; + } else { + continue; + } + cand.slots.push_back( + SlotPick{info.slot, info.reg_kind, info.stores > 0}); + cand.score += SlotScore(info); + } + if (cand.slots.empty()) continue; + candidates.push_back(std::move(cand)); + } + } + std::sort(candidates.begin(), candidates.end(), + [](const LoopCandidate& lhs, const LoopCandidate& rhs) { + if (lhs.score != rhs.score) return lhs.score > rhs.score; + if (lhs.blocks.size() != rhs.blocks.size()) { + return lhs.blocks.size() > rhs.blocks.size(); + } + return lhs.header < rhs.header; + }); + return candidates; +} + +void PromoteLoopSlots(MachineFunction& function, const LoopCandidate& loop) { + const std::vector s_regs = {PhysReg::S8, PhysReg::S9, + PhysReg::S10}; + std::unordered_map slot_to_promotion; + std::vector promotions; + size_t next_gpr_reg = 0; + size_t next_s_reg = 0; + for (const auto& slot : loop.slots) { + PhysReg reg = PhysReg::W19; + if (slot.reg_kind == SlotUseInfo::RegKind::W || + slot.reg_kind == SlotUseInfo::RegKind::X) { + if (next_gpr_reg >= 6) continue; + reg = GprForIndex(slot.reg_kind, next_gpr_reg++); + } else if (slot.reg_kind == SlotUseInfo::RegKind::S) { + if (next_s_reg >= s_regs.size()) continue; + reg = s_regs[next_s_reg++]; + } else { + continue; + } + Promotion promotion{slot.slot, reg, slot.reg_kind, slot.write_back}; + slot_to_promotion[slot.slot] = promotion; + promotions.push_back(promotion); + function.AddUsedCalleeSaved(reg); + } + + const auto& blocks = function.GetBlocks(); + std::unordered_map block_index_by_name; + for (size_t i = 0; i < blocks.size(); ++i) { + block_index_by_name[blocks[i]->GetName()] = i; + } + auto succs = BuildSuccessors(function, block_index_by_name); + auto preds = BuildPredecessors(succs); + + for (size_t bi : SortedLoopBlocks(loop)) { + auto& insts = blocks[bi]->GetInstructions(); + std::vector rewritten; + rewritten.reserve(insts.size()); + for (const auto& inst : insts) { + if (auto slot = GetLoadSlot(inst); slot.has_value()) { + auto it = slot_to_promotion.find(*slot); + if (it != slot_to_promotion.end()) { + const auto& ops = inst.GetOperands(); + PhysReg dst = ops[0].GetReg(); + if (!SameReg(dst, it->second.reg)) { + Opcode mov_opcode = + it->second.reg_kind == SlotUseInfo::RegKind::S + ? Opcode::FMovReg + : Opcode::MovReg; + rewritten.emplace_back( + mov_opcode, + std::vector{Operand::Reg(dst), + Operand::Reg(it->second.reg)}); + } + continue; + } + } + if (auto slot = GetStoreSlot(inst); slot.has_value()) { + auto it = slot_to_promotion.find(*slot); + if (it != slot_to_promotion.end()) { + const auto& ops = inst.GetOperands(); + PhysReg src = ops[0].GetReg(); + if (!SameReg(src, it->second.reg)) { + Opcode mov_opcode = + it->second.reg_kind == SlotUseInfo::RegKind::S + ? Opcode::FMovReg + : Opcode::MovReg; + rewritten.emplace_back( + mov_opcode, + std::vector{Operand::Reg(it->second.reg), + Operand::Reg(src)}); + } + continue; + } + } + rewritten.push_back(inst); + } + insts = std::move(rewritten); + } + + for (size_t pred = 0; pred < blocks.size(); ++pred) { + if (std::find(succs[pred].begin(), succs[pred].end(), loop.header) == + succs[pred].end()) { + continue; + } + if (InLoop(loop, pred)) continue; + std::vector loads; + for (const auto& promotion : promotions) { + loads.emplace_back(Opcode::LoadStack, + std::vector{ + Operand::Reg(promotion.reg), + Operand::FrameIndex(promotion.slot)}); + } + InsertBeforeTerminators(blocks[pred]->GetInstructions(), loads); + } + + std::unordered_set exit_blocks_with_stores; + for (size_t bi : SortedLoopBlocks(loop)) { + bool needs_local_exit_store = false; + for (size_t succ : succs[bi]) { + if (InLoop(loop, succ)) continue; + + bool exit_has_only_loop_preds = true; + for (size_t pred : preds[succ]) { + if (!InLoop(loop, pred)) { + exit_has_only_loop_preds = false; + break; + } + } + if (exit_has_only_loop_preds) { + if (exit_blocks_with_stores.insert(succ).second) { + std::vector stores; + for (const auto& promotion : promotions) { + if (!promotion.write_back) continue; + stores.emplace_back( + Opcode::StoreStack, + std::vector{ + Operand::Reg(promotion.reg), + Operand::FrameIndex(promotion.slot)}); + } + auto& exit_insts = blocks[succ]->GetInstructions(); + exit_insts.insert(exit_insts.begin(), stores.begin(), stores.end()); + } + } else { + needs_local_exit_store = true; + } + } + if (!needs_local_exit_store) continue; + std::vector stores; + for (const auto& promotion : promotions) { + if (!promotion.write_back) continue; + stores.emplace_back(Opcode::StoreStack, + std::vector{ + Operand::Reg(promotion.reg), + Operand::FrameIndex(promotion.slot)}); + } + InsertBeforeTerminators(blocks[bi]->GetInstructions(), stores); + } +} + +} // namespace + +void RunLoopSlotPromotion(MachineFunction& function) { + auto candidates = FindLoopCandidates(function); + std::unordered_set promoted_blocks; + int promoted_loop_count = 0; + constexpr int kMaxPromotedLoops = 4; + constexpr int kMinLoopScore = 32; + + for (const auto& loop : candidates) { + if (loop.score < kMinLoopScore) break; + + bool overlaps_existing_loop = false; + for (size_t block : loop.blocks) { + if (promoted_blocks.count(block) != 0) { + overlaps_existing_loop = true; + break; + } + } + if (overlaps_existing_loop) continue; + + PromoteLoopSlots(function, loop); + promoted_blocks.insert(loop.blocks.begin(), loop.blocks.end()); + ++promoted_loop_count; + if (promoted_loop_count >= kMaxPromotedLoops) break; + } +} + +} // namespace mir diff --git a/src/mir/MIRFunction.cpp b/src/mir/MIRFunction.cpp index a267dc7..a54cdc7 100644 --- a/src/mir/MIRFunction.cpp +++ b/src/mir/MIRFunction.cpp @@ -15,6 +15,12 @@ PhysReg CanonicalCalleeSavedReg(PhysReg reg) { int idx = static_cast(reg) - static_cast(PhysReg::W0); return static_cast(static_cast(PhysReg::X0) + idx); } + if (reg == PhysReg::W19) return PhysReg::X19; + if (reg == PhysReg::W20) return PhysReg::X20; + if (reg == PhysReg::W21) return PhysReg::X21; + if (reg == PhysReg::W22) return PhysReg::X22; + if (reg == PhysReg::W23) return PhysReg::X23; + if (reg == PhysReg::W24) return PhysReg::X24; return reg; } diff --git a/src/mir/Register.cpp b/src/mir/Register.cpp index 6e97788..3deb6d9 100644 --- a/src/mir/Register.cpp +++ b/src/mir/Register.cpp @@ -20,6 +20,12 @@ const char* PhysRegName(PhysReg reg) { case PhysReg::W9: return "w9"; case PhysReg::W10: return "w10"; case PhysReg::W11: return "w11"; + case PhysReg::W19: return "w19"; + case PhysReg::W20: return "w20"; + case PhysReg::W21: return "w21"; + case PhysReg::W22: return "w22"; + case PhysReg::W23: return "w23"; + case PhysReg::W24: return "w24"; case PhysReg::X0: return "x0"; case PhysReg::X1: return "x1"; case PhysReg::X2: return "x2"; @@ -35,6 +41,12 @@ const char* PhysRegName(PhysReg reg) { case PhysReg::X29: return "x29"; case PhysReg::X30: return "x30"; case PhysReg::SP: return "sp"; + case PhysReg::X19: return "x19"; + case PhysReg::X20: return "x20"; + case PhysReg::X21: return "x21"; + case PhysReg::X22: return "x22"; + case PhysReg::X23: return "x23"; + case PhysReg::X24: return "x24"; case PhysReg::S0: return "s0"; case PhysReg::S1: return "s1"; case PhysReg::S2: return "s2"; diff --git a/src/mir/passes/Peephole.cpp b/src/mir/passes/Peephole.cpp index a1408c9..df6a9fa 100644 --- a/src/mir/passes/Peephole.cpp +++ b/src/mir/passes/Peephole.cpp @@ -1,6 +1,9 @@ #include "mir/MIR.h" +#include #include +#include +#include #include #include @@ -24,7 +27,13 @@ bool IsAbiArgReg(PhysReg reg) { bool IsWxReg(PhysReg reg) { return (reg >= PhysReg::W0 && reg <= PhysReg::W10) || - (reg >= PhysReg::X0 && reg <= PhysReg::X10); + (reg >= PhysReg::X0 && reg <= PhysReg::X10) || + reg == PhysReg::W19 || reg == PhysReg::W20 || + reg == PhysReg::W21 || reg == PhysReg::W22 || + reg == PhysReg::W23 || reg == PhysReg::W24 || + reg == PhysReg::X19 || reg == PhysReg::X20 || + reg == PhysReg::X21 || reg == PhysReg::X22 || + reg == PhysReg::X23 || reg == PhysReg::X24; } int WxIndex(PhysReg reg) { @@ -34,6 +43,12 @@ int WxIndex(PhysReg reg) { if (reg >= PhysReg::X0 && reg <= PhysReg::X10) { return static_cast(reg) - static_cast(PhysReg::X0); } + if (reg == PhysReg::W19 || reg == PhysReg::X19) return 19; + if (reg == PhysReg::W20 || reg == PhysReg::X20) return 20; + if (reg == PhysReg::W21 || reg == PhysReg::X21) return 21; + if (reg == PhysReg::W22 || reg == PhysReg::X22) return 22; + if (reg == PhysReg::W23 || reg == PhysReg::X23) return 23; + if (reg == PhysReg::W24 || reg == PhysReg::X24) return 24; return -1; } @@ -223,11 +238,17 @@ void RecordStore(std::unordered_map& slot_to_reg, } bool IsWReg(PhysReg reg) { - return reg >= PhysReg::W0 && reg <= PhysReg::W11; + return (reg >= PhysReg::W0 && reg <= PhysReg::W11) || + reg == PhysReg::W19 || reg == PhysReg::W20 || + reg == PhysReg::W21 || reg == PhysReg::W22 || + reg == PhysReg::W23 || reg == PhysReg::W24; } bool IsXReg(PhysReg reg) { return (reg >= PhysReg::X0 && reg <= PhysReg::X11) || + reg == PhysReg::X19 || reg == PhysReg::X20 || + reg == PhysReg::X21 || reg == PhysReg::X22 || + reg == PhysReg::X23 || reg == PhysReg::X24 || reg == PhysReg::X29 || reg == PhysReg::X30; } @@ -303,6 +324,507 @@ bool IsNoopImmArithmetic(const MachineInstr& inst) { return ops[2].GetImm() == 0 && RegAlias(ops[0].GetReg(), ops[1].GetReg()); } +std::optional GetFrameIndexOperand(const MachineInstr& inst, size_t idx) { + const auto& ops = inst.GetOperands(); + if (idx >= ops.size() || ops[idx].GetKind() != Operand::Kind::FrameIndex) { + return std::nullopt; + } + return ops[idx].GetFrameIndex(); +} + +bool IsControlTransfer(const MachineInstr& inst) { + switch (inst.GetOpcode()) { + case Opcode::B: + case Opcode::Bcond: + case Opcode::FBcond: + case Opcode::Cbnz: + case Opcode::Cbz: + case Opcode::Ret: + return true; + default: + return false; + } +} + +bool MayTouchFrameSlot(const MachineInstr& inst, int slot) { + switch (inst.GetOpcode()) { + case Opcode::LoadStack: + case Opcode::StoreStack: + case Opcode::LoadStackOffset: + case Opcode::StoreStackOffset: + case Opcode::LoadStackAddr: { + auto inst_slot = GetFrameIndexOperand(inst, 1); + return inst_slot.has_value() && *inst_slot == slot; + } + default: + return false; + } +} + +std::optional GetLoadStackSlot(const MachineInstr& inst) { + if (inst.GetOpcode() != Opcode::LoadStack) { + return std::nullopt; + } + return GetFrameIndexOperand(inst, 1); +} + +std::optional GetStoreStackSlot(const MachineInstr& inst) { + if (inst.GetOpcode() != Opcode::StoreStack) { + return std::nullopt; + } + return GetFrameIndexOperand(inst, 1); +} + +bool IsStoreOverwrittenBeforeRead(const std::vector& insts, + size_t store_index) { + const auto slot = GetFrameIndexOperand(insts[store_index], 1); + if (!slot.has_value()) { + return false; + } + + for (size_t i = store_index + 1; i < insts.size(); ++i) { + const auto& inst = insts[i]; + if (IsControlTransfer(inst) || inst.GetOpcode() == Opcode::Bl) { + return false; + } + if (!MayTouchFrameSlot(inst, *slot)) { + continue; + } + if (inst.GetOpcode() == Opcode::StoreStack) { + return true; + } + return false; + } + return false; +} + +void RemoveOverwrittenStores(std::vector& insts) { + std::vector filtered; + filtered.reserve(insts.size()); + for (size_t i = 0; i < insts.size(); ++i) { + if (IsStoreStack(insts[i]) && IsStoreOverwrittenBeforeRead(insts, i)) { + continue; + } + filtered.push_back(std::move(insts[i])); + } + insts = std::move(filtered); +} + +bool IsOpaqueFrameSlotUse(const MachineInstr& inst, int* slot) { + switch (inst.GetOpcode()) { + case Opcode::LoadStackOffset: + case Opcode::StoreStackOffset: + case Opcode::LoadStackAddr: { + auto frame_index = GetFrameIndexOperand(inst, 1); + if (!frame_index.has_value()) { + return false; + } + *slot = *frame_index; + return true; + } + default: + return false; + } +} + +bool HasFrameSlotTouch(const std::vector& insts, size_t begin, + size_t end, int slot) { + end = std::min(end, insts.size()); + for (size_t i = begin; i < end; ++i) { + if (MayTouchFrameSlot(insts[i], slot)) { + return true; + } + } + return false; +} + +bool IsStackCopyTail(const std::vector& insts, size_t begin) { + for (size_t i = begin; i < insts.size(); ++i) { + const auto opcode = insts[i].GetOpcode(); + if (IsControlTransfer(insts[i])) { + continue; + } + if (opcode != Opcode::LoadStack && opcode != Opcode::StoreStack) { + return false; + } + } + return true; +} + +bool LoadedRegUsedAfterRemovedStore(const std::vector& insts, + size_t begin, PhysReg reg) { + for (size_t i = begin; i < insts.size(); ++i) { + if (IsControlTransfer(insts[i])) { + continue; + } + if (ReadsReg(insts[i], reg)) { + return true; + } + if (auto written = GetWrittenReg(insts[i]); + written.has_value() && RegAlias(*written, reg)) { + return false; + } + } + return false; +} + +bool RegTouched(const MachineInstr& inst, PhysReg reg) { + if (ReadsReg(inst, reg)) return true; + if (auto written = GetWrittenReg(inst); + written.has_value() && RegAlias(*written, reg)) { + return true; + } + return false; +} + +std::unordered_map> +BuildSuccessorMap(const MachineFunction& function) { + std::unordered_map> succs; + const auto& blocks = function.GetBlocks(); + auto find_block = [&](const std::string& name) -> const MachineBasicBlock* { + for (const auto& candidate : blocks) { + if (candidate->GetName() == name) { + return candidate.get(); + } + } + return nullptr; + }; + for (size_t bi = 0; bi < blocks.size(); ++bi) { + const auto* bb = blocks[bi].get(); + auto& out = succs[bb]; + const auto& insts = bb->GetInstructions(); + for (const auto& inst : insts) { + switch (inst.GetOpcode()) { + case Opcode::B: + case Opcode::Bcond: + case Opcode::FBcond: { + const auto& ops = inst.GetOperands(); + if (!ops.empty() && ops[0].IsSymbol()) { + if (auto* target = find_block(ops[0].GetSymbol())) { + out.push_back(target); + } + } + break; + } + case Opcode::Cbnz: + case Opcode::Cbz: { + const auto& ops = inst.GetOperands(); + if (ops.size() > 1 && ops[1].IsSymbol()) { + if (auto* target = find_block(ops[1].GetSymbol())) { + out.push_back(target); + } + } + break; + } + default: + break; + } + } + if (!insts.empty()) { + Opcode last = insts.back().GetOpcode(); + if (last != Opcode::B && last != Opcode::Ret && bi + 1 < blocks.size()) { + out.push_back(blocks[bi + 1].get()); + } + } + std::sort(out.begin(), out.end()); + out.erase(std::unique(out.begin(), out.end()), out.end()); + } + return succs; +} + +void CountScalarStackAccesses(const MachineFunction& function, + const std::unordered_set& opaque_slots, + std::unordered_map& load_count, + std::unordered_map& store_count) { + load_count.clear(); + store_count.clear(); + for (const auto& bb_ptr : function.GetBlocks()) { + for (const auto& inst : bb_ptr->GetInstructions()) { + if (auto slot = GetLoadStackSlot(inst); + slot.has_value() && !opaque_slots.count(*slot)) { + ++load_count[*slot]; + } + if (auto slot = GetStoreStackSlot(inst); + slot.has_value() && !opaque_slots.count(*slot)) { + ++store_count[*slot]; + } + } + } +} + +void ForwardLatchTempStores( + MachineFunction& function, const std::unordered_set& opaque_slots, + const std::unordered_map>& live_out) { + std::unordered_map load_count; + std::unordered_map store_count; + CountScalarStackAccesses(function, opaque_slots, load_count, store_count); + + for (const auto& bb_ptr : function.GetBlocks()) { + auto& insts = bb_ptr->GetInstructions(); + std::vector remove(insts.size(), false); + const auto live_out_it = live_out.find(bb_ptr.get()); + const std::set* block_live_out = + live_out_it == live_out.end() ? nullptr : &live_out_it->second; + + for (size_t i = 0; i + 2 < insts.size(); ++i) { + if (remove[i]) { + continue; + } + auto temp_slot = GetStoreStackSlot(insts[i]); + if (!temp_slot.has_value() || opaque_slots.count(*temp_slot) || + load_count[*temp_slot] != 1 || store_count[*temp_slot] != 1 || + (block_live_out != nullptr && block_live_out->count(*temp_slot))) { + continue; + } + + const auto& store_ops = insts[i].GetOperands(); + if (store_ops.empty() || !store_ops[0].IsReg()) { + continue; + } + + for (size_t j = i + 1; j + 1 < insts.size(); ++j) { + if (IsControlTransfer(insts[j]) || insts[j].GetOpcode() == Opcode::Bl) { + break; + } + if (!MayTouchFrameSlot(insts[j], *temp_slot)) { + continue; + } + auto load_slot = GetLoadStackSlot(insts[j]); + if (!load_slot.has_value() || *load_slot != *temp_slot) { + break; + } + auto final_slot = GetStoreStackSlot(insts[j + 1]); + if (!final_slot.has_value() || *final_slot == *temp_slot || + opaque_slots.count(*final_slot) || + HasFrameSlotTouch(insts, i + 1, j, *final_slot) || + !IsStackCopyTail(insts, j + 2)) { + break; + } + + const auto& load_ops = insts[j].GetOperands(); + const auto& final_ops = insts[j + 1].GetOperands(); + if (load_ops.empty() || final_ops.empty() || !load_ops[0].IsReg() || + !final_ops[0].IsReg() || + !RegAlias(load_ops[0].GetReg(), final_ops[0].GetReg()) || + !SameRegWidth(store_ops[0].GetReg(), final_ops[0].GetReg()) || + LoadedRegUsedAfterRemovedStore(insts, j + 2, + load_ops[0].GetReg())) { + break; + } + + insts[i].SetOperand(1, Operand::FrameIndex(*final_slot)); + remove[j] = true; + remove[j + 1] = true; + break; + } + } + + std::vector filtered; + filtered.reserve(insts.size()); + for (size_t i = 0; i < insts.size(); ++i) { + if (!remove[i]) { + filtered.push_back(std::move(insts[i])); + } + } + insts = std::move(filtered); + } +} + +void ForwardUniqueStackTemps( + MachineFunction& function, const std::unordered_set& opaque_slots, + const std::unordered_map>& live_out) { + std::unordered_map load_count; + std::unordered_map store_count; + CountScalarStackAccesses(function, opaque_slots, load_count, store_count); + + for (const auto& bb_ptr : function.GetBlocks()) { + auto& insts = bb_ptr->GetInstructions(); + const auto live_out_it = live_out.find(bb_ptr.get()); + const std::set* block_live_out = + live_out_it == live_out.end() ? nullptr : &live_out_it->second; + std::vector remove(insts.size(), false); + std::vector> replacement(insts.size()); + + for (size_t i = 0; i < insts.size(); ++i) { + auto slot = GetStoreStackSlot(insts[i]); + if (!slot.has_value() || opaque_slots.count(*slot) || + load_count[*slot] != 1 || store_count[*slot] != 1 || + (block_live_out != nullptr && block_live_out->count(*slot))) { + continue; + } + const auto& store_ops = insts[i].GetOperands(); + if (store_ops.empty() || !store_ops[0].IsReg()) continue; + const PhysReg src = store_ops[0].GetReg(); + + for (size_t j = i + 1; j < insts.size(); ++j) { + if (IsControlTransfer(insts[j]) || insts[j].GetOpcode() == Opcode::Bl || + IsMemoryClobber(insts[j])) { + break; + } + + if (auto touched_slot = GetStoreStackSlot(insts[j]); + touched_slot.has_value() && *touched_slot == *slot) { + break; + } + + auto load_slot = GetLoadStackSlot(insts[j]); + if (!load_slot.has_value() || *load_slot != *slot) { + continue; + } + + const auto& load_ops = insts[j].GetOperands(); + if (load_ops.empty() || !load_ops[0].IsReg()) break; + const PhysReg dst = load_ops[0].GetReg(); + if (!SameRegWidth(src, dst)) break; + + bool can_hold_in_dst = true; + for (size_t k = i + 1; k < j; ++k) { + if (RegTouched(insts[k], dst)) { + can_hold_in_dst = false; + break; + } + } + if (!can_hold_in_dst) break; + + if (RegAlias(src, dst)) { + bool src_clobbered = false; + for (size_t k = i + 1; k < j; ++k) { + if (auto written = GetWrittenReg(insts[k]); + written.has_value() && RegAlias(*written, src)) { + src_clobbered = true; + break; + } + } + if (src_clobbered) break; + remove[i] = true; + } else { + const Opcode mv_op = (IsFloatReg(src) && IsFloatReg(dst)) + ? Opcode::FMovReg + : Opcode::MovReg; + replacement[i] = MachineInstr( + mv_op, std::vector{Operand::Reg(dst), + Operand::Reg(src)}); + } + remove[j] = true; + break; + } + } + + std::vector filtered; + filtered.reserve(insts.size()); + for (size_t i = 0; i < insts.size(); ++i) { + if (remove[i]) continue; + if (replacement[i].has_value()) { + filtered.push_back(*replacement[i]); + continue; + } + filtered.push_back(std::move(insts[i])); + } + insts = std::move(filtered); + } +} + +void RemoveDeadScalarStores(MachineFunction& function) { + std::unordered_set opaque_slots; + for (const auto& bb_ptr : function.GetBlocks()) { + for (const auto& inst : bb_ptr->GetInstructions()) { + int slot = -1; + if (IsOpaqueFrameSlotUse(inst, &slot)) { + opaque_slots.insert(slot); + } + } + } + + const auto succs = BuildSuccessorMap(function); + std::unordered_map> use; + std::unordered_map> def; + std::unordered_map> live_in; + std::unordered_map> live_out; + std::unordered_map load_count; + std::unordered_map store_count; + + for (const auto& bb_ptr : function.GetBlocks()) { + const auto* bb = bb_ptr.get(); + for (const auto& inst : bb->GetInstructions()) { + if (auto slot = GetLoadStackSlot(inst); slot.has_value()) { + if (!opaque_slots.count(*slot)) { + ++load_count[*slot]; + } + if (!opaque_slots.count(*slot) && !def[bb].count(*slot)) { + use[bb].insert(*slot); + } + } + if (auto slot = GetStoreStackSlot(inst); slot.has_value()) { + if (!opaque_slots.count(*slot)) { + ++store_count[*slot]; + def[bb].insert(*slot); + } + } + } + } + + bool changed = true; + while (changed) { + changed = false; + const auto& blocks = function.GetBlocks(); + for (int bi = static_cast(blocks.size()) - 1; bi >= 0; --bi) { + const auto* bb = blocks[bi].get(); + std::set new_out; + if (auto it = succs.find(bb); it != succs.end()) { + for (const auto* succ : it->second) { + const auto& succ_in = live_in[succ]; + new_out.insert(succ_in.begin(), succ_in.end()); + } + } + + std::set new_in = new_out; + for (int slot : def[bb]) { + new_in.erase(slot); + } + new_in.insert(use[bb].begin(), use[bb].end()); + + if (new_out != live_out[bb] || new_in != live_in[bb]) { + live_out[bb] = std::move(new_out); + live_in[bb] = std::move(new_in); + changed = true; + } + } + } + + for (const auto& bb_ptr : function.GetBlocks()) { + auto& insts = bb_ptr->GetInstructions(); + std::set live = live_out[bb_ptr.get()]; + std::vector filtered; + filtered.reserve(insts.size()); + + for (int i = static_cast(insts.size()) - 1; i >= 0; --i) { + auto& inst = insts[i]; + if (auto slot = GetLoadStackSlot(inst); + slot.has_value() && !opaque_slots.count(*slot)) { + live.insert(*slot); + filtered.push_back(std::move(inst)); + continue; + } + if (auto slot = GetStoreStackSlot(inst); + slot.has_value() && !opaque_slots.count(*slot)) { + if (!live.count(*slot)) { + continue; + } + live.erase(*slot); + filtered.push_back(std::move(inst)); + continue; + } + filtered.push_back(std::move(inst)); + } + + std::reverse(filtered.begin(), filtered.end()); + insts = std::move(filtered); + } + + ForwardLatchTempStores(function, opaque_slots, live_out); + ForwardUniqueStackTemps(function, opaque_slots, live_out); +} + } // namespace void RunPeephole(MachineFunction& function) { @@ -422,8 +944,10 @@ void RunPeephole(MachineFunction& function) { } } + RemoveOverwrittenStores(optimized); insts = std::move(optimized); } + RemoveDeadScalarStores(function); } } // namespace mir diff --git a/test/test_case.zip b/test/test_case.zip new file mode 100644 index 0000000..2b527a2 Binary files /dev/null and b/test/test_case.zip differ diff --git a/优化方案.md b/优化方案.md index 4c3a037..a0f9fdb 100644 --- a/优化方案.md +++ b/优化方案.md @@ -39,3 +39,54 @@ - 方案:IR 文本打印补齐 `ConstantFloat` 的 LLVM 十六进制浮点常量格式,把浮点二元运算/比较输出为 `fadd/fsub/fmul/fdiv`、`fcmp o*`;比较结果打印为 `i1` 后再 `zext` 回内部 `i32` 约定;删除不可达块时同步清理后继 phi 入边;scaled 间接访存补齐 peephole 的读寄存器与内存 clobber 描述。 - 代码位置:`src/ir/IRPrinter.cpp`、`src/ir/passes/CFGSimplify.cpp`、`src/mir/passes/Peephole.cpp`。 + +## 8. ASM 后端栈槽死写删除 + +- 方案:在 MIR peephole 中增加纯标量 frame slot 的活跃性分析;对没有地址暴露、没有 offset 访问的栈槽,若 `StoreStack` 写入后到块尾/后继都不会被 `LoadStack` 读取,则删除该死写,同时保留原有局部 store-load 前递。 +- 代码位置:`src/mir/passes/Peephole.cpp`,新增 `RemoveDeadScalarStores` 及 CFG successor 推导。 + +## 10. ASM 后端栈帧压缩 + +- 方案:FrameLowering 前扫描实际仍被指令引用的 frame slot,被 peephole 删除引用的死槽不再分配栈空间,减少栈帧大小和大偏移 `sub x11; ldr/str [x11]` 展开。 +- 代码位置:`src/mir/FrameLowering.cpp`,新增 `CollectUsedFrameSlots` 并跳过未引用槽。 + +## 11. 循环 lowering 尝试记录 + +- 方案:尝试将循环内 `i32 phi` 提升为跨块 MIR vreg,但当前寄存器分配仍假设跨块值经栈槽传递,直接改写会破坏跨块活跃性;已回退该方向。保留一个带全函数唯一读写保护的临时槽消除规则,未命中 `2025-MYO-20` 主热循环。 +- 代码位置:`src/mir/passes/Peephole.cpp`。 + +## 12. ASM 循环回边临时槽转发 + +- 方案:对循环 latch 尾部的 `store temp ... load temp; store phi` 形态做块内专用转发,把第一次写 temp 改成直接写 phi,并删除尾部回拷;要求 temp 全函数唯一读写、无地址暴露、目标 phi 槽在移动区间内未被触碰,且尾部只剩栈槽回拷和跳转。 +- 代码位置:`src/mir/passes/Peephole.cpp`,新增 `ForwardLatchTempStores`;配合 `src/mir/FrameLowering.cpp` 的死槽栈帧压缩继续减少帧大小。 + +## 13. Polyhedral-lite 前置:局部内存值编号 + +- 方案:完整 loop interchange/tiling 需要先有可靠的仿射访存依赖基础;先在局部 CSE 中纳入 `load`,只复用同一基本块内相同指针且中间没有 `store/call` 的读值,减少仿射地址重复 load,并为后续保守依赖分析打基础。 +- 代码位置:`src/ir/passes/CSE.cpp`,`Load` 加入 CSE key,遇到 `Store/Call` 清空 load 记录。 + +## 14. Polyhedral-lite 内存转发与保守别名失效 + +- 方案:在局部内存值编号中记录同块内最近一次精确指针 `store`,后续同指针 `load` 直接替换为已存值;`store` 不再清空全部 load,而是只清空可能别名的内存记录。别名判断增加简单仿射 GEP key,支持 `add/sub/常量乘` 下标;不同全局变量/不同 alloca 基址视为不别名,复杂表达式退回保守处理,函数指针参数仍视为可能别名。 +- 代码位置:`src/ir/passes/CSE.cpp`,新增 `BuildAffineExpr`、`BuildMemoryKey`、`MayAlias`、`InvalidateMayAliasMemory` 与 `store_values`。 + +## 15. MIR 循环标量槽寄存器化 + +- 方案:在 `RegAlloc` 后、`FrameLowering` 前按 CFG 回边与支配关系识别 natural loop,不再用物理块连续区间近似循环;同一函数内最多选择 4 个互不重叠热点循环,按循环体 load/store 次数和“既读又写”的 accumulator 特征挑选最多 6 个 4 字节纯标量 frame slot,用 `w19-w24` 保存在循环内;入口前驱加载初值,退出写回优先下沉到唯一出口块,减少条件块上的热路径写栈。 +- 代码位置:`src/mir/LoopSlotPromotion.cpp`,新增/扩展 `RunLoopSlotPromotion`;`include/mir/MIR.h`、`src/mir/Register.cpp`、`src/mir/FrameLowering.cpp`、`src/mir/passes/Peephole.cpp` 支持 `w19-w24/x19-x24`;`src/main.cpp` 在 `RunRegAlloc` 后调用。 +- 修复:候选槽必须所有普通 `LoadStack/StoreStack` 都使用 W 寄存器,排除 4 字节浮点 `s` 栈槽,避免生成非法的 `mov s*, w19-w24`。 + +## 16. MIR 浮点标量槽寄存器化 + +- 方案:在循环槽寄存器化中按 `LoadStack/StoreStack` 的实际寄存器类别区分整型槽与浮点槽;整型槽继续用 `w19-w24`,浮点槽用 callee-saved `s8-s10`,循环内改写为 `FMovReg`,入口/出口仍用栈槽加载和写回。评分上提高跨基本块使用槽的优先级,优先保留 float accumulator 这类跨展开块活跃值,避免 `s8-s10` 被单块临时值占满。 +- 代码位置:`src/mir/LoopSlotPromotion.cpp`,新增 slot register kind 分类、`s8-s10` promotion 分配和 `FMovReg` 重写。 + +## 17. 浮点临时栈槽前递与跨调用评分 + +- 方案:MIR peephole 对全函数唯一 store/load 且块外不活跃的纯临时栈槽做保守前递,把 `StoreStack temp; ...; LoadStack temp` 改为提前的 `MovReg/FMovReg`,消除单块 float 临时值的落栈;循环槽评分对 `Bl` 之后使用的浮点槽加权,优先保留跨调用后的 float accumulator。 +- 代码位置:`src/mir/passes/Peephole.cpp`,新增 `ForwardUniqueStackTemps`;`src/mir/LoopSlotPromotion.cpp`,增加 `after_call_uses` 评分。 + +## 18. MIR 循环只读栈槽寄存器化 + +- 方案:循环槽寄存器化支持 `stores == 0` 的 read-only 栈槽,入口加载一次后把循环内 `LoadStack` 改为寄存器移动,退出不写回;新增 `X` 类槽并让 `W/X` 共用 `19-24` 的 callee-saved 编号,避免别名冲突。该规则用于提升 `large_loop_array_2` 中 `loop(x, y, len)` 的只读参数槽,减少热循环中反复读取 `x/y/length` 栈槽。 +- 代码位置:`src/mir/LoopSlotPromotion.cpp`,扩展 slot kind、read-only 候选和统一 GPR 分配;`src/mir/MIRFunction.cpp`,把 `w19-w24` 规范化到对应 `x19-x24`,避免重复保存。