18th Optimization reached 174.650

master
shrink 2 hours ago
parent a14a9cde0d
commit ad4591607f

@ -22,8 +22,10 @@ MIRContext& DefaultContext();
// Physical registers known to the MIR layer: 32-bit general-purpose views
// (W*), 64-bit general-purpose views (X*, plus SP) and single-precision
// floating-point registers (S*).
// NOTE: the enumerator ORDER is load-bearing. Other code in this change set
// uses range comparisons and offset arithmetic on the underlying values
// (e.g. W0..W11, X0..X11, S0..S10), which is why W19..W24 and X19..X24 are
// appended after the existing contiguous runs instead of being merged into
// them. Do not reorder.
enum class PhysReg {
W0, W1, W2, W3, W4, W5, W6, W7,
W8, W9, W10, W11,
W19, W20, W21, W22, W23, W24,
X0, X1, X2, X3, X4, X5, X6, X7,
X8, X9, X10, X11, X29, X30, SP,
X19, X20, X21, X22, X23, X24,
S0, S1, S2, S3, S4, S5, S6, S7, // single-precision floating-point registers
S8, S9, S10
};
@ -236,6 +238,7 @@ class MachineModule {
// Entry points of the MIR back end.
// LowerToMIR produces a MachineModule from an ir::Module; each Run* function
// takes one MachineFunction by mutable reference; PrintAsm writes a module to
// `os`.
// The driver in this change set applies the per-function passes in the order
// RunPeephole, RunRegAlloc, RunLoopSlotPromotion, RunFrameLowering,
// RunPeephole.
std::unique_ptr<MachineModule> LowerToMIR(const ir::Module& module);
void RunPeephole(MachineFunction& function);
void RunRegAlloc(MachineFunction& function);
// Keeps selected stack slots of selected loops in callee-saved registers for
// the duration of the loop (see LoopSlotPromotion.cpp).
void RunLoopSlotPromotion(MachineFunction& function);
void RunFrameLowering(MachineFunction& function);
void PrintAsm(const MachineModule& module, std::ostream& os);

@ -5,11 +5,18 @@
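// Algorithm: within each basic block, a hash table records the expressions
// that have already been seen. When an instruction with the same opcode and
// the same operands is encountered, the earlier result is reused.
// This is local CSE: redundancy is eliminated only inside a basic block.
//
// Loads use conservative memory value numbering: a load is reused only for the
// same pointer within the same basic block, and only when no possibly-aliasing
// store or call lies in between. Local forwarding from a store to a following
// load of the same pointer is supported as well.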
#include "ir/IR.h"
#include <algorithm>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace ir {
@ -80,6 +87,243 @@ ExprKey MakeKey(Instruction* inst) {
return key;
}
// Returns true only when `lhs` and `rhs` are two different values and each of
// them is either a GlobalVariable or an AllocaInst. Identical pointers, or any
// operand of another kind, yield false.
bool IsDistinctLocalOrGlobalObject(Value* lhs, Value* rhs) {
  if (lhs == rhs) {
    return false;
  }
  const auto is_named_object = [](Value* candidate) {
    return dynamic_cast<GlobalVariable*>(candidate) != nullptr ||
           dynamic_cast<AllocaInst*>(candidate) != nullptr;
  };
  return is_named_object(lhs) && is_named_object(rhs);
}
// Linear expression `constant + sum(coeff_i * value_i)` over IR values.
// Equality is member-wise, so two expressions only compare equal when their
// term lists are in the same order (see Normalize).
struct AffineExpr {
  int64_t constant{0};
  std::vector<std::pair<Value*, int64_t>> terms;

  bool operator==(const AffineExpr& other) const {
    if (constant != other.constant) {
      return false;
    }
    return terms == other.terms;
  }
};
// Brings the term list of `expr` into canonical form: terms are sorted by
// value pointer, terms on the same value are merged by summing coefficients,
// and terms whose coefficient is (or becomes) zero are dropped.
void Normalize(AffineExpr* expr) {
  auto& terms = expr->terms;
  std::sort(terms.begin(), terms.end(),
            [](const auto& a, const auto& b) { return a.first < b.first; });

  std::vector<std::pair<Value*, int64_t>> merged;
  for (const auto& term : terms) {
    if (term.second == 0) {
      continue;
    }
    const bool extends_last =
        !merged.empty() && merged.back().first == term.first;
    if (!extends_last) {
      merged.push_back({term.first, term.second});
      continue;
    }
    merged.back().second += term.second;
    if (merged.back().second == 0) {
      // The accumulated coefficient cancelled out; forget the term.
      merged.pop_back();
    }
  }
  terms = std::move(merged);
}
// Writes `input * scale` into `out` (constant and every coefficient are
// multiplied) and normalizes the result. Always returns true.
bool ScaleAffine(const AffineExpr& input, int64_t scale, AffineExpr* out) {
  out->constant = input.constant * scale;
  auto& scaled_terms = out->terms;
  scaled_terms.clear();
  scaled_terms.reserve(input.terms.size());
  for (const auto& term : input.terms) {
    scaled_terms.push_back({term.first, term.second * scale});
  }
  Normalize(out);
  return true;
}
// Recursively decomposes `value` into an affine expression written to `out`.
//   value    - IR value to decompose.
//   out      - receives the expression; only meaningful when true is returned.
//   visiting - values currently on the recursion stack (cycle guard).
//   depth    - current recursion depth; recursion gives up beyond 64.
// Returns false when the depth limit is exceeded, when a value is re-entered
// while still being visited, when a sub-expression fails, or for a Mul whose
// operands include no ConstantInt.
// NOTE(review): results are not memoized, so a value that is reachable through
// many paths is re-decomposed once per path; the depth limit bounds the depth
// but not the total number of visits.
bool BuildAffineExprImpl(Value* value, AffineExpr* out,
std::unordered_set<Value*>& visiting, int depth) {
if (depth > 64) {
return false;
}
// A literal integer is a pure constant with no terms.
if (auto* constant = dynamic_cast<ConstantInt*>(value)) {
out->constant = constant->GetValue();
out->terms.clear();
return true;
}
auto* bin = dynamic_cast<BinaryInst*>(value);
// Anything that is not a binary instruction is treated as an opaque symbol
// with coefficient 1.
if (!bin) {
out->constant = 0;
out->terms = {{value, 1}};
return true;
}
// Already on the recursion stack: refuse instead of recursing forever.
if (!visiting.insert(value).second) {
return false;
}
AffineExpr lhs;
AffineExpr rhs;
bool ok = false;
switch (bin->GetOpcode()) {
case Opcode::Add:
case Opcode::Sub:
// Both operands must decompose; Sub negates the right-hand side.
if (!BuildAffineExprImpl(bin->GetLhs(), &lhs, visiting, depth + 1) ||
!BuildAffineExprImpl(bin->GetRhs(), &rhs, visiting, depth + 1)) {
break;
}
out->constant = lhs.constant +
(bin->GetOpcode() == Opcode::Add ? rhs.constant
: -rhs.constant);
out->terms = lhs.terms;
for (const auto& [term, coeff] : rhs.terms) {
out->terms.push_back(
{term, bin->GetOpcode() == Opcode::Add ? coeff : -coeff});
}
Normalize(out);
ok = true;
break;
case Opcode::Mul: {
// Only multiplication by a literal constant stays affine. The left constant
// is tried first; if that path fails the right constant is tried.
auto* lhs_const = dynamic_cast<ConstantInt*>(bin->GetLhs());
auto* rhs_const = dynamic_cast<ConstantInt*>(bin->GetRhs());
if (lhs_const &&
BuildAffineExprImpl(bin->GetRhs(), &rhs, visiting, depth + 1)) {
ok = ScaleAffine(rhs, lhs_const->GetValue(), out);
break;
}
if (rhs_const &&
BuildAffineExprImpl(bin->GetLhs(), &lhs, visiting, depth + 1)) {
ok = ScaleAffine(lhs, rhs_const->GetValue(), out);
break;
}
// Neither path succeeded: `ok` stays false.
break;
}
default:
// Any other binary opcode is kept as an opaque symbol.
out->constant = 0;
out->terms = {{value, 1}};
ok = true;
break;
}
// Leave the recursion stack on every path that entered it.
visiting.erase(value);
return ok;
}
// Convenience entry point: decomposes `value` into `out` starting with an
// empty recursion stack at depth 0. Returns what BuildAffineExprImpl returns.
bool BuildAffineExpr(Value* value, AffineExpr* out) {
  std::unordered_set<Value*> in_progress;
  constexpr int kInitialDepth = 0;
  return BuildAffineExprImpl(value, out, in_progress, kInitialDepth);
}
// Symbolic description of a memory location.
// When `affine` is true the location is `base` plus the affine `index`;
// otherwise only the identity of the pointer value `exact` is known.
struct MemoryKey {
  bool affine{false};
  Value* exact{nullptr};
  Value* base{nullptr};
  AffineExpr index;

  // Keys of different kinds never compare equal; same-kind keys compare the
  // fields that are meaningful for that kind only.
  bool operator==(const MemoryKey& other) const {
    if (affine != other.affine) {
      return false;
    }
    if (affine) {
      return base == other.base && index == other.index;
    }
    return exact == other.exact;
  }
};
// Hash functor for MemoryKey. Non-affine keys hash the exact pointer; affine
// keys combine the base pointer, the constant and every (value, coefficient)
// term in order.
struct MemoryKeyHash {
  size_t operator()(const MemoryKey& key) const {
    if (!key.affine) {
      return std::hash<void*>()(key.exact);
    }
    // Folds one more hashed component into the running seed.
    const auto combine = [](size_t& seed, size_t component) {
      seed ^= component + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    };
    size_t seed = std::hash<void*>()(key.base);
    combine(seed, std::hash<int64_t>()(key.index.constant));
    for (const auto& term : key.index.terms) {
      combine(seed, std::hash<void*>()(term.first));
      combine(seed, std::hash<int64_t>()(term.second));
    }
    return seed;
  }
};
// Describes the location addressed by `ptr` in `key`.
//  - A non-GEP pointer becomes an affine key with that pointer as base and an
//    empty index.
//  - A GEP whose base chain and index are all affine becomes an affine key on
//    the innermost base with the indices summed.
//  - Otherwise the key is non-affine and identified by `ptr` itself.
// NOTE(review): nested GEP indices are added without any per-level scaling;
// this presumes every GepInst index is expressed in the same unit - confirm
// against the IR definition.
bool BuildMemoryKey(Value* ptr, MemoryKey* key) {
  auto* gep = dynamic_cast<GepInst*>(ptr);
  if (gep == nullptr) {
    key->affine = true;
    key->exact = nullptr;
    key->base = ptr;
    key->index = {};
    return true;
  }

  MemoryKey base_key;
  if (!BuildMemoryKey(gep->GetBase(), &base_key)) {
    return false;
  }

  AffineExpr step;
  // Short-circuit keeps the index from being analysed when the base chain is
  // already non-affine.
  if (!base_key.affine || !BuildAffineExpr(gep->GetIndex(), &step)) {
    key->affine = false;
    key->exact = ptr;
    return true;
  }

  key->affine = true;
  key->exact = nullptr;
  key->base = base_key.base;
  key->index = base_key.index;
  key->index.constant += step.constant;
  for (const auto& term : step.terms) {
    key->index.terms.push_back(term);
  }
  Normalize(&key->index);
  return true;
}
// True when both expressions have identical term lists, i.e. they can differ
// only in their constant part.
bool SameAffineSlope(const AffineExpr& lhs, const AffineExpr& rhs) {
  const bool identical_terms = (lhs.terms == rhs.terms);
  return identical_terms;
}
// Conservative alias query: returns false only when the two locations are
// shown to be different, namely
//  - two affine keys on different bases that are both distinct global/alloca
//    objects, or
//  - two affine keys on the same base with identical terms but different
//    constants.
// Everything else (including any non-affine key) is reported as may-alias.
bool MayAlias(const MemoryKey& lhs, const MemoryKey& rhs) {
  if (lhs == rhs) {
    return true;
  }
  if (!lhs.affine || !rhs.affine) {
    return true;
  }
  if (lhs.base != rhs.base) {
    return !IsDistinctLocalOrGlobalObject(lhs.base, rhs.base);
  }
  const bool provably_disjoint =
      SameAffineSlope(lhs.index, rhs.index) &&
      lhs.index.constant != rhs.index.constant;
  return !provably_disjoint;
}
// Forgets every remembered load and every forwardable store.
void ClearMemoryState(
    std::unordered_map<Value*, Instruction*>& load_values,
    std::unordered_map<Value*, Value*>& store_values) {
  store_values.clear();
  load_values.clear();
}
// Removes from both tables every entry whose pointer may alias the location
// described by `store_key`. Entries shown not to alias are kept.
void InvalidateMayAliasMemory(
    std::unordered_map<Value*, Instruction*>& load_values,
    std::unordered_map<Value*, Value*>& store_values,
    const MemoryKey& store_key) {
  // Same filtering logic for both tables; only the mapped type differs.
  const auto drop_aliasing_entries = [&store_key](auto& table) {
    auto it = table.begin();
    while (it != table.end()) {
      MemoryKey entry_key;
      BuildMemoryKey(it->first, &entry_key);
      if (MayAlias(entry_key, store_key)) {
        it = table.erase(it);
      } else {
        ++it;
      }
    }
  };
  drop_aliasing_entries(load_values);
  drop_aliasing_entries(store_values);
}
} // namespace
bool RunCSE(Function& func) {
@ -91,11 +335,42 @@ bool RunCSE(Function& func) {
if (!bb) continue;
std::unordered_map<ExprKey, Instruction*, ExprKeyHash> expr_map;
std::unordered_map<Value*, Instruction*> load_values;
std::unordered_map<Value*, Value*> store_values;
std::vector<Instruction*> to_remove;
for (const auto& inst_ptr : bb->GetInstructions()) {
auto* inst = inst_ptr.get();
if (inst->GetOpcode() == Opcode::Call) {
ClearMemoryState(load_values, store_values);
} else if (auto* store = dynamic_cast<StoreInst*>(inst)) {
MemoryKey store_key;
BuildMemoryKey(store->GetPtr(), &store_key);
InvalidateMayAliasMemory(load_values, store_values, store_key);
store_values[store->GetPtr()] = store->GetValue();
}
if (auto* load = dynamic_cast<LoadInst*>(inst)) {
auto it = store_values.find(load->GetPtr());
if (it != store_values.end() && it->second &&
it->second->GetType()->GetKind() == load->GetType()->GetKind()) {
load->ReplaceAllUsesWith(it->second);
to_remove.push_back(load);
changed = true;
continue;
}
auto load_it = load_values.find(load->GetPtr());
if (load_it != load_values.end()) {
load->ReplaceAllUsesWith(load_it->second);
to_remove.push_back(load);
changed = true;
} else {
load_values[load->GetPtr()] = load;
}
continue;
}
if (!IsCSECandidate(inst)) continue;
ExprKey key = MakeKey(inst);

@ -159,6 +159,7 @@ int main(int argc, char** argv) {
for (const auto& func_ptr : machine_module->GetFunctions()) {
mir::RunPeephole(*func_ptr);
mir::RunRegAlloc(*func_ptr);
mir::RunLoopSlotPromotion(*func_ptr);
mir::RunFrameLowering(*func_ptr);
mir::RunPeephole(*func_ptr);
}

@ -6,6 +6,7 @@ add_library(mir_core STATIC
Register.cpp
Lowering.cpp
RegAlloc.cpp
LoopSlotPromotion.cpp
FrameLowering.cpp
AsmPrinter.cpp
)

@ -2,6 +2,7 @@
#include <algorithm>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include "utils/Log.h"
@ -15,6 +16,12 @@ int AlignTo(int value, int align) {
// 获取 W 寄存器对应的 X 寄存器
PhysReg WRegToXReg(PhysReg w) {
if (w == PhysReg::W19) return PhysReg::X19;
if (w == PhysReg::W20) return PhysReg::X20;
if (w == PhysReg::W21) return PhysReg::X21;
if (w == PhysReg::W22) return PhysReg::X22;
if (w == PhysReg::W23) return PhysReg::X23;
if (w == PhysReg::W24) return PhysReg::X24;
int idx = static_cast<int>(w) - static_cast<int>(PhysReg::W0);
if (idx >= 0 && idx <= 11) {
return static_cast<PhysReg>(static_cast<int>(PhysReg::X0) + idx);
@ -22,12 +29,32 @@ PhysReg WRegToXReg(PhysReg w) {
return w;
}
// Returns the set of frame indices that appear as an operand of any
// instruction in `function`.
std::unordered_set<int> CollectUsedFrameSlots(const MachineFunction& function) {
  std::unordered_set<int> referenced;
  for (const auto& block : function.GetBlocks()) {
    for (const auto& inst : block->GetInstructions()) {
      for (const auto& operand : inst.GetOperands()) {
        if (!operand.IsFrameIndex()) {
          continue;
        }
        referenced.insert(operand.GetFrameIndex());
      }
    }
  }
  return referenced;
}
} // namespace
void RunFrameLowering(MachineFunction& function) {
const auto used_frame_slots = CollectUsedFrameSlots(function);
// 计算栈槽偏移
int cursor = 0;
for (const auto& slot : function.GetFrameSlots()) {
if (!used_frame_slots.count(slot.index)) {
function.GetFrameSlot(slot.index).offset = 0;
continue;
}
cursor += slot.size;
function.GetFrameSlot(slot.index).offset = -cursor;
}
@ -38,7 +65,10 @@ void RunFrameLowering(MachineFunction& function) {
for (size_t i = 0; i < callee_saved.size(); ++i) {
PhysReg save_reg = callee_saved[i];
PhysReg x_reg = save_reg;
if (save_reg >= PhysReg::W0 && save_reg <= PhysReg::W11) {
if ((save_reg >= PhysReg::W0 && save_reg <= PhysReg::W11) ||
save_reg == PhysReg::W19 || save_reg == PhysReg::W20 ||
save_reg == PhysReg::W21 || save_reg == PhysReg::W22 ||
save_reg == PhysReg::W23 || save_reg == PhysReg::W24) {
x_reg = WRegToXReg(save_reg);
}
// 浮点 callee-saved 直接用 s 寄存器保存4字节

@ -0,0 +1,623 @@
#include "mir/MIR.h"
#include <algorithm>
#include <iterator>
#include <optional>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace mir {
namespace {
bool IsControlTransfer(const MachineInstr& inst) {
switch (inst.GetOpcode()) {
case Opcode::B:
case Opcode::Bcond:
case Opcode::FBcond:
case Opcode::Cbnz:
case Opcode::Cbz:
case Opcode::Ret:
return true;
default:
return false;
}
}
// Returns the frame index read by a LoadStack instruction (operand 1), or
// nullopt when `inst` is not a LoadStack with a frame-index second operand.
std::optional<int> GetLoadSlot(const MachineInstr& inst) {
  const auto& operands = inst.GetOperands();
  if (inst.GetOpcode() != Opcode::LoadStack) {
    return std::nullopt;
  }
  if (operands.size() < 2) {
    return std::nullopt;
  }
  if (!operands[1].IsFrameIndex()) {
    return std::nullopt;
  }
  return operands[1].GetFrameIndex();
}
// Returns the frame index written by a StoreStack instruction (operand 1), or
// nullopt when `inst` is not a StoreStack with a frame-index second operand.
std::optional<int> GetStoreSlot(const MachineInstr& inst) {
  const auto& operands = inst.GetOperands();
  if (inst.GetOpcode() != Opcode::StoreStack) {
    return std::nullopt;
  }
  if (operands.size() < 2) {
    return std::nullopt;
  }
  if (!operands[1].IsFrameIndex()) {
    return std::nullopt;
  }
  return operands[1].GetFrameIndex();
}
bool IsOpaqueSlotUse(const MachineInstr& inst, int* slot) {
const auto& ops = inst.GetOperands();
switch (inst.GetOpcode()) {
case Opcode::LoadStackOffset:
case Opcode::StoreStackOffset:
case Opcode::LoadStackAddr:
if (ops.size() >= 2 && ops[1].IsFrameIndex()) {
*slot = ops[1].GetFrameIndex();
return true;
}
return false;
default:
return false;
}
}
// Exact enumerator comparison; a W register and its X counterpart are NOT
// considered the same register here.
bool SameReg(PhysReg lhs, PhysReg rhs) {
  const bool identical = (lhs == rhs);
  return identical;
}
// True for the 32-bit general-purpose registers W0..W11 and W19..W24.
bool IsPromotableWReg(PhysReg reg) {
  if (reg >= PhysReg::W0 && reg <= PhysReg::W11) {
    return true;
  }
  switch (reg) {
    case PhysReg::W19:
    case PhysReg::W20:
    case PhysReg::W21:
    case PhysReg::W22:
    case PhysReg::W23:
    case PhysReg::W24:
      return true;
    default:
      return false;
  }
}
// True for the 64-bit general-purpose registers X0..X11 and X19..X24.
bool IsPromotableXReg(PhysReg reg) {
  if (reg >= PhysReg::X0 && reg <= PhysReg::X11) {
    return true;
  }
  switch (reg) {
    case PhysReg::X19:
    case PhysReg::X20:
    case PhysReg::X21:
    case PhysReg::X22:
    case PhysReg::X23:
    case PhysReg::X24:
      return true;
    default:
      return false;
  }
}
// True for the single-precision registers S0..S10.
bool IsPromotableSReg(PhysReg reg) {
  const bool below_range = reg < PhysReg::S0;
  const bool above_range = reg > PhysReg::S10;
  return !(below_range || above_range);
}
size_t FirstTerminatorIndex(const std::vector<MachineInstr>& insts) {
for (size_t i = 0; i < insts.size(); ++i) {
if (IsControlTransfer(insts[i])) return i;
}
return insts.size();
}
void InsertBeforeTerminators(std::vector<MachineInstr>& insts,
const std::vector<MachineInstr>& inserted) {
const size_t pos = FirstTerminatorIndex(insts);
insts.insert(insts.begin() + static_cast<long>(pos), inserted.begin(),
inserted.end());
}
// Usage statistics of one frame slot inside one loop.
struct SlotUseInfo {
  // Register class observed for the slot. Unknown = no access seen yet,
  // Invalid = unsupported register or conflicting classes.
  enum class RegKind { Unknown, W, X, S, Invalid };

  int slot{-1};             // frame index, -1 until filled in
  int loads{0};             // LoadStack count in the whole loop
  int stores{0};            // StoreStack count in the whole loop
  int body_loads{0};        // loads in loop blocks other than the header
  int body_stores{0};       // stores in loop blocks other than the header
  int after_call_uses{0};   // accesses that follow a call within their block
  RegKind reg_kind{RegKind::Unknown};
  std::unordered_set<size_t> use_blocks;  // loop blocks touching the slot
};
// A slot chosen for promotion in a loop candidate. Member order matters: the
// struct is brace-initialized positionally.
struct SlotPick {
  int slot{-1};
  SlotUseInfo::RegKind reg_kind{SlotUseInfo::RegKind::Unknown};
  bool write_back{true};  // false when the loop never stores to the slot
};
// A natural loop together with the slots selected for promotion in it.
struct LoopCandidate {
  size_t header{0};                  // block index of the loop header
  size_t latch{0};                   // block index of the back-edge source
  int score{0};                      // sum of SlotScore over `slots`
  std::vector<SlotPick> slots;       // picked slots, best first
  std::unordered_set<size_t> blocks; // block indices forming the loop
};
// Final slot-to-register assignment used while rewriting a loop. Member order
// matters: the struct is brace-initialized positionally.
struct Promotion {
  int slot{-1};
  PhysReg reg{PhysReg::W19};
  SlotUseInfo::RegKind reg_kind{SlotUseInfo::RegKind::Unknown};
  bool write_back{true};
};
// Maps a physical register to its promotable class (W, X or S); registers
// outside all three promotable sets are classified as Invalid.
SlotUseInfo::RegKind ClassifyPromotableReg(PhysReg reg) {
  using Kind = SlotUseInfo::RegKind;
  Kind kind = Kind::Invalid;
  if (IsPromotableWReg(reg)) {
    kind = Kind::W;
  } else if (IsPromotableXReg(reg)) {
    kind = Kind::X;
  } else if (IsPromotableSReg(reg)) {
    kind = Kind::S;
  }
  return kind;
}
// Folds one more register access into `info.reg_kind`. The kind becomes
// Invalid when `reg` is not promotable or when it disagrees with a kind seen
// earlier; once Invalid it stays Invalid. Otherwise it is set to the class of
// `reg`.
void NoteSlotRegUse(SlotUseInfo& info, PhysReg reg) {
  using Kind = SlotUseInfo::RegKind;
  const Kind observed = ClassifyPromotableReg(reg);
  const bool unusable = observed == Kind::Invalid;
  const bool disagrees =
      info.reg_kind != Kind::Unknown && info.reg_kind != observed;
  info.reg_kind = (unusable || disagrees) ? Kind::Invalid : observed;
}
// Heuristic benefit of promoting a slot. Higher is better. Accesses outside
// the header weigh more, read-only slots, slots both read and written in the
// body, slots used in several blocks, and S-class slots accessed after a call
// all receive bonuses.
int SlotScore(const SlotUseInfo& info) {
  const size_t block_span = info.use_blocks.size();
  const bool spans_blocks = block_span > 1;

  int score = (info.body_loads + info.body_stores) * 4;
  score += info.loads + info.stores;

  if (info.stores == 0) {
    score += 80 + info.body_loads * 6;
  }
  if (info.body_loads > 0 && info.body_stores > 0) {
    score += spans_blocks ? 140 : 20;
  }
  if (spans_blocks) {
    score += static_cast<int>(block_span - 1) * 24;
  }
  const bool float_after_call =
      info.reg_kind == SlotUseInfo::RegKind::S && info.after_call_uses > 0;
  if (float_after_call) {
    score += 180 + info.after_call_uses * 8;
  }
  return score;
}
// Returns the `index`-th promotion register: X19..X24 for RegKind::X and
// W19..W24 for every other kind. `index` must be in [0, 6); it is not checked.
PhysReg GprForIndex(SlotUseInfo::RegKind kind, size_t index) {
  static const PhysReg kWideRegs[] = {PhysReg::X19, PhysReg::X20,
                                      PhysReg::X21, PhysReg::X22,
                                      PhysReg::X23, PhysReg::X24};
  static const PhysReg kWordRegs[] = {PhysReg::W19, PhysReg::W20,
                                      PhysReg::W21, PhysReg::W22,
                                      PhysReg::W23, PhysReg::W24};
  const PhysReg* table =
      (kind == SlotUseInfo::RegKind::X) ? kWideRegs : kWordRegs;
  return table[index];
}
std::vector<size_t> GetSuccessors(
const MachineFunction& function, size_t block_index,
const std::unordered_map<std::string, size_t>& block_index_by_name) {
const auto& blocks = function.GetBlocks();
const auto& insts = blocks[block_index]->GetInstructions();
std::vector<size_t> succs;
for (const auto& inst : insts) {
const auto& ops = inst.GetOperands();
switch (inst.GetOpcode()) {
case Opcode::B:
case Opcode::Bcond:
case Opcode::FBcond:
if (!ops.empty() && ops[0].IsSymbol()) {
auto it = block_index_by_name.find(ops[0].GetSymbol());
if (it != block_index_by_name.end()) succs.push_back(it->second);
}
break;
case Opcode::Cbnz:
case Opcode::Cbz:
if (ops.size() > 1 && ops[1].IsSymbol()) {
auto it = block_index_by_name.find(ops[1].GetSymbol());
if (it != block_index_by_name.end()) succs.push_back(it->second);
}
break;
default:
break;
}
}
if (!insts.empty()) {
Opcode last = insts.back().GetOpcode();
if (last != Opcode::B && last != Opcode::Ret &&
block_index + 1 < blocks.size()) {
succs.push_back(block_index + 1);
}
}
std::sort(succs.begin(), succs.end());
succs.erase(std::unique(succs.begin(), succs.end()), succs.end());
return succs;
}
// True when block `index` belongs to `loop`.
bool InLoop(const LoopCandidate& loop, size_t index) {
  return loop.blocks.find(index) != loop.blocks.end();
}
// Returns the loop's block indices in ascending order, giving callers a
// deterministic iteration order over the unordered block set.
std::vector<size_t> SortedLoopBlocks(const LoopCandidate& loop) {
  std::vector<size_t> ordered;
  ordered.reserve(loop.blocks.size());
  for (const size_t block : loop.blocks) {
    ordered.push_back(block);
  }
  std::sort(ordered.begin(), ordered.end());
  return ordered;
}
// Builds the successor list of every block; entry `i` is
// GetSuccessors(function, i, block_index_by_name).
std::vector<std::vector<size_t>> BuildSuccessors(
    const MachineFunction& function,
    const std::unordered_map<std::string, size_t>& block_index_by_name) {
  const size_t block_count = function.GetBlocks().size();
  std::vector<std::vector<size_t>> succs;
  succs.reserve(block_count);
  for (size_t block = 0; block < block_count; ++block) {
    succs.push_back(GetSuccessors(function, block, block_index_by_name));
  }
  return succs;
}
// Inverts a successor table: entry `b` of the result lists, sorted and without
// duplicates, every block that has `b` among its successors.
std::vector<std::vector<size_t>> BuildPredecessors(
    const std::vector<std::vector<size_t>>& succs) {
  const size_t block_count = succs.size();
  std::vector<std::vector<size_t>> preds(block_count);
  for (size_t from = 0; from != block_count; ++from) {
    for (const size_t to : succs[from]) {
      preds[to].push_back(from);
    }
  }
  for (size_t block = 0; block != block_count; ++block) {
    auto& incoming = preds[block];
    std::sort(incoming.begin(), incoming.end());
    const auto new_end = std::unique(incoming.begin(), incoming.end());
    incoming.erase(new_end, incoming.end());
  }
  return preds;
}
// Iterative dominator-set computation with block 0 as the entry.
//   block_count - number of blocks.
//   preds       - predecessor lists, one per block.
// Returns, per block, the set of blocks recorded as dominating it (always
// including the block itself). A non-entry block without predecessors ends up
// with only itself in its set.
std::vector<std::set<size_t>> ComputeDominators(
    size_t block_count, const std::vector<std::vector<size_t>>& preds) {
  std::vector<std::set<size_t>> doms(block_count);
  if (block_count == 0) {
    return doms;
  }

  // Entry dominates only itself; every other block starts from "all blocks".
  std::set<size_t> every_block;
  for (size_t id = 0; id < block_count; ++id) {
    every_block.insert(every_block.end(), id);
  }
  doms[0] = {0};
  std::fill(doms.begin() + 1, doms.end(), every_block);

  bool stable = false;
  while (!stable) {
    stable = true;
    for (size_t block = 1; block < block_count; ++block) {
      const auto& incoming = preds[block];
      std::set<size_t> meet;
      for (size_t k = 0; k < incoming.size(); ++k) {
        const std::set<size_t>& pred_doms = doms[incoming[k]];
        if (k == 0) {
          meet = pred_doms;
          continue;
        }
        // Keep only the blocks common to all predecessors seen so far.
        std::set<size_t> common;
        for (const size_t candidate : meet) {
          if (pred_doms.count(candidate) != 0) {
            common.insert(common.end(), candidate);
          }
        }
        meet.swap(common);
      }
      meet.insert(block);
      if (meet != doms[block]) {
        doms[block] = std::move(meet);
        stable = false;
      }
    }
  }
  return doms;
}
// Collects the blocks of the natural loop for back edge latch -> header: the
// header, the latch, and every block reached by walking predecessors backwards
// from the latch without passing through the header.
std::unordered_set<size_t> BuildNaturalLoop(
    size_t header, size_t latch,
    const std::vector<std::vector<size_t>>& preds) {
  std::unordered_set<size_t> members;
  members.insert(header);
  members.insert(latch);

  std::vector<size_t> pending(1, latch);
  while (!pending.empty()) {
    const size_t current = pending.back();
    pending.pop_back();
    for (const size_t pred : preds[current]) {
      const bool newly_added = members.insert(pred).second;
      if (newly_added && pred != header) {
        pending.push_back(pred);
      }
    }
  }
  return members;
}
// True when no block of the loop other than `header` has a predecessor
// outside the loop, i.e. the loop can only be entered through its header.
bool HasSingleEntry(size_t header, const std::unordered_set<size_t>& loop_blocks,
                    const std::vector<std::vector<size_t>>& preds) {
  const auto is_outside = [&loop_blocks](size_t block) {
    return loop_blocks.find(block) == loop_blocks.end();
  };
  for (const size_t member : loop_blocks) {
    if (member == header) {
      continue;
    }
    const auto& incoming = preds[member];
    if (std::any_of(incoming.begin(), incoming.end(), is_outside)) {
      return false;
    }
  }
  return true;
}
// Finds natural loops of `function` and, for each, the frame slots worth
// keeping in registers for the duration of the loop.
// Returns the candidates sorted by descending score, then by descending block
// count, then by ascending header index. Loops without any eligible slot are
// not returned.
std::vector<LoopCandidate> FindLoopCandidates(MachineFunction& function) {
const auto& blocks = function.GetBlocks();
// Branch targets are symbols; map block names back to block indices.
std::unordered_map<std::string, size_t> block_index_by_name;
for (size_t i = 0; i < blocks.size(); ++i) {
block_index_by_name[blocks[i]->GetName()] = i;
}
// Slots referenced anywhere in the function by LoadStackOffset /
// StoreStackOffset / LoadStackAddr are never promoted.
std::unordered_set<int> opaque_slots;
for (const auto& bb : blocks) {
for (const auto& inst : bb->GetInstructions()) {
int slot = -1;
if (IsOpaqueSlotUse(inst, &slot)) opaque_slots.insert(slot);
}
}
auto succs = BuildSuccessors(function, block_index_by_name);
auto preds = BuildPredecessors(succs);
auto doms = ComputeDominators(blocks.size(), preds);
std::vector<LoopCandidate> candidates;
// An edge latch -> header is a back edge when header is in the dominator set
// of latch. Self-loops (header == latch) are skipped.
for (size_t latch = 0; latch < blocks.size(); ++latch) {
for (size_t header : succs[latch]) {
if (header == latch) continue;
if (header >= doms.size() || doms[latch].count(header) == 0) continue;
auto loop_blocks = BuildNaturalLoop(header, latch, preds);
// Loops with more than 24 blocks or with a side entry are not considered.
if (loop_blocks.size() > 24) continue;
if (!HasSingleEntry(header, loop_blocks, preds)) continue;
// Gather per-slot access statistics over all blocks of the loop.
std::unordered_map<int, SlotUseInfo> slot_info;
for (size_t bi : loop_blocks) {
// Set once a Bl has been seen in the current block; later accesses in the
// same block count as "after call".
bool seen_call = false;
for (const auto& cur : blocks[bi]->GetInstructions()) {
if (cur.GetOpcode() == Opcode::Bl) {
seen_call = true;
}
if (auto slot = GetLoadSlot(cur);
slot.has_value() && !opaque_slots.count(*slot)) {
auto& info = slot_info[*slot];
info.slot = *slot;
const auto& ops = cur.GetOperands();
// A load whose first operand is not a register disqualifies the slot.
if (ops.empty() || !ops[0].IsReg()) {
info.reg_kind = SlotUseInfo::RegKind::Invalid;
} else {
NoteSlotRegUse(info, ops[0].GetReg());
}
++info.loads;
info.use_blocks.insert(bi);
if (seen_call) ++info.after_call_uses;
if (bi != header) ++info.body_loads;
}
if (auto slot = GetStoreSlot(cur);
slot.has_value() && !opaque_slots.count(*slot)) {
auto& info = slot_info[*slot];
info.slot = *slot;
const auto& ops = cur.GetOperands();
// A store whose first operand is not a register disqualifies the slot.
if (ops.empty() || !ops[0].IsReg()) {
info.reg_kind = SlotUseInfo::RegKind::Invalid;
} else {
NoteSlotRegUse(info, ops[0].GetReg());
}
++info.stores;
info.use_blocks.insert(bi);
if (seen_call) ++info.after_call_uses;
if (bi != header) ++info.body_stores;
}
}
}
// Filter out slots that cannot or should not be promoted.
std::vector<SlotUseInfo> ranked;
for (const auto& [slot, info] : slot_info) {
if (info.reg_kind == SlotUseInfo::RegKind::Invalid ||
info.reg_kind == SlotUseInfo::RegKind::Unknown) {
continue;
}
// The slot size must match the register class: 8 for X, 4 for W and S.
const int slot_size = function.GetFrameSlot(slot).size;
if (info.reg_kind == SlotUseInfo::RegKind::X) {
if (slot_size != 8) continue;
} else if (slot_size != 4) {
continue;
}
// Require at least one load and at least two accesses in total.
if (info.loads == 0) continue;
if (info.stores == 0 && info.loads < 2) continue;
if (info.stores > 0 && info.loads + info.stores < 2) continue;
ranked.push_back(info);
}
// Best score first; ties broken by slot index for determinism.
std::sort(ranked.begin(), ranked.end(),
[](const SlotUseInfo& lhs, const SlotUseInfo& rhs) {
int lhs_score = SlotScore(lhs);
int rhs_score = SlotScore(rhs);
if (lhs_score != rhs_score) return lhs_score > rhs_score;
return lhs.slot < rhs.slot;
});
if (ranked.empty()) continue;
LoopCandidate cand;
cand.header = header;
cand.latch = latch;
cand.blocks = std::move(loop_blocks);
// Take slots in rank order until the per-class register budgets are used up:
// W and X slots share a budget of 6, S slots have a budget of 3.
int gpr_slots = 0;
int s_slots = 0;
constexpr int kMaxGprSlots = 6;
constexpr int kMaxSSlots = 3;
for (const auto& info : ranked) {
if (info.reg_kind == SlotUseInfo::RegKind::W ||
info.reg_kind == SlotUseInfo::RegKind::X) {
if (gpr_slots >= kMaxGprSlots) continue;
++gpr_slots;
} else if (info.reg_kind == SlotUseInfo::RegKind::S) {
if (s_slots >= kMaxSSlots) continue;
++s_slots;
} else {
continue;
}
// write_back is only needed when the loop stores to the slot.
cand.slots.push_back(
SlotPick{info.slot, info.reg_kind, info.stores > 0});
cand.score += SlotScore(info);
}
if (cand.slots.empty()) continue;
candidates.push_back(std::move(cand));
}
}
std::sort(candidates.begin(), candidates.end(),
[](const LoopCandidate& lhs, const LoopCandidate& rhs) {
if (lhs.score != rhs.score) return lhs.score > rhs.score;
if (lhs.blocks.size() != rhs.blocks.size()) {
return lhs.blocks.size() > rhs.blocks.size();
}
return lhs.header < rhs.header;
});
return candidates;
}
// Rewrites `loop` so that each picked slot lives in a dedicated register while
// the loop runs:
//   1. assign W19..W24 / X19..X24 (shared index) or S8..S10 to the slots and
//      register them with function.AddUsedCalleeSaved;
//   2. inside the loop, replace LoadStack/StoreStack of a promoted slot by a
//      register move (or nothing when the registers already match);
//   3. in every predecessor of the header outside the loop, load the slots
//      into their registers before the first terminator;
//   4. on every loop exit, store written slots back to the stack.
// NOTE(review): step 3 writes into blocks outside `loop.blocks`. If such a
// predecessor lies inside another loop that was promoted with the same
// registers, or is shared with one, the reloads clobber that loop's
// registers; callers must not promote such combinations.
void PromoteLoopSlots(MachineFunction& function, const LoopCandidate& loop) {
const std::vector<PhysReg> s_regs = {PhysReg::S8, PhysReg::S9,
PhysReg::S10};
std::unordered_map<int, Promotion> slot_to_promotion;
std::vector<Promotion> promotions;
size_t next_gpr_reg = 0;
size_t next_s_reg = 0;
// Step 1: register assignment. W and X slots draw from the same index so a
// W register and its X counterpart are never handed out together.
for (const auto& slot : loop.slots) {
PhysReg reg = PhysReg::W19;
if (slot.reg_kind == SlotUseInfo::RegKind::W ||
slot.reg_kind == SlotUseInfo::RegKind::X) {
if (next_gpr_reg >= 6) continue;
reg = GprForIndex(slot.reg_kind, next_gpr_reg++);
} else if (slot.reg_kind == SlotUseInfo::RegKind::S) {
if (next_s_reg >= s_regs.size()) continue;
reg = s_regs[next_s_reg++];
} else {
continue;
}
Promotion promotion{slot.slot, reg, slot.reg_kind, slot.write_back};
slot_to_promotion[slot.slot] = promotion;
promotions.push_back(promotion);
function.AddUsedCalleeSaved(reg);
}
const auto& blocks = function.GetBlocks();
std::unordered_map<std::string, size_t> block_index_by_name;
for (size_t i = 0; i < blocks.size(); ++i) {
block_index_by_name[blocks[i]->GetName()] = i;
}
auto succs = BuildSuccessors(function, block_index_by_name);
auto preds = BuildPredecessors(succs);
// Step 2: rewrite slot accesses inside the loop.
for (size_t bi : SortedLoopBlocks(loop)) {
auto& insts = blocks[bi]->GetInstructions();
std::vector<MachineInstr> rewritten;
rewritten.reserve(insts.size());
for (const auto& inst : insts) {
if (auto slot = GetLoadSlot(inst); slot.has_value()) {
auto it = slot_to_promotion.find(*slot);
if (it != slot_to_promotion.end()) {
const auto& ops = inst.GetOperands();
PhysReg dst = ops[0].GetReg();
// Load becomes "dst <- promoted register"; dropped entirely when equal.
if (!SameReg(dst, it->second.reg)) {
Opcode mov_opcode =
it->second.reg_kind == SlotUseInfo::RegKind::S
? Opcode::FMovReg
: Opcode::MovReg;
rewritten.emplace_back(
mov_opcode,
std::vector<Operand>{Operand::Reg(dst),
Operand::Reg(it->second.reg)});
}
continue;
}
}
if (auto slot = GetStoreSlot(inst); slot.has_value()) {
auto it = slot_to_promotion.find(*slot);
if (it != slot_to_promotion.end()) {
const auto& ops = inst.GetOperands();
PhysReg src = ops[0].GetReg();
// Store becomes "promoted register <- src"; dropped entirely when equal.
if (!SameReg(src, it->second.reg)) {
Opcode mov_opcode =
it->second.reg_kind == SlotUseInfo::RegKind::S
? Opcode::FMovReg
: Opcode::MovReg;
rewritten.emplace_back(
mov_opcode,
std::vector<Operand>{Operand::Reg(it->second.reg),
Operand::Reg(src)});
}
continue;
}
}
rewritten.push_back(inst);
}
insts = std::move(rewritten);
}
// Step 3: initial loads in every outside predecessor of the header.
for (size_t pred = 0; pred < blocks.size(); ++pred) {
if (std::find(succs[pred].begin(), succs[pred].end(), loop.header) ==
succs[pred].end()) {
continue;
}
if (InLoop(loop, pred)) continue;
std::vector<MachineInstr> loads;
for (const auto& promotion : promotions) {
loads.emplace_back(Opcode::LoadStack,
std::vector<Operand>{
Operand::Reg(promotion.reg),
Operand::FrameIndex(promotion.slot)});
}
InsertBeforeTerminators(blocks[pred]->GetInstructions(), loads);
}
// Step 4: write-back on loop exits. An exit block reached only from loop
// blocks gets the stores at its beginning (once per exit block). When an exit
// block also has predecessors outside the loop, the stores are placed in the
// exiting loop block itself, before its first terminator.
std::unordered_set<size_t> exit_blocks_with_stores;
for (size_t bi : SortedLoopBlocks(loop)) {
bool needs_local_exit_store = false;
for (size_t succ : succs[bi]) {
if (InLoop(loop, succ)) continue;
bool exit_has_only_loop_preds = true;
for (size_t pred : preds[succ]) {
if (!InLoop(loop, pred)) {
exit_has_only_loop_preds = false;
break;
}
}
if (exit_has_only_loop_preds) {
if (exit_blocks_with_stores.insert(succ).second) {
std::vector<MachineInstr> stores;
for (const auto& promotion : promotions) {
if (!promotion.write_back) continue;
stores.emplace_back(
Opcode::StoreStack,
std::vector<Operand>{
Operand::Reg(promotion.reg),
Operand::FrameIndex(promotion.slot)});
}
auto& exit_insts = blocks[succ]->GetInstructions();
exit_insts.insert(exit_insts.begin(), stores.begin(), stores.end());
}
} else {
needs_local_exit_store = true;
}
}
if (!needs_local_exit_store) continue;
std::vector<MachineInstr> stores;
for (const auto& promotion : promotions) {
if (!promotion.write_back) continue;
stores.emplace_back(Opcode::StoreStack,
std::vector<Operand>{
Operand::Reg(promotion.reg),
Operand::FrameIndex(promotion.slot)});
}
InsertBeforeTerminators(blocks[bi]->GetInstructions(), stores);
}
}
} // namespace
// Promotes stack slots to callee-saved registers in up to kMaxPromotedLoops
// loops of `function`, best-scoring candidates first; candidates scoring below
// kMinLoopScore are ignored.
//
// Every promoted loop uses the same fixed registers, so the regions in which
// those registers are live must not intersect. Promotion touches the loop's
// own blocks and also the predecessors of its header that lie outside the
// loop (reload code is inserted there). A candidate is therefore rejected
// when
//   - one of its blocks belongs to an already promoted loop,
//   - one of its blocks is an entry predecessor of an already promoted loop,
//   - one of its entry predecessors belongs to an already promoted loop, or
//   - one of its entry predecessors is shared with an already promoted loop.
// Checking only the first condition would let the reloads of one loop clobber
// the registers of an adjacent promoted loop.
void RunLoopSlotPromotion(MachineFunction& function) {
  auto candidates = FindLoopCandidates(function);
  if (candidates.empty()) {
    return;
  }

  // Promotion never changes terminators or block order, so one CFG snapshot
  // stays valid for the whole run.
  const auto& blocks = function.GetBlocks();
  std::unordered_map<std::string, size_t> block_index_by_name;
  for (size_t i = 0; i < blocks.size(); ++i) {
    block_index_by_name[blocks[i]->GetName()] = i;
  }
  const auto succs = BuildSuccessors(function, block_index_by_name);

  // Blocks outside `loop` that branch or fall through to its header.
  const auto collect_entry_preds = [&succs](const LoopCandidate& loop) {
    std::vector<size_t> entry_preds;
    for (size_t pred = 0; pred < succs.size(); ++pred) {
      if (InLoop(loop, pred)) continue;
      if (std::find(succs[pred].begin(), succs[pred].end(), loop.header) !=
          succs[pred].end()) {
        entry_preds.push_back(pred);
      }
    }
    return entry_preds;
  };

  std::unordered_set<size_t> promoted_blocks;
  std::unordered_set<size_t> promoted_entry_preds;
  const auto is_reserved = [&](size_t block) {
    return promoted_blocks.count(block) != 0 ||
           promoted_entry_preds.count(block) != 0;
  };

  int promoted_loop_count = 0;
  constexpr int kMaxPromotedLoops = 4;
  constexpr int kMinLoopScore = 32;
  for (const auto& loop : candidates) {
    // Candidates are sorted by descending score, so nothing later qualifies.
    if (loop.score < kMinLoopScore) break;

    const std::vector<size_t> entry_preds = collect_entry_preds(loop);
    const bool conflicts =
        std::any_of(loop.blocks.begin(), loop.blocks.end(), is_reserved) ||
        std::any_of(entry_preds.begin(), entry_preds.end(), is_reserved);
    if (conflicts) continue;

    PromoteLoopSlots(function, loop);
    promoted_blocks.insert(loop.blocks.begin(), loop.blocks.end());
    promoted_entry_preds.insert(entry_preds.begin(), entry_preds.end());
    ++promoted_loop_count;
    if (promoted_loop_count >= kMaxPromotedLoops) break;
  }
}
} // namespace mir

@ -15,6 +15,12 @@ PhysReg CanonicalCalleeSavedReg(PhysReg reg) {
int idx = static_cast<int>(reg) - static_cast<int>(PhysReg::W0);
return static_cast<PhysReg>(static_cast<int>(PhysReg::X0) + idx);
}
if (reg == PhysReg::W19) return PhysReg::X19;
if (reg == PhysReg::W20) return PhysReg::X20;
if (reg == PhysReg::W21) return PhysReg::X21;
if (reg == PhysReg::W22) return PhysReg::X22;
if (reg == PhysReg::W23) return PhysReg::X23;
if (reg == PhysReg::W24) return PhysReg::X24;
return reg;
}

@ -20,6 +20,12 @@ const char* PhysRegName(PhysReg reg) {
case PhysReg::W9: return "w9";
case PhysReg::W10: return "w10";
case PhysReg::W11: return "w11";
case PhysReg::W19: return "w19";
case PhysReg::W20: return "w20";
case PhysReg::W21: return "w21";
case PhysReg::W22: return "w22";
case PhysReg::W23: return "w23";
case PhysReg::W24: return "w24";
case PhysReg::X0: return "x0";
case PhysReg::X1: return "x1";
case PhysReg::X2: return "x2";
@ -35,6 +41,12 @@ const char* PhysRegName(PhysReg reg) {
case PhysReg::X29: return "x29";
case PhysReg::X30: return "x30";
case PhysReg::SP: return "sp";
case PhysReg::X19: return "x19";
case PhysReg::X20: return "x20";
case PhysReg::X21: return "x21";
case PhysReg::X22: return "x22";
case PhysReg::X23: return "x23";
case PhysReg::X24: return "x24";
case PhysReg::S0: return "s0";
case PhysReg::S1: return "s1";
case PhysReg::S2: return "s2";

@ -1,6 +1,9 @@
#include "mir/MIR.h"
#include <algorithm>
#include <optional>
#include <set>
#include <unordered_set>
#include <unordered_map>
#include <vector>
@ -24,7 +27,13 @@ bool IsAbiArgReg(PhysReg reg) {
bool IsWxReg(PhysReg reg) {
return (reg >= PhysReg::W0 && reg <= PhysReg::W10) ||
(reg >= PhysReg::X0 && reg <= PhysReg::X10);
(reg >= PhysReg::X0 && reg <= PhysReg::X10) ||
reg == PhysReg::W19 || reg == PhysReg::W20 ||
reg == PhysReg::W21 || reg == PhysReg::W22 ||
reg == PhysReg::W23 || reg == PhysReg::W24 ||
reg == PhysReg::X19 || reg == PhysReg::X20 ||
reg == PhysReg::X21 || reg == PhysReg::X22 ||
reg == PhysReg::X23 || reg == PhysReg::X24;
}
int WxIndex(PhysReg reg) {
@ -34,6 +43,12 @@ int WxIndex(PhysReg reg) {
if (reg >= PhysReg::X0 && reg <= PhysReg::X10) {
return static_cast<int>(reg) - static_cast<int>(PhysReg::X0);
}
if (reg == PhysReg::W19 || reg == PhysReg::X19) return 19;
if (reg == PhysReg::W20 || reg == PhysReg::X20) return 20;
if (reg == PhysReg::W21 || reg == PhysReg::X21) return 21;
if (reg == PhysReg::W22 || reg == PhysReg::X22) return 22;
if (reg == PhysReg::W23 || reg == PhysReg::X23) return 23;
if (reg == PhysReg::W24 || reg == PhysReg::X24) return 24;
return -1;
}
@ -223,11 +238,17 @@ void RecordStore(std::unordered_map<int, PhysReg>& slot_to_reg,
}
bool IsWReg(PhysReg reg) {
return reg >= PhysReg::W0 && reg <= PhysReg::W11;
return (reg >= PhysReg::W0 && reg <= PhysReg::W11) ||
reg == PhysReg::W19 || reg == PhysReg::W20 ||
reg == PhysReg::W21 || reg == PhysReg::W22 ||
reg == PhysReg::W23 || reg == PhysReg::W24;
}
// True for the 64-bit registers X0..X11, X19..X24, X29 and X30.
bool IsXReg(PhysReg reg) {
  if (reg >= PhysReg::X0 && reg <= PhysReg::X11) {
    return true;
  }
  switch (reg) {
    case PhysReg::X19:
    case PhysReg::X20:
    case PhysReg::X21:
    case PhysReg::X22:
    case PhysReg::X23:
    case PhysReg::X24:
    case PhysReg::X29:
    case PhysReg::X30:
      return true;
    default:
      return false;
  }
}
@ -303,6 +324,507 @@ bool IsNoopImmArithmetic(const MachineInstr& inst) {
return ops[2].GetImm() == 0 && RegAlias(ops[0].GetReg(), ops[1].GetReg());
}
// Returns the frame index held by operand `idx` of `inst`, or nullopt when
// that operand does not exist or is not a frame index.
std::optional<int> GetFrameIndexOperand(const MachineInstr& inst, size_t idx) {
  const auto& operands = inst.GetOperands();
  const bool in_range = idx < operands.size();
  if (in_range && operands[idx].GetKind() == Operand::Kind::FrameIndex) {
    return operands[idx].GetFrameIndex();
  }
  return std::nullopt;
}
bool IsControlTransfer(const MachineInstr& inst) {
switch (inst.GetOpcode()) {
case Opcode::B:
case Opcode::Bcond:
case Opcode::FBcond:
case Opcode::Cbnz:
case Opcode::Cbz:
case Opcode::Ret:
return true;
default:
return false;
}
}
// Returns true when `inst` is a stack access (LoadStack, StoreStack,
// LoadStackOffset, StoreStackOffset or LoadStackAddr) whose operand 1 is a
// frame index equal to `slot`.
bool MayTouchFrameSlot(const MachineInstr& inst, int slot) {
  const auto opcode = inst.GetOpcode();
  const bool is_stack_access =
      opcode == Opcode::LoadStack || opcode == Opcode::StoreStack ||
      opcode == Opcode::LoadStackOffset ||
      opcode == Opcode::StoreStackOffset || opcode == Opcode::LoadStackAddr;
  if (!is_stack_access) {
    return false;
  }
  const auto referenced = GetFrameIndexOperand(inst, 1);
  return referenced.has_value() && *referenced == slot;
}
// For a LoadStack instruction, yields the frame index held in operand 1 (or
// std::nullopt if that operand is not a frame index). Any other opcode yields
// std::nullopt.
std::optional<int> GetLoadStackSlot(const MachineInstr& inst) {
  if (inst.GetOpcode() == Opcode::LoadStack) {
    return GetFrameIndexOperand(inst, 1);
  }
  return std::nullopt;
}
// For a StoreStack instruction, yields the frame index held in operand 1 (or
// std::nullopt if that operand is not a frame index). Any other opcode yields
// std::nullopt.
std::optional<int> GetStoreStackSlot(const MachineInstr& inst) {
  if (inst.GetOpcode() == Opcode::StoreStack) {
    return GetFrameIndexOperand(inst, 1);
  }
  return std::nullopt;
}
bool IsStoreOverwrittenBeforeRead(const std::vector<MachineInstr>& insts,
size_t store_index) {
const auto slot = GetFrameIndexOperand(insts[store_index], 1);
if (!slot.has_value()) {
return false;
}
for (size_t i = store_index + 1; i < insts.size(); ++i) {
const auto& inst = insts[i];
if (IsControlTransfer(inst) || inst.GetOpcode() == Opcode::Bl) {
return false;
}
if (!MayTouchFrameSlot(inst, *slot)) {
continue;
}
if (inst.GetOpcode() == Opcode::StoreStack) {
return true;
}
return false;
}
return false;
}
void RemoveOverwrittenStores(std::vector<MachineInstr>& insts) {
std::vector<MachineInstr> filtered;
filtered.reserve(insts.size());
for (size_t i = 0; i < insts.size(); ++i) {
if (IsStoreStack(insts[i]) && IsStoreOverwrittenBeforeRead(insts, i)) {
continue;
}
filtered.push_back(std::move(insts[i]));
}
insts = std::move(filtered);
}
// Detects the stack accesses this pass cannot reason about slot-by-slot:
// LoadStackOffset, StoreStackOffset and LoadStackAddr. When `inst` is one of
// them and operand 1 is a frame index, writes that index to `*slot` and
// returns true; otherwise returns false and leaves `*slot` untouched.
bool IsOpaqueFrameSlotUse(const MachineInstr& inst, int* slot) {
  const auto opcode = inst.GetOpcode();
  if (opcode != Opcode::LoadStackOffset &&
      opcode != Opcode::StoreStackOffset && opcode != Opcode::LoadStackAddr) {
    return false;
  }
  const auto frame_index = GetFrameIndexOperand(inst, 1);
  if (frame_index.has_value()) {
    *slot = *frame_index;
    return true;
  }
  return false;
}
// Returns true if any instruction in the half-open index range [begin, end)
// touches frame slot `slot` (per MayTouchFrameSlot). `end` is clamped to the
// size of `insts`; an empty or inverted range yields false.
bool HasFrameSlotTouch(const std::vector<MachineInstr>& insts, size_t begin,
                       size_t end, int slot) {
  const size_t limit = std::min(end, insts.size());
  size_t pos = begin;
  while (pos < limit) {
    if (MayTouchFrameSlot(insts[pos], slot)) {
      return true;
    }
    ++pos;
  }
  return false;
}
bool IsStackCopyTail(const std::vector<MachineInstr>& insts, size_t begin) {
for (size_t i = begin; i < insts.size(); ++i) {
const auto opcode = insts[i].GetOpcode();
if (IsControlTransfer(insts[i])) {
continue;
}
if (opcode != Opcode::LoadStack && opcode != Opcode::StoreStack) {
return false;
}
}
return true;
}
bool LoadedRegUsedAfterRemovedStore(const std::vector<MachineInstr>& insts,
size_t begin, PhysReg reg) {
for (size_t i = begin; i < insts.size(); ++i) {
if (IsControlTransfer(insts[i])) {
continue;
}
if (ReadsReg(insts[i], reg)) {
return true;
}
if (auto written = GetWrittenReg(insts[i]);
written.has_value() && RegAlias(*written, reg)) {
return false;
}
}
return false;
}
// True when `inst` either reads `reg` or writes a register that aliases it.
bool RegTouched(const MachineInstr& inst, PhysReg reg) {
  if (ReadsReg(inst, reg)) {
    return true;
  }
  const auto written = GetWrittenReg(inst);
  return written.has_value() && RegAlias(*written, reg);
}
std::unordered_map<const MachineBasicBlock*, std::vector<const MachineBasicBlock*>>
BuildSuccessorMap(const MachineFunction& function) {
std::unordered_map<const MachineBasicBlock*, std::vector<const MachineBasicBlock*>> succs;
const auto& blocks = function.GetBlocks();
auto find_block = [&](const std::string& name) -> const MachineBasicBlock* {
for (const auto& candidate : blocks) {
if (candidate->GetName() == name) {
return candidate.get();
}
}
return nullptr;
};
for (size_t bi = 0; bi < blocks.size(); ++bi) {
const auto* bb = blocks[bi].get();
auto& out = succs[bb];
const auto& insts = bb->GetInstructions();
for (const auto& inst : insts) {
switch (inst.GetOpcode()) {
case Opcode::B:
case Opcode::Bcond:
case Opcode::FBcond: {
const auto& ops = inst.GetOperands();
if (!ops.empty() && ops[0].IsSymbol()) {
if (auto* target = find_block(ops[0].GetSymbol())) {
out.push_back(target);
}
}
break;
}
case Opcode::Cbnz:
case Opcode::Cbz: {
const auto& ops = inst.GetOperands();
if (ops.size() > 1 && ops[1].IsSymbol()) {
if (auto* target = find_block(ops[1].GetSymbol())) {
out.push_back(target);
}
}
break;
}
default:
break;
}
}
if (!insts.empty()) {
Opcode last = insts.back().GetOpcode();
if (last != Opcode::B && last != Opcode::Ret && bi + 1 < blocks.size()) {
out.push_back(blocks[bi + 1].get());
}
}
std::sort(out.begin(), out.end());
out.erase(std::unique(out.begin(), out.end()), out.end());
}
return succs;
}
// Counts, per frame slot, how many LoadStack and StoreStack instructions in
// `function` reference it. Slots listed in `opaque_slots` are not counted.
// Both output maps are cleared before counting.
void CountScalarStackAccesses(const MachineFunction& function,
                              const std::unordered_set<int>& opaque_slots,
                              std::unordered_map<int, int>& load_count,
                              std::unordered_map<int, int>& store_count) {
  load_count.clear();
  store_count.clear();

  const auto is_countable = [&opaque_slots](const std::optional<int>& slot) {
    return slot.has_value() && opaque_slots.count(*slot) == 0;
  };

  for (const auto& block : function.GetBlocks()) {
    for (const auto& inst : block->GetInstructions()) {
      const auto loaded = GetLoadStackSlot(inst);
      if (is_countable(loaded)) {
        load_count[*loaded] += 1;
      }
      const auto stored = GetStoreStackSlot(inst);
      if (is_countable(stored)) {
        store_count[*stored] += 1;
      }
    }
  }
}
// Retargets a temp-slot store straight at the slot its value is later copied
// into, and deletes the copy. Within one block it looks for
//
//   StoreStack r, temp        (index i)
//   ...
//   LoadStack  r2, temp       (index j)
//   StoreStack r3, final      (index j + 1)
//   <tail: only LoadStack/StoreStack/control transfers>
//
// and rewrites instruction i to store into `final`, dropping j and j + 1.
//
// Parameters:
//   function     - function whose blocks are rewritten in place.
//   opaque_slots - slots excluded from the transformation (neither the temp
//                  nor the final slot may be in this set).
//   live_out     - per-block set of slots live on exit; a block with no entry
//                  is treated as having no live-out restriction.
void ForwardLatchTempStores(
MachineFunction& function, const std::unordered_set<int>& opaque_slots,
const std::unordered_map<const MachineBasicBlock*, std::set<int>>& live_out) {
std::unordered_map<int, int> load_count;
std::unordered_map<int, int> store_count;
CountScalarStackAccesses(function, opaque_slots, load_count, store_count);
for (const auto& bb_ptr : function.GetBlocks()) {
auto& insts = bb_ptr->GetInstructions();
// remove[k] marks instruction k for deletion once the block has been scanned.
std::vector<bool> remove(insts.size(), false);
const auto live_out_it = live_out.find(bb_ptr.get());
const std::set<int>* block_live_out =
live_out_it == live_out.end() ? nullptr : &live_out_it->second;
// The pattern needs at least three instructions (store, load, store).
for (size_t i = 0; i + 2 < insts.size(); ++i) {
if (remove[i]) {
continue;
}
// Candidate temp slot: a non-opaque slot with exactly one load and one store
// in the whole function that is not live out of this block.
auto temp_slot = GetStoreStackSlot(insts[i]);
if (!temp_slot.has_value() || opaque_slots.count(*temp_slot) ||
load_count[*temp_slot] != 1 || store_count[*temp_slot] != 1 ||
(block_live_out != nullptr && block_live_out->count(*temp_slot))) {
continue;
}
const auto& store_ops = insts[i].GetOperands();
if (store_ops.empty() || !store_ops[0].IsReg()) {
continue;
}
// Search forward for the matching load; j + 1 must exist for the final store.
for (size_t j = i + 1; j + 1 < insts.size(); ++j) {
// Give up at any control transfer or call between the store and the load.
if (IsControlTransfer(insts[j]) || insts[j].GetOpcode() == Opcode::Bl) {
break;
}
if (!MayTouchFrameSlot(insts[j], *temp_slot)) {
continue;
}
// The first instruction touching the temp slot must be its LoadStack.
auto load_slot = GetLoadStackSlot(insts[j]);
if (!load_slot.has_value() || *load_slot != *temp_slot) {
break;
}
// The load must be followed immediately by a store to a different,
// non-opaque slot that nothing touches between i and j, and everything after
// that store must be stack copies or control transfers.
auto final_slot = GetStoreStackSlot(insts[j + 1]);
if (!final_slot.has_value() || *final_slot == *temp_slot ||
opaque_slots.count(*final_slot) ||
HasFrameSlotTouch(insts, i + 1, j, *final_slot) ||
!IsStackCopyTail(insts, j + 2)) {
break;
}
const auto& load_ops = insts[j].GetOperands();
const auto& final_ops = insts[j + 1].GetOperands();
// The final store must write the register the load produced, with the same
// width as the original store's source, and the loaded register must not be
// needed by the tail once the load is gone.
if (load_ops.empty() || final_ops.empty() || !load_ops[0].IsReg() ||
!final_ops[0].IsReg() ||
!RegAlias(load_ops[0].GetReg(), final_ops[0].GetReg()) ||
!SameRegWidth(store_ops[0].GetReg(), final_ops[0].GetReg()) ||
LoadedRegUsedAfterRemovedStore(insts, j + 2,
load_ops[0].GetReg())) {
break;
}
// Rewrite: the first store now targets the final slot; the load/store copy
// pair becomes redundant.
insts[i].SetOperand(1, Operand::FrameIndex(*final_slot));
remove[j] = true;
remove[j + 1] = true;
break;
}
}
// Compact the block, dropping the instructions marked above.
std::vector<MachineInstr> filtered;
filtered.reserve(insts.size());
for (size_t i = 0; i < insts.size(); ++i) {
if (!remove[i]) {
filtered.push_back(std::move(insts[i]));
}
}
insts = std::move(filtered);
}
}
// Replaces a store/load round trip through a single-use temp slot with a
// register move. Within one block it looks for
//
//   StoreStack src, slot      (index i)
//   ...
//   LoadStack  dst, slot      (index j)
//
// where `slot` is non-opaque, has exactly one load and one store in the whole
// function, and is not live out of the block. The load is deleted and the
// store is either deleted (src aliases dst) or replaced by a move
// `dst <- src` placed at index i.
//
// The rewrite is abandoned when:
//   - a control transfer, Bl, memory clobber or another store to the slot
//     appears before the load;
//   - src and dst differ in width, or belong to different register classes
//     (one floating-point, one not): no move is emitted across classes
//     rather than producing an integer MovReg with a floating-point operand;
//   - dst is read or written between i and j, since the move defines dst
//     earlier than the load did;
//   - src aliases dst and is overwritten between i and j.
//
// Parameters:
//   function     - function whose blocks are rewritten in place.
//   opaque_slots - slots excluded from the transformation.
//   live_out     - per-block set of slots live on exit; a block with no entry
//                  is treated as having no live-out restriction.
void ForwardUniqueStackTemps(
    MachineFunction& function, const std::unordered_set<int>& opaque_slots,
    const std::unordered_map<const MachineBasicBlock*, std::set<int>>& live_out) {
  std::unordered_map<int, int> load_count;
  std::unordered_map<int, int> store_count;
  CountScalarStackAccesses(function, opaque_slots, load_count, store_count);

  for (const auto& bb_ptr : function.GetBlocks()) {
    auto& insts = bb_ptr->GetInstructions();
    const auto live_out_it = live_out.find(bb_ptr.get());
    const std::set<int>* block_live_out =
        live_out_it == live_out.end() ? nullptr : &live_out_it->second;

    // remove[k]: delete instruction k. replacement[k]: emit this instead of k.
    std::vector<bool> remove(insts.size(), false);
    std::vector<std::optional<MachineInstr>> replacement(insts.size());

    for (size_t i = 0; i < insts.size(); ++i) {
      auto slot = GetStoreStackSlot(insts[i]);
      if (!slot.has_value() || opaque_slots.count(*slot) ||
          load_count[*slot] != 1 || store_count[*slot] != 1 ||
          (block_live_out != nullptr && block_live_out->count(*slot))) {
        continue;
      }
      const auto& store_ops = insts[i].GetOperands();
      if (store_ops.empty() || !store_ops[0].IsReg()) continue;
      const PhysReg src = store_ops[0].GetReg();

      for (size_t j = i + 1; j < insts.size(); ++j) {
        if (IsControlTransfer(insts[j]) || insts[j].GetOpcode() == Opcode::Bl ||
            IsMemoryClobber(insts[j])) {
          break;
        }
        if (auto touched_slot = GetStoreStackSlot(insts[j]);
            touched_slot.has_value() && *touched_slot == *slot) {
          break;
        }
        auto load_slot = GetLoadStackSlot(insts[j]);
        if (!load_slot.has_value() || *load_slot != *slot) {
          continue;
        }

        const auto& load_ops = insts[j].GetOperands();
        if (load_ops.empty() || !load_ops[0].IsReg()) break;
        const PhysReg dst = load_ops[0].GetReg();
        if (!SameRegWidth(src, dst)) break;
        // A float<->integer pair cannot be expressed with MovReg/FMovReg as
        // chosen below; keep the stack round trip in that case.
        if (IsFloatReg(src) != IsFloatReg(dst)) break;

        // The value will live in dst from index i onwards, so dst must be
        // completely untouched until the original load position.
        bool can_hold_in_dst = true;
        for (size_t k = i + 1; k < j; ++k) {
          if (RegTouched(insts[k], dst)) {
            can_hold_in_dst = false;
            break;
          }
        }
        if (!can_hold_in_dst) break;

        if (RegAlias(src, dst)) {
          // Same register on both sides: the store can simply disappear as
          // long as the register still holds the value at the load.
          bool src_clobbered = false;
          for (size_t k = i + 1; k < j; ++k) {
            if (auto written = GetWrittenReg(insts[k]);
                written.has_value() && RegAlias(*written, src)) {
              src_clobbered = true;
              break;
            }
          }
          if (src_clobbered) break;
          remove[i] = true;
        } else {
          // Both registers are in the same class here (checked above).
          const Opcode mv_op =
              IsFloatReg(src) ? Opcode::FMovReg : Opcode::MovReg;
          replacement[i] = MachineInstr(
              mv_op, std::vector<Operand>{Operand::Reg(dst),
                                          Operand::Reg(src)});
        }
        remove[j] = true;
        break;
      }
    }

    // Rebuild the block with deletions and replacements applied.
    std::vector<MachineInstr> filtered;
    filtered.reserve(insts.size());
    for (size_t i = 0; i < insts.size(); ++i) {
      if (remove[i]) continue;
      if (replacement[i].has_value()) {
        filtered.push_back(std::move(*replacement[i]));
        continue;
      }
      filtered.push_back(std::move(insts[i]));
    }
    insts = std::move(filtered);
  }
}
// Deletes StoreStack instructions whose slot is never read afterwards, then
// runs the two temp-slot forwarding passes.
//
// Steps:
//   1. Collect "opaque" slots (those referenced by LoadStackOffset,
//      StoreStackOffset or LoadStackAddr); they are excluded from everything
//      below.
//   2. Compute per-block use/def sets over the remaining slots: a LoadStack
//      is a use unless the block already stored to the slot; a StoreStack is
//      a def.
//   3. Iterate backward liveness (live_out = union of successors' live_in,
//      live_in = (live_out - def) + use) to a fixed point.
//   4. Walk each block backwards from its live-out set and drop every store
//      to a slot that is not live at that point.
//   5. Call ForwardLatchTempStores and ForwardUniqueStackTemps with the
//      live-out sets computed in step 3.
//
// The per-slot load/store counters that used to be accumulated in step 2 were
// never read and have been removed; the forwarding passes compute their own.
void RemoveDeadScalarStores(MachineFunction& function) {
  std::unordered_set<int> opaque_slots;
  for (const auto& bb_ptr : function.GetBlocks()) {
    for (const auto& inst : bb_ptr->GetInstructions()) {
      int slot = -1;
      if (IsOpaqueFrameSlotUse(inst, &slot)) {
        opaque_slots.insert(slot);
      }
    }
  }

  const auto succs = BuildSuccessorMap(function);
  std::unordered_map<const MachineBasicBlock*, std::set<int>> use;
  std::unordered_map<const MachineBasicBlock*, std::set<int>> def;
  std::unordered_map<const MachineBasicBlock*, std::set<int>> live_in;
  std::unordered_map<const MachineBasicBlock*, std::set<int>> live_out;

  for (const auto& bb_ptr : function.GetBlocks()) {
    const auto* bb = bb_ptr.get();
    for (const auto& inst : bb->GetInstructions()) {
      if (auto slot = GetLoadStackSlot(inst); slot.has_value()) {
        // Only an upward-exposed read counts as a use.
        if (!opaque_slots.count(*slot) && !def[bb].count(*slot)) {
          use[bb].insert(*slot);
        }
      }
      if (auto slot = GetStoreStackSlot(inst); slot.has_value()) {
        if (!opaque_slots.count(*slot)) {
          def[bb].insert(*slot);
        }
      }
    }
  }

  bool changed = true;
  while (changed) {
    changed = false;
    const auto& blocks = function.GetBlocks();
    // Visit blocks in reverse layout order.
    for (int bi = static_cast<int>(blocks.size()) - 1; bi >= 0; --bi) {
      const auto* bb = blocks[bi].get();
      std::set<int> new_out;
      if (auto it = succs.find(bb); it != succs.end()) {
        for (const auto* succ : it->second) {
          const auto& succ_in = live_in[succ];
          new_out.insert(succ_in.begin(), succ_in.end());
        }
      }
      std::set<int> new_in = new_out;
      for (int slot : def[bb]) {
        new_in.erase(slot);
      }
      new_in.insert(use[bb].begin(), use[bb].end());
      if (new_out != live_out[bb] || new_in != live_in[bb]) {
        live_out[bb] = std::move(new_out);
        live_in[bb] = std::move(new_in);
        changed = true;
      }
    }
  }

  for (const auto& bb_ptr : function.GetBlocks()) {
    auto& insts = bb_ptr->GetInstructions();
    std::set<int> live = live_out[bb_ptr.get()];
    // Kept instructions are collected in reverse and flipped at the end.
    std::vector<MachineInstr> filtered;
    filtered.reserve(insts.size());
    for (int i = static_cast<int>(insts.size()) - 1; i >= 0; --i) {
      auto& inst = insts[i];
      if (auto slot = GetLoadStackSlot(inst);
          slot.has_value() && !opaque_slots.count(*slot)) {
        live.insert(*slot);
        filtered.push_back(std::move(inst));
        continue;
      }
      if (auto slot = GetStoreStackSlot(inst);
          slot.has_value() && !opaque_slots.count(*slot)) {
        if (!live.count(*slot)) {
          // Nothing reads this value later: drop the store.
          continue;
        }
        live.erase(*slot);
        filtered.push_back(std::move(inst));
        continue;
      }
      filtered.push_back(std::move(inst));
    }
    std::reverse(filtered.begin(), filtered.end());
    insts = std::move(filtered);
  }

  ForwardLatchTempStores(function, opaque_slots, live_out);
  ForwardUniqueStackTemps(function, opaque_slots, live_out);
}
} // namespace
void RunPeephole(MachineFunction& function) {
@ -422,8 +944,10 @@ void RunPeephole(MachineFunction& function) {
}
}
RemoveOverwrittenStores(optimized);
insts = std::move(optimized);
}
RemoveDeadScalarStores(function);
}
} // namespace mir

Binary file not shown.

@ -39,3 +39,54 @@
- 方案：IR 文本打印补齐 `ConstantFloat` 的 LLVM 十六进制浮点常量格式，把浮点二元运算/比较输出为 `fadd/fsub/fmul/fdiv`、`fcmp o*`；比较结果打印为 `i1` 后再 `zext` 回内部 `i32` 约定；删除不可达块时同步清理后继 phi 入边；scaled 间接访存补齐 peephole 的读寄存器与内存 clobber 描述。
- 代码位置:`src/ir/IRPrinter.cpp`、`src/ir/passes/CFGSimplify.cpp`、`src/mir/passes/Peephole.cpp`。
## 8. ASM 后端栈槽死写删除
- 方案:在 MIR peephole 中增加纯标量 frame slot 的活跃性分析;对没有地址暴露、没有 offset 访问的栈槽,若 `StoreStack` 写入后到块尾/后继都不会被 `LoadStack` 读取,则删除该死写,同时保留原有局部 store-load 前递。
- 代码位置:`src/mir/passes/Peephole.cpp`,新增 `RemoveDeadScalarStores` 及 CFG successor 推导。
## 10. ASM 后端栈帧压缩
- 方案：FrameLowering 前扫描实际仍被指令引用的 frame slot，被 peephole 删除引用的死槽不再分配栈空间，减少栈帧大小和大偏移 `sub x11; ldr/str [x11]` 展开。
- 代码位置:`src/mir/FrameLowering.cpp`,新增 `CollectUsedFrameSlots` 并跳过未引用槽。
## 11. 循环 lowering 尝试记录
- 方案：尝试将循环内 `i32 phi` 提升为跨块 MIR vreg，但当前寄存器分配仍假设跨块值经栈槽传递，直接改写会破坏跨块活跃性，已回退该方向。保留一个带全函数唯一读写保护的临时槽消除规则，未命中 `2025-MYO-20` 主热循环。
- 代码位置:`src/mir/passes/Peephole.cpp`。
## 12. ASM 循环回边临时槽转发
- 方案：对循环 latch 尾部的 `store temp ... load temp; store phi` 形态做块内专用转发，把第一次写 temp 改成直接写 phi，并删除尾部回拷；要求 temp 全函数唯一读写、无地址暴露、目标 phi 槽在移动区间内未被触碰，且尾部只剩栈槽回拷和跳转。
- 代码位置:`src/mir/passes/Peephole.cpp`,新增 `ForwardLatchTempStores`;配合 `src/mir/FrameLowering.cpp` 的死槽栈帧压缩继续减少帧大小。
## 13. Polyhedral-lite 前置:局部内存值编号
- 方案：完整 loop interchange/tiling 需要先有可靠的仿射访存依赖基础；先在局部 CSE 中纳入 `load`，只复用同一基本块内相同指针且中间没有 `store/call` 的读值，减少仿射地址重复 load，并为后续保守依赖分析打基础。
- 代码位置：`src/ir/passes/CSE.cpp`，`Load` 加入 CSE key，遇到 `Store/Call` 清空 load 记录。
## 14. Polyhedral-lite 内存转发与保守别名失效
- 方案：在局部内存值编号中记录同块内最近一次精确指针 `store`，后续同指针 `load` 直接替换为已存值；`store` 不再清空全部 load，而是只清空可能别名的内存记录。别名判断增加简单仿射 GEP key，支持 `add/sub/常量乘` 下标；不同全局变量/不同 alloca 基址视为不别名，复杂表达式退回保守处理，函数指针参数仍视为可能别名。
- 代码位置：`src/ir/passes/CSE.cpp`，新增 `BuildAffineExpr`、`BuildMemoryKey`、`MayAlias`、`InvalidateMayAliasMemory` 与 `store_values`。
## 15. MIR 循环标量槽寄存器化
- 方案：在 `RegAlloc` 后、`FrameLowering` 前按 CFG 回边与支配关系识别 natural loop，不再用物理块连续区间近似循环；同一函数内最多选择 4 个互不重叠热点循环，按循环体 load/store 次数和“既读又写”的 accumulator 特征挑选最多 6 个 4 字节纯标量 frame slot，用 `w19-w24` 保存在循环内；入口前驱加载初值，退出写回优先下沉到唯一出口块，减少条件块上的热路径写栈。
- 代码位置：`src/mir/LoopSlotPromotion.cpp`，新增/扩展 `RunLoopSlotPromotion`；`include/mir/MIR.h`、`src/mir/Register.cpp`、`src/mir/FrameLowering.cpp`、`src/mir/passes/Peephole.cpp` 支持 `w19-w24/x19-x24`；`src/main.cpp` 在 `RunRegAlloc` 后调用。
- 修复：候选槽必须所有普通 `LoadStack/StoreStack` 都使用 W 寄存器，排除 4 字节浮点 `s` 栈槽，避免生成非法的 `mov s*, w19-w24`。
## 16. MIR 浮点标量槽寄存器化
- 方案:在循环槽寄存器化中按 `LoadStack/StoreStack` 的实际寄存器类别区分整型槽与浮点槽;整型槽继续用 `w19-w24`,浮点槽用 callee-saved `s8-s10`,循环内改写为 `FMovReg`,入口/出口仍用栈槽加载和写回。评分上提高跨基本块使用槽的优先级,优先保留 float accumulator 这类跨展开块活跃值,避免 `s8-s10` 被单块临时值占满。
- 代码位置:`src/mir/LoopSlotPromotion.cpp`,新增 slot register kind 分类、`s8-s10` promotion 分配和 `FMovReg` 重写。
## 17. 浮点临时栈槽前递与跨调用评分
- 方案：MIR peephole 对全函数唯一 store/load 且块外不活跃的纯临时栈槽做保守前递，把 `StoreStack temp; ...; LoadStack temp` 改为提前的 `MovReg/FMovReg`，消除单块 float 临时值的落栈；循环槽评分对 `Bl` 之后使用的浮点槽加权，优先保留跨调用后的 float accumulator。
- 代码位置：`src/mir/passes/Peephole.cpp`，新增 `ForwardUniqueStackTemps`；`src/mir/LoopSlotPromotion.cpp`，增加 `after_call_uses` 评分。
## 18. MIR 循环只读栈槽寄存器化
- 方案：循环槽寄存器化支持 `stores == 0` 的 read-only 栈槽，入口加载一次后把循环内 `LoadStack` 改为寄存器移动，退出不写回；新增 `X` 类槽并让 `W/X` 共用 `19-24` 的 callee-saved 编号，避免别名冲突。该规则用于提升 `large_loop_array_2` 中 `loop(x, y, len)` 的只读参数槽，减少热循环中反复读取 `x/y/length` 栈槽。
- 代码位置:`src/mir/LoopSlotPromotion.cpp`,扩展 slot kind、read-only 候选和统一 GPR 分配;`src/mir/MIRFunction.cpp`,把 `w19-w24` 规范化到对应 `x19-x24`,避免重复保存。

Loading…
Cancel
Save