diff --git a/src/mir/AsmPrinter.cpp b/src/mir/AsmPrinter.cpp index 756deeb1..596fafef 100644 --- a/src/mir/AsmPrinter.cpp +++ b/src/mir/AsmPrinter.cpp @@ -256,10 +256,20 @@ namespace mir } } + // ADRP 缓存——避免连续访问同一全局变量时重复发射 ADRP + std::string g_cached_adrp_symbol; + bool g_adrp_cache_valid = false; + + void InvalidateAdrpCache() + { + g_adrp_cache_valid = false; + } + void EmitStackAdjust(const char *op, int amount, std::ostream &os) { if (amount > 12285) { + InvalidateAdrpCache(); os << " movz x13, #" << (amount & 0xFFFF) << "\n"; if ((amount >> 16) != 0) os << " movk x13, #" << ((amount >> 16) & 0xFFFF) << ", lsl #16\n"; @@ -287,6 +297,7 @@ namespace mir { if (offset > 12285) { + InvalidateAdrpCache(); os << " movz x13, #" << (offset & 0xFFFF) << "\n"; if ((offset >> 16) != 0) os << " movk x13, #" << ((offset >> 16) & 0xFFFF) << ", lsl #16\n"; @@ -297,6 +308,7 @@ namespace mir if (offset < -12285) { int abs_off = -offset; + InvalidateAdrpCache(); os << " movz x13, #" << (abs_off & 0xFFFF) << "\n"; if ((abs_off >> 16) != 0) os << " movk x13, #" << ((abs_off >> 16) & 0xFFFF) << ", lsl #16\n"; @@ -365,7 +377,17 @@ namespace mir const std::string asm_symbol = NormalizeAsmSymbol(symbol); const PhysReg scratch_xreg = PrinterScratchXReg(); - os << " adrp " << PhysRegName(scratch_xreg) << ", " << asm_symbol << "\n"; + if (g_adrp_cache_valid && g_cached_adrp_symbol == asm_symbol) + { + // x13 已持有该全局变量的页面地址,跳过 ADRP + } + else + { + os << " adrp " << PhysRegName(scratch_xreg) << ", " << asm_symbol << "\n"; + g_cached_adrp_symbol = asm_symbol; + g_adrp_cache_valid = true; + } + os << " " << (opcode == Opcode::LoadGlobal ? 
"ldr " : "str "); PrintOperand(reg, os); os << ", [" << PhysRegName(scratch_xreg) << ", #:lo12:" << asm_symbol << "]\n"; @@ -823,6 +845,10 @@ namespace mir } return; + case Opcode::Call: + InvalidateAdrpCache(); + // 不 break,落到 default 让泛型打印机输出 bl 指令 + default: break; } @@ -878,6 +904,7 @@ namespace mir void PrintAsm(const MachineFunction &function, std::ostream &os) { + g_adrp_cache_valid = false; const std::string asm_name = NormalizeAsmSymbol(function.GetName()); os << " .text\n"; @@ -893,6 +920,8 @@ namespace mir } const auto &block = *block_ptr; + // 每个基本块重置 ADRP 缓存——跨块时 x13 可能已被 call/clobber 破坏 + g_adrp_cache_valid = false; PrintBlockLabelRef(function, block.GetLabelId(), os); os << ":\n"; diff --git a/优化记录.md b/优化记录.md index 1013368e..0e46a4c4 100644 --- a/优化记录.md +++ b/优化记录.md @@ -57,3 +57,15 @@ - **退化**:无 - **功能测试**:100/100 functional 通过,30/31 h_functional 通过(1 个预存故障 30_many_dimensions) - **已知局限**:仅修复 EmitLargeImmediate;EmitStackAdjust/EmitAddressFromBase 中的 movz 模式仍有同样问题,可后续统一 + +--- + +## 2026-05-25 | ADRP 冗余消除 + +- **类型**:后端(AsmPrinter) +- **假设**:连续访问同一全局变量时,x13 已持有页面地址,后续 ADRP 冗余。例如 `adrp x13, k; str w8, [x13, :lo12:k]; adrp x13, k` 中第二个 ADRP 多余 +- **实现**:AsmPrinter 添加 ADRP 缓存(g_cached_adrp_symbol + g_adrp_cache_valid)。PrintGlobalAccess 检测同符号命中时跳过 ADRP。EmitStackAdjust/EmitAddressFromBase 使用 x13 时失效缓存。Call 指令失效缓存(x13 caller-saved)。每个基本块入口重置缓存(跨块时 call/clobber 不确定) +- **指令数效果**:减少 135 条(条数为各基准 3 个输入合计,百分比为相对基线的降幅),shuffle -48(-3.4%)、crypto -27(-0.5%)、conv2d -21(-1.1%)、fft -12(-0.7%)、huffman -9(-0.4%)、h-9 -9(-1.3%)、03_sort -6(-0.3%)、h-8 -3(-0.2%) +- **退化**:无 +- **功能测试**:100/100 functional 通过,30/31 h_functional 通过(1 个预存故障 30_many_dimensions) +- **已知局限**:仅缓存 x13 上的 ADRP;LoadGlobalAddr 使用其他寄存器时不参与缓存;同一基本块内优化最有效 diff --git a/指令数基线.md b/指令数基线.md index 7238b973..babfd964 100644 --- a/指令数基线.md +++ b/指令数基线.md @@ -23,21 +23,21 @@ | performance/01_mm1 | 309 | | performance/01_mm2 | 309 | | performance/01_mm3 | 309 | -| performance/03_sort1 | 641 | -| performance/03_sort2 | 641 | -| 
performance/03_sort3 | 641 | -| performance/conv2d-1 | 656 | -| performance/conv2d-2 | 656 | -| performance/conv2d-3 | 656 | +| performance/03_sort1 | 639 | +| performance/03_sort2 | 639 | +| performance/03_sort3 | 639 | +| performance/conv2d-1 | 649 | +| performance/conv2d-2 | 649 | +| performance/conv2d-3 | 649 | | performance/crc1 | 279 | | performance/crc2 | 279 | | performance/crc3 | 279 | -| performance/crypto-1 | 1919 | -| performance/crypto-2 | 1919 | -| performance/crypto-3 | 1919 | -| performance/fft0 | 595 | -| performance/fft1 | 595 | -| performance/fft2 | 595 | +| performance/crypto-1 | 1910 | +| performance/crypto-2 | 1910 | +| performance/crypto-3 | 1910 | +| performance/fft0 | 591 | +| performance/fft1 | 591 | +| performance/fft2 | 591 | | performance/h-1-01 | 157 | | performance/h-1-02 | 157 | | performance/h-1-03 | 157 | @@ -50,15 +50,15 @@ | performance/h-5-01 | 338 | | performance/h-5-02 | 338 | | performance/h-5-03 | 338 | -| performance/h-8-01 | 411 | -| performance/h-8-02 | 411 | -| performance/h-8-03 | 411 | -| performance/h-9-01 | 224 | -| performance/h-9-02 | 224 | -| performance/h-9-03 | 224 | -| performance/huffman-01 | 790 | -| performance/huffman-02 | 790 | -| performance/huffman-03 | 790 | +| performance/h-8-01 | 410 | +| performance/h-8-02 | 410 | +| performance/h-8-03 | 410 | +| performance/h-9-01 | 221 | +| performance/h-9-02 | 221 | +| performance/h-9-03 | 221 | +| performance/huffman-01 | 787 | +| performance/huffman-02 | 787 | +| performance/huffman-03 | 787 | | performance/knapsack_naive-1 | 167 | | performance/knapsack_naive-2 | 167 | | performance/knapsack_naive-3 | 167 | @@ -71,9 +71,9 @@ | performance/optimization_scheduling1 | 116 | | performance/optimization_scheduling2 | 116 | | performance/optimization_scheduling3 | 116 | -| performance/shuffle0 | 471 | -| performance/shuffle1 | 471 | -| performance/shuffle2 | 471 | +| performance/shuffle0 | 455 | +| performance/shuffle1 | 455 | +| performance/shuffle2 | 455 | | 
performance/sl1 | 247 | | performance/sl2 | 247 | | performance/sl3 | 247 |