perf(backend): 消除连续全局变量访问的冗余 ADRP

AsmPrinter 添加 ADRP 缓存,同符号连续访问时跳过重复的页面地址装载。
x13 被非全局访问路径使用时失效缓存;基本块入口重置。

shuffle -48, crypto -27, conv2d -21, fft -12, huffman -9, h-9 -9,
03_sort -6, h-8 -3。总计 -135 条,零退化。
lzk
lzkk 5 days ago
parent acdac5391d
commit 854168fb4e

@ -256,10 +256,20 @@ namespace mir
}
}
// ADRP cache: avoids emitting ADRP again when the same global variable is
// accessed consecutively.
// Assembly symbol recorded the last time an ADRP was emitted into the
// printer's scratch register by the global load/store path.
std::string g_cached_adrp_symbol;
// True while g_cached_adrp_symbol may be reused to skip an ADRP; it is set
// after an ADRP is emitted and cleared whenever the cache is invalidated.
bool g_adrp_cache_valid = false;
// Marks the ADRP cache as stale so that the next global access emits a fresh
// ADRP. Callers invoke this before emitting code that overwrites x13
// (movz/movk sequences) and before call instructions. Only the validity flag
// is cleared; the cached symbol string is left untouched.
void InvalidateAdrpCache()
{
g_adrp_cache_valid = false;
}
void EmitStackAdjust(const char *op, int amount, std::ostream &os)
{
if (amount > 12285)
{
InvalidateAdrpCache();
os << " movz x13, #" << (amount & 0xFFFF) << "\n";
if ((amount >> 16) != 0)
os << " movk x13, #" << ((amount >> 16) & 0xFFFF) << ", lsl #16\n";
@ -287,6 +297,7 @@ namespace mir
{
if (offset > 12285)
{
InvalidateAdrpCache();
os << " movz x13, #" << (offset & 0xFFFF) << "\n";
if ((offset >> 16) != 0)
os << " movk x13, #" << ((offset >> 16) & 0xFFFF) << ", lsl #16\n";
@ -297,6 +308,7 @@ namespace mir
if (offset < -12285)
{
int abs_off = -offset;
InvalidateAdrpCache();
os << " movz x13, #" << (abs_off & 0xFFFF) << "\n";
if ((abs_off >> 16) != 0)
os << " movk x13, #" << ((abs_off >> 16) & 0xFFFF) << ", lsl #16\n";
@ -365,7 +377,17 @@ namespace mir
const std::string asm_symbol = NormalizeAsmSymbol(symbol);
const PhysReg scratch_xreg = PrinterScratchXReg();
os << " adrp " << PhysRegName(scratch_xreg) << ", " << asm_symbol << "\n";
if (g_adrp_cache_valid && g_cached_adrp_symbol == asm_symbol)
{
// x13 已持有该全局变量的页面地址,跳过 ADRP
}
else
{
os << " adrp " << PhysRegName(scratch_xreg) << ", " << asm_symbol << "\n";
g_cached_adrp_symbol = asm_symbol;
g_adrp_cache_valid = true;
}
os << " " << (opcode == Opcode::LoadGlobal ? "ldr " : "str ");
PrintOperand(reg, os);
os << ", [" << PhysRegName(scratch_xreg) << ", #:lo12:" << asm_symbol << "]\n";
@ -823,6 +845,10 @@ namespace mir
}
return;
case Opcode::Call:
InvalidateAdrpCache();
// 不 break落到 default 让泛型打印机输出 bl 指令
default:
break;
}
@ -878,6 +904,7 @@ namespace mir
void PrintAsm(const MachineFunction &function, std::ostream &os)
{
g_adrp_cache_valid = false;
const std::string asm_name = NormalizeAsmSymbol(function.GetName());
os << " .text\n";
@ -893,6 +920,8 @@ namespace mir
}
const auto &block = *block_ptr;
// 每个基本块重置 ADRP 缓存——跨块时 x13 可能已被 call/clobber 破坏
g_adrp_cache_valid = false;
PrintBlockLabelRef(function, block.GetLabelId(), os);
os << ":\n";

@ -57,3 +57,15 @@
- **退化**:无
- **功能测试**:100/100 functional 通过,30/31 h_functional 通过(1 个预存故障:30_many_dimensions)
- **已知局限**:仅修复 EmitLargeImmediate;EmitStackAdjust/EmitAddressFromBase 中的 movz 模式仍有同样问题,可后续统一
---
## 2026-05-25 | ADRP 冗余消除
- **类型**:后端(AsmPrinter)
- **假设**:连续访问同一全局变量时,x13 已持有页面地址,后续 ADRP 冗余。例如 `adrp x13, k; str w8, [x13, :lo12:k]; adrp x13, k` 中第二个 ADRP 多余
- **实现**:AsmPrinter 添加 ADRP 缓存(g_cached_adrp_symbol + g_adrp_cache_valid)。PrintGlobalAccess 检测同符号命中时跳过 ADRP。EmitStackAdjust/EmitAddressFromBase 使用 x13 时失效缓存。Call 指令失效缓存(x13 为 caller-saved)。每个基本块入口重置缓存(跨块时 call/clobber 不确定)
- **指令数效果**:减少 135 条:shuffle -48(-3.4%)、crypto -27(-1.4%)、conv2d -21(-3.2%)、fft -12(-2.0%)、huffman -9(-1.1%)、h-9 -9(-4.0%)、03_sort -6(-0.9%)、h-8 -3(-0.7%)
- **退化**:无
- **功能测试**:100/100 functional 通过,30/31 h_functional 通过(1 个预存故障:30_many_dimensions)
- **已知局限**:仅缓存 x13 上的 ADRP,LoadGlobalAddr 使用其他寄存器时不参与缓存;同一基本块内优化最有效

@ -23,21 +23,21 @@
| performance/01_mm1 | 309 |
| performance/01_mm2 | 309 |
| performance/01_mm3 | 309 |
| performance/03_sort1 | 641 |
| performance/03_sort2 | 641 |
| performance/03_sort3 | 641 |
| performance/conv2d-1 | 656 |
| performance/conv2d-2 | 656 |
| performance/conv2d-3 | 656 |
| performance/03_sort1 | 639 |
| performance/03_sort2 | 639 |
| performance/03_sort3 | 639 |
| performance/conv2d-1 | 649 |
| performance/conv2d-2 | 649 |
| performance/conv2d-3 | 649 |
| performance/crc1 | 279 |
| performance/crc2 | 279 |
| performance/crc3 | 279 |
| performance/crypto-1 | 1919 |
| performance/crypto-2 | 1919 |
| performance/crypto-3 | 1919 |
| performance/fft0 | 595 |
| performance/fft1 | 595 |
| performance/fft2 | 595 |
| performance/crypto-1 | 1910 |
| performance/crypto-2 | 1910 |
| performance/crypto-3 | 1910 |
| performance/fft0 | 591 |
| performance/fft1 | 591 |
| performance/fft2 | 591 |
| performance/h-1-01 | 157 |
| performance/h-1-02 | 157 |
| performance/h-1-03 | 157 |
@ -50,15 +50,15 @@
| performance/h-5-01 | 338 |
| performance/h-5-02 | 338 |
| performance/h-5-03 | 338 |
| performance/h-8-01 | 411 |
| performance/h-8-02 | 411 |
| performance/h-8-03 | 411 |
| performance/h-9-01 | 224 |
| performance/h-9-02 | 224 |
| performance/h-9-03 | 224 |
| performance/huffman-01 | 790 |
| performance/huffman-02 | 790 |
| performance/huffman-03 | 790 |
| performance/h-8-01 | 410 |
| performance/h-8-02 | 410 |
| performance/h-8-03 | 410 |
| performance/h-9-01 | 221 |
| performance/h-9-02 | 221 |
| performance/h-9-03 | 221 |
| performance/huffman-01 | 787 |
| performance/huffman-02 | 787 |
| performance/huffman-03 | 787 |
| performance/knapsack_naive-1 | 167 |
| performance/knapsack_naive-2 | 167 |
| performance/knapsack_naive-3 | 167 |
@ -71,9 +71,9 @@
| performance/optimization_scheduling1 | 116 |
| performance/optimization_scheduling2 | 116 |
| performance/optimization_scheduling3 | 116 |
| performance/shuffle0 | 471 |
| performance/shuffle1 | 471 |
| performance/shuffle2 | 471 |
| performance/shuffle0 | 455 |
| performance/shuffle1 | 455 |
| performance/shuffle2 | 455 |
| performance/sl1 | 247 |
| performance/sl2 | 247 |
| performance/sl3 | 247 |

Loading…
Cancel
Save