Skip to content

Commit 012278b

Browse files
authored
Local heap optimizations on Arm64 (dotnet#64481)
# Local heap optimizations on Arm64 1. When not required to zero the allocated space for local heap (for sizes up to 64 bytes) - do not emit zeroing sequence. Instead do stack probing and adjust stack pointer: ```diff - stp xzr, xzr, [sp,#-16]! - stp xzr, xzr, [sp,#-16]! - stp xzr, xzr, [sp,#-16]! - stp xzr, xzr, [sp,#-16]! + ldr wzr, [sp],#-64 ``` 2. For sizes less than one `PAGE_SIZE` use `ldr wzr, [sp], #-amount` that does probing at `[sp]` and allocates the space at the same time. This saves one instruction for such local heap allocations: ```diff - ldr wzr, [sp] - sub sp, sp, #208 + ldr wzr, [sp],#-208 ``` Use `ldp tmpReg, xzr, [sp], #-amount` when the offset is not encodable by post-index variant of `ldr`: ```diff - ldr wzr, [sp] - sub sp, sp, #512 + ldp x0, xzr, [sp],#-512 ``` 3. Allow non-loop zeroing (i.e. unrolled sequence) for sizes up to 128 bytes (i.e. up to `LCLHEAP_UNROLL_LIMIT`). This frees up two internal integer registers for such cases: ```diff - mov w11, #128 - ;; bbWeight=0.50 PerfScore 0.25 -G_M44913_IG19: ; gcrefRegs=00F9 {x0 x3 x4 x5 x6 x7}, byrefRegs=0000 {}, byref, isz stp xzr, xzr, [sp,#-16]! - subs x11, x11, #16 - bne G_M44913_IG19 + stp xzr, xzr, [sp,#-112]! + stp xzr, xzr, [sp,#16] + stp xzr, xzr, [sp,#32] + stp xzr, xzr, [sp,#48] + stp xzr, xzr, [sp,#64] + stp xzr, xzr, [sp,#80] + stp xzr, xzr, [sp,#96] ``` 4. Do zeroing in ascending order of the effective address: ```diff - mov w7, #96 -G_M49279_IG13: stp xzr, xzr, [sp,#-16]! - subs x7, x7, #16 - bne G_M49279_IG13 + stp xzr, xzr, [sp,#-80]! + stp xzr, xzr, [sp,#16] + stp xzr, xzr, [sp,#32] + stp xzr, xzr, [sp,#48] + stp xzr, xzr, [sp,#64] ``` In the example, the zeroing is done at `[initialSp-16], [initialSp-96], [initialSp-80], [initialSp-64], [initialSp-48], [initialSp-32]` addresses. The idea here is to allow a CPU to detect the sequential `memset` to `0` pattern and switch into write streaming mode.
1 parent b11469f commit 012278b

File tree

3 files changed

+64
-24
lines changed

3 files changed

+64
-24
lines changed

src/coreclr/jit/codegenarm64.cpp

Lines changed: 58 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2311,34 +2311,76 @@ void CodeGen::genLclHeap(GenTree* tree)
23112311
// We should reach here only for non-zero, constant size allocations.
23122312
assert(amount > 0);
23132313

2314+
const int storePairRegsWritesBytes = 2 * REGSIZE_BYTES;
2315+
23142316
// For small allocations we will generate up to four stp instructions, to zero 16 to 64 bytes.
2315-
static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
2316-
assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time
2317-
size_t stpCount = amount / (REGSIZE_BYTES * 2);
2318-
if (stpCount <= 4)
2317+
static_assert_no_msg(STACK_ALIGN == storePairRegsWritesBytes);
2318+
assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time
2319+
2320+
if (compiler->info.compInitMem)
23192321
{
2320-
while (stpCount != 0)
2322+
if (amount <= LCLHEAP_UNROLL_LIMIT)
23212323
{
2322-
// We can use pre-indexed addressing.
2323-
// stp ZR, ZR, [SP, #-16]! // STACK_ALIGN is 16
2324-
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
2325-
stpCount -= 1;
2326-
}
2324+
// The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
2325+
// stp xzr, xzr, [sp, #-16]!
2326+
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -storePairRegsWritesBytes,
2327+
INS_OPTS_PRE_INDEX);
23272328

2328-
lastTouchDelta = 0;
2329+
if (amount > storePairRegsWritesBytes)
2330+
{
2331+
// The following sets SP to its final value and zeroes the first 16 bytes of the allocated space.
2332+
// stp xzr, xzr, [sp, #-amount+16]!
2333+
const ssize_t finalSpDelta = (ssize_t)amount - storePairRegsWritesBytes;
2334+
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -finalSpDelta,
2335+
INS_OPTS_PRE_INDEX);
2336+
2337+
// The following zeroes the remaining space in [finalSp+16, initialSp-16) interval
2338+
// using a sequence of stp instruction with unsigned offset.
2339+
for (ssize_t offset = storePairRegsWritesBytes; offset < finalSpDelta;
2340+
offset += storePairRegsWritesBytes)
2341+
{
2342+
// stp xzr, xzr, [sp, #offset]
2343+
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, offset);
2344+
}
2345+
}
23292346

2330-
goto ALLOC_DONE;
2347+
lastTouchDelta = 0;
2348+
2349+
goto ALLOC_DONE;
2350+
}
23312351
}
2332-
else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <=
2352+
else if (amount < compiler->eeGetPageSize()) // must be < not <=
23332353
{
23342354
// Since the size is less than a page, simply adjust the SP value.
23352355
// The SP might already be in the guard page, so we must touch it BEFORE
23362356
// the alloc, not after.
23372357

2338-
// ldr wz, [SP, #0]
2339-
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SP, 0);
2358+
// Note that we check against the lower boundary of the post-index immediate range [-256, 256)
2359+
// since the offset is -amount.
2360+
const bool canEncodeLoadRegPostIndexOffset = amount <= 256;
23402361

2341-
genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
2362+
if (canEncodeLoadRegPostIndexOffset)
2363+
{
2364+
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, -(ssize_t)amount,
2365+
INS_OPTS_POST_INDEX);
2366+
}
2367+
else if (emitter::canEncodeLoadOrStorePairOffset(-(ssize_t)amount, EA_8BYTE))
2368+
{
2369+
// The following probes the page and allocates the local heap.
2370+
// ldp tmpReg, xzr, [sp], #-amount
2371+
// Note that we cannot use ldp xzr, xzr since
2372+
// the behaviour of ldp where two source registers are the same is unpredictable.
2373+
const regNumber tmpReg = targetReg;
2374+
GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, REG_ZR, REG_SPBASE, -(ssize_t)amount,
2375+
INS_OPTS_POST_INDEX);
2376+
}
2377+
else
2378+
{
2379+
// ldr wzr, [sp]
2380+
// sub sp, sp, #amount
2381+
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, amount);
2382+
genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
2383+
}
23422384

23432385
lastTouchDelta = amount;
23442386

src/coreclr/jit/lsraarm64.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -543,14 +543,14 @@ int LinearScan::BuildNode(GenTree* tree)
543543
{
544544
assert(dstCount == 1);
545545

546-
// Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
546+
// Need a variable number of temp regs (see genLclHeap() in codegenarm64.cpp):
547547
// Here '-' means don't care.
548548
//
549549
// Size? Init Memory? # temp regs
550550
// 0 - 0
551-
// const and <=6 ptr words - 0
551+
// const and <=UnrollLimit - 0
552552
// const and <PageSize No 0
553-
// >6 ptr words Yes 0
553+
// >UnrollLimit Yes 0
554554
// Non-const Yes 0
555555
// Non-const No 2
556556
//
@@ -569,12 +569,9 @@ int LinearScan::BuildNode(GenTree* tree)
569569
// Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
570570
// This should also help in debugging as we can examine the original size specified with
571571
// localloc.
572-
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
573-
size_t stpCount = sizeVal / (REGSIZE_BYTES * 2);
572+
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
574573

575-
// For small allocations up to 4 'stp' instructions (i.e. 16 to 64 bytes of localloc)
576-
//
577-
if (stpCount <= 4)
574+
if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
578575
{
579576
// Need no internal registers
580577
}

src/coreclr/jit/targetarm64.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#define CPBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
1616
#define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk
1717
#define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
18+
#define LCLHEAP_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)
1819

1920
#ifdef FEATURE_SIMD
2021
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned

0 commit comments

Comments
 (0)