Skip to content

Commit 7f0c5b0

Browse files
authored
[AArch64] Fix invalid use of ld1/st1 in stack alloc (#105518)
This patch fixes incorrect usage of the scalar+immediate variant of the ld1/st1 instructions during stack allocation, which was introduced by [c4bac7f](c4bac7f). That earlier commit used ld1/st1 even when the stack offset was outside the immediate range of these instructions, producing invalid assembly. It also used incorrect offsets when emitting ld1/st1.
1 parent 1a12647 commit 7f0c5b0

File tree

5 files changed

+1952
-1139
lines changed

5 files changed

+1952
-1139
lines changed

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3020,6 +3020,7 @@ static void computeCalleeSaveRegisterPairs(
30203020
ByteOffset += StackFillDir * StackHazardSize;
30213021
LastReg = RPI.Reg1;
30223022

3023+
int Scale = RPI.getScale();
30233024
// Add the next reg to the pair if it is in the same register class.
30243025
if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
30253026
Register NextReg = CSI[i + RegInc].getReg();
@@ -3045,9 +3046,14 @@ static void computeCalleeSaveRegisterPairs(
30453046
case RegPairInfo::PPR:
30463047
break;
30473048
case RegPairInfo::ZPR:
3048-
if (AFI->getPredicateRegForFillSpill() != 0)
3049-
if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
3049+
if (AFI->getPredicateRegForFillSpill() != 0 &&
3050+
((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) {
3051+
// Calculate offset of register pair to see if pair instruction can be
3052+
// used.
3053+
int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale;
3054+
if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0))
30503055
RPI.Reg2 = NextReg;
3056+
}
30513057
break;
30523058
case RegPairInfo::VG:
30533059
break;
@@ -3087,7 +3093,6 @@ static void computeCalleeSaveRegisterPairs(
30873093
if (NeedsWinCFI &&
30883094
RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
30893095
RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
3090-
int Scale = RPI.getScale();
30913096

30923097
int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
30933098
assert(OffsetPre % Scale == 0);
@@ -3356,8 +3361,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
33563361
MachineMemOperand::MOStore, Size, Alignment));
33573362
MIB.addReg(PnReg);
33583363
MIB.addReg(AArch64::SP)
3359-
.addImm(RPI.Offset) // [sp, #offset*scale],
3360-
// where factor*scale is implicit
3364+
.addImm(RPI.Offset / 2) // [sp, #imm*2*vscale],
3365+
// where 2*vscale is implicit
33613366
.setMIFlag(MachineInstr::FrameSetup);
33623367
MIB.addMemOperand(MF.getMachineMemOperand(
33633368
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3378,8 +3383,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
33783383
}
33793384
MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
33803385
.addReg(AArch64::SP)
3381-
.addImm(RPI.Offset) // [sp, #offset*scale],
3382-
// where factor*scale is implicit
3386+
.addImm(RPI.Offset) // [sp, #offset*vscale],
3387+
// where factor*vscale is implicit
33833388
.setMIFlag(MachineInstr::FrameSetup);
33843389
MIB.addMemOperand(MF.getMachineMemOperand(
33853390
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3523,8 +3528,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
35233528
MachineMemOperand::MOLoad, Size, Alignment));
35243529
MIB.addReg(PnReg);
35253530
MIB.addReg(AArch64::SP)
3526-
.addImm(RPI.Offset) // [sp, #offset*scale]
3527-
// where factor*scale is implicit
3531+
.addImm(RPI.Offset / 2) // [sp, #imm*2*vscale]
3532+
// where 2*vscale is implicit
35283533
.setMIFlag(MachineInstr::FrameDestroy);
35293534
MIB.addMemOperand(MF.getMachineMemOperand(
35303535
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3541,8 +3546,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
35413546
}
35423547
MIB.addReg(Reg1, getDefRegState(true));
35433548
MIB.addReg(AArch64::SP)
3544-
.addImm(RPI.Offset) // [sp, #offset*scale]
3545-
// where factor*scale is implicit
3549+
.addImm(RPI.Offset) // [sp, #offset*vscale]
3550+
// where factor*vscale is implicit
35463551
.setMIFlag(MachineInstr::FrameDestroy);
35473552
MIB.addMemOperand(MF.getMachineMemOperand(
35483553
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),

llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll

Lines changed: 36 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -332,24 +332,25 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
332332
; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
333333
; CHECK-NEXT: ptrue pn8.b
334334
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
335-
; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
336-
; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
335+
; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
336+
; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
337337
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
338-
; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
339-
; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
338+
; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
339+
; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
340340
; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
341-
; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
342-
; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
341+
; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
342+
; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
343343
; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
344-
; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
344+
; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
345345
; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
346346
; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
347347
; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
348348
; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
349349
; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
350350
; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
351351
; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
352-
; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
352+
; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
353+
; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
353354
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
354355
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
355356
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
@@ -372,15 +373,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
372373
; CHECK-NEXT: addvl sp, sp, #1
373374
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
374375
; CHECK-NEXT: ptrue pn8.b
376+
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
377+
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
378+
; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
379+
; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
380+
; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
381+
; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
382+
; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
383+
; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
384+
; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
375385
; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
376-
; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
377-
; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
378-
; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
379-
; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
380-
; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
381-
; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
382-
; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
383-
; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
384386
; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
385387
; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
386388
; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -427,24 +429,25 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
427429
; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
428430
; FP-CHECK-NEXT: ptrue pn8.b
429431
; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
430-
; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
431-
; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
432+
; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
433+
; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
432434
; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
433-
; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
434-
; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
435+
; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
436+
; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
435437
; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
436-
; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
437-
; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
438+
; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
439+
; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
438440
; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
439-
; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
441+
; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
440442
; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
441443
; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
442444
; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
443445
; FP-CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
444446
; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
445447
; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
446448
; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
447-
; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
449+
; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
450+
; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
448451
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
449452
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
450453
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
@@ -465,15 +468,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
465468
; FP-CHECK-NEXT: .cfi_restore vg
466469
; FP-CHECK-NEXT: addvl sp, sp, #1
467470
; FP-CHECK-NEXT: ptrue pn8.b
471+
; FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
472+
; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
473+
; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
474+
; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
475+
; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
476+
; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
477+
; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
478+
; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
479+
; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
468480
; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
469-
; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
470-
; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
471-
; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
472-
; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
473-
; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
474-
; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
475-
; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
476-
; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
477481
; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
478482
; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
479483
; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload

0 commit comments

Comments
 (0)