@@ -444,14 +444,6 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
444444; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
445445; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
446446; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
447- ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
448- ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
449- ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
450- ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
451- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
452- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
453- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
454- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
455447; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
456448; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
457449; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
@@ -464,20 +456,15 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
464456; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
465457; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
466458; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
467- ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:240
468- ; GISEL-GFX942-NEXT: s_nop 0
459+ ; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
460+ ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
461+ ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
462+ ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
463+ ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
464+ ; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
469465; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
470466; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
471- ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
472- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
473- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
474- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
475- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
476- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
477- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
478- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
479- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
480- ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:32
467+ ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
481468; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
482469; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
483470; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
@@ -490,10 +477,8 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
490477; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
491478; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
492479; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
493- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
494- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
495- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
496- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
480+ ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
481+ ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
497482; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
498483; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1
499484; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
@@ -822,14 +807,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
822807; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen
823808; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
824809; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
825- ; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
826- ; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
827- ; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
828- ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
829- ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
830- ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
831- ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
832- ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
833810; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
834811; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
835812; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
@@ -842,20 +819,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
842819; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
843820; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
844821; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
845- ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:240
846- ; SDAG-GFX942-NEXT: s_nop 0
822+ ; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
823+ ; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
824+ ; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
825+ ; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
826+ ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
827+ ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
828+ ; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
847829; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
848830; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
849- ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2)
850- ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
851- ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
852- ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
853- ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
854- ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
855- ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
856- ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
857- ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
858- ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:32
831+ ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
859832; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
860833; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
861834; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
@@ -868,10 +841,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
868841; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
869842; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
870843; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
871- ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
872- ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
873- ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
874- ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
844+ ; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
845+ ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
875846; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
876847; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
877848; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
@@ -993,16 +964,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
993964; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
994965; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
995966; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
996- ; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
997- ; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
998- ; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
999- ; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
1000- ; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
1001- ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
1002- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
1003- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
1004- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
1005- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
1006967; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
1007968; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
1008969; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
@@ -1015,20 +976,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
1015976; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
1016977; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
1017978; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
1018- ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240
1019- ; GISEL-GFX942-NEXT: s_nop 0
979+ ; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
980+ ; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
981+ ; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
982+ ; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
983+ ; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
984+ ; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
985+ ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
986+ ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
987+ ; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
1020988; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
1021989; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
1022- ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
1023- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
1024- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
1025- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
1026- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
1027- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
1028- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
1029- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
1030- ; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
1031- ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32
990+ ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
1032991; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
1033992; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
1034993; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
@@ -1041,10 +1000,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
10411000; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
10421001; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
10431002; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
1044- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
1045- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
1046- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
1047- ; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
1003+ ; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
1004+ ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
10481005; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
10491006; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
10501007; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
0 commit comments