rocr: Remove gfx940 and gfx941 support
dayatsin-amd committed Feb 19, 2025
1 parent 806ddfc · commit 13c591d
Showing 14 changed files with 28 additions and 536 deletions.
2 changes: 1 addition & 1 deletion rocrtst/suites/test_common/CMakeLists.txt
@@ -110,7 +110,7 @@ else()
endif()

set(DEFAULT_TARGETS "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810"
- "gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx950"
+ "gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx942;gfx950"
"gfx1010;gfx1011;gfx1012;gfx1013;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
"gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")

382 changes: 0 additions & 382 deletions runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp
@@ -492,388 +492,6 @@ static std::string& kBlitKernelSource() {
L_FILL_PHASE_2_DONE:
s_endpgm
end
shader CopyAligned_940
type(CS)
user_sgpr_count(2)
sgpr_count(32)
vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth))
// Retrieve kernel arguments.
s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0)
s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10)
s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20)
s_load_dwordx4 s[16:19], s[0:1], s_load_dword_offset(0x30)
s_load_dwordx4 s[20:23], s[0:1], s_load_dword_offset(0x40)
s_load_dword s24, s[0:1], s_load_dword_offset(0x50)
s_waitcnt lgkmcnt(0)
// Compute workitem id.
s_lshl_b32 s2, s2, 0x6
v_add_co_u32 v0, vcc, s2, v0
// =====================================================
// Phase 1: Byte copy up to 0x100 destination alignment.
// =====================================================
// Compute phase source address.
v_mov_b32 v3, s5
v_add_co_u32 v2, vcc, v0, s4
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
// Compute phase destination address.
v_mov_b32 v5, s7
v_add_co_u32 v4, vcc, v0, s6
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
L_COPY_ALIGNED_PHASE_1_LOOP:
// Mask off lanes (or branch out) after phase end.
v_cmp_lt_u64 vcc, v[2:3], s[8:9]
s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE
s_and_b64 exec, exec, vcc
// Load from/advance the source address.
flat_load_ubyte v1, v[2:3] sc0:1 sc1:1
s_waitcnt vmcnt(0)
v_add_co_u32 v2, vcc, v2, s24
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
// Write to/advance the destination address.
flat_store_byte v[4:5], v1 sc0:1 sc1:1
v_add_co_u32 v4, vcc, v4, s24
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
// Repeat until branched out.
s_branch L_COPY_ALIGNED_PHASE_1_LOOP
L_COPY_ALIGNED_PHASE_1_DONE:
// Restore EXEC mask for all lanes.
s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF
// ========================================================
// Phase 2: Unrolled dword[x4] copy up to last whole block.
// ========================================================
// Compute unrolled dword[x4] stride across all threads.
if kCopyAlignedVecWidth == 4
s_lshl_b32 s25, s24, 0x4
else
s_lshl_b32 s25, s24, 0x2
end
// Compute phase source address.
if kCopyAlignedVecWidth == 4
v_lshlrev_b32 v1, 0x4, v0
else
v_lshlrev_b32 v1, 0x2, v0
end
v_mov_b32 v3, s9
v_add_co_u32 v2, vcc, v1, s8
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
// Compute phase destination address.
v_mov_b32 v5, s11
v_add_co_u32 v4, vcc, v1, s10
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
L_COPY_ALIGNED_PHASE_2_LOOP:
// Branch out after phase end.
v_cmp_lt_u64 vcc, v[2:3], s[12:13]
s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE
// Load from/advance the source address.
for var i = 0; i < kCopyAlignedUnroll; i ++
if kCopyAlignedVecWidth == 4
flat_load_dwordx4 v[8 + (i * 4)], v[2:3] sc0:1 sc1:1
else
flat_load_dword v[8 + i], v[2:3] sc0:1 sc1:1
end
v_add_co_u32 v2, vcc, v2, s25
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
end
// Write to/advance the destination address.
s_waitcnt vmcnt(0)
for var i = 0; i < kCopyAlignedUnroll; i ++
if kCopyAlignedVecWidth == 4
flat_store_dwordx4 v[4:5], v[8 + (i * 4)] sc0:1 sc1:1
else
flat_store_dword v[4:5], v[8 + i] sc0:1 sc1:1
end
v_add_co_u32 v4, vcc, v4, s25
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
end
// Repeat until branched out.
s_branch L_COPY_ALIGNED_PHASE_2_LOOP
L_COPY_ALIGNED_PHASE_2_DONE:
// ===========================================
// Phase 3: Dword copy up to last whole dword.
// ===========================================
// Compute dword stride across all threads.
s_lshl_b32 s25, s24, 0x2
// Compute phase source address.
v_lshlrev_b32 v1, 0x2, v0
v_mov_b32 v3, s13
v_add_co_u32 v2, vcc, v1, s12
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
// Compute phase destination address.
v_mov_b32 v5, s15
v_add_co_u32 v4, vcc, v1, s14
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
L_COPY_ALIGNED_PHASE_3_LOOP:
// Mask off lanes (or branch out) after phase end.
v_cmp_lt_u64 vcc, v[2:3], s[16:17]
s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE
s_and_b64 exec, exec, vcc
// Load from/advance the source address.
flat_load_dword v1, v[2:3] sc0:1 sc1:1
v_add_co_u32 v2, vcc, v2, s25
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
s_waitcnt vmcnt(0)
// Write to/advance the destination address.
flat_store_dword v[4:5], v1 sc0:1 sc1:1
v_add_co_u32 v4, vcc, v4, s25
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
// Repeat until branched out.
s_branch L_COPY_ALIGNED_PHASE_3_LOOP
L_COPY_ALIGNED_PHASE_3_DONE:
// Restore EXEC mask for all lanes.
s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF
// =============================
// Phase 4: Byte copy up to end.
// =============================
// Compute phase source address.
v_mov_b32 v3, s17
v_add_co_u32 v2, vcc, v0, s16
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
// Compute phase destination address.
v_mov_b32 v5, s19
v_add_co_u32 v4, vcc, v0, s18
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
// Mask off lanes (or branch out) after phase end.
v_cmp_lt_u64 vcc, v[2:3], s[20:21]
s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE
s_and_b64 exec, exec, vcc
// Load from the source address.
flat_load_ubyte v1, v[2:3] sc0:1 sc1:1
s_waitcnt vmcnt(0)
// Write to the destination address.
flat_store_byte v[4:5], v1 sc0:1 sc1:1
L_COPY_ALIGNED_PHASE_4_DONE:
s_endpgm
end
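For orientation: the CopyAligned_940 shader removed above follows a classic four-phase copy, namely bytes until the destination reaches a 0x100 boundary, unrolled dword[x4] transfers over whole blocks, single dwords up to the last whole dword, then a byte tail. Below is a minimal scalar C++ sketch of that phase structure (a hypothetical helper written for this summary, not code from the runtime; the real kernel distributes the work across lanes and takes precomputed per-phase boundary addresses and a stride as kernel arguments).

#include <cstddef>
#include <cstdint>
#include <cstring>

void copy_aligned_sketch(uint8_t* dst, const uint8_t* src, size_t size) {
  size_t i = 0;
  // Phase 1: byte copy until the destination reaches a 0x100-byte boundary.
  while (i < size && (reinterpret_cast<uintptr_t>(dst + i) & 0xFF) != 0) {
    dst[i] = src[i];
    ++i;
  }
  // Phase 2: wide copy over whole 16-byte blocks. The kernel unrolls
  // kCopyAlignedUnroll vectors of kCopyAlignedVecWidth dwords per lane;
  // a fixed 16 bytes keeps the sketch simple.
  for (; i + 16 <= size; i += 16) {
    std::memcpy(dst + i, src + i, 16);
  }
  // Phase 3: dword copy up to the last whole dword.
  for (; i + 4 <= size; i += 4) {
    std::memcpy(dst + i, src + i, 4);
  }
  // Phase 4: byte copy of the remaining tail.
  for (; i < size; ++i) {
    dst[i] = src[i];
  }
}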
shader CopyMisaligned_940
type(CS)
user_sgpr_count(2)
sgpr_count(23)
vgpr_count(6 + kCopyMisalignedUnroll)
// Retrieve kernel arguments.
s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0)
s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10)
s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20)
s_load_dword s16, s[0:1], s_load_dword_offset(0x30)
s_waitcnt lgkmcnt(0)
// Compute workitem id.
s_lshl_b32 s2, s2, 0x6
v_add_co_u32 v0, vcc, s2, v0
// ===================================================
// Phase 1: Unrolled byte copy up to last whole block.
// ===================================================
// Compute phase source address.
v_mov_b32 v3, s5
v_add_co_u32 v2, vcc, v0, s4
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
// Compute phase destination address.
v_mov_b32 v5, s7
v_add_co_u32 v4, vcc, v0, s6
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
L_COPY_MISALIGNED_PHASE_1_LOOP:
// Branch out after phase end.
v_cmp_lt_u64 vcc, v[2:3], s[8:9]
s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE
// Load from/advance the source address.
for var i = 0; i < kCopyMisalignedUnroll; i ++
flat_load_ubyte v[6 + i], v[2:3] sc0:1 sc1:1
v_add_co_u32 v2, vcc, v2, s16
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
end
// Write to/advance the destination address.
s_waitcnt vmcnt(0)
for var i = 0; i < kCopyMisalignedUnroll; i ++
flat_store_byte v[4:5], v[6 + i] sc0:1 sc1:1
v_add_co_u32 v4, vcc, v4, s16
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
end
// Repeat until branched out.
s_branch L_COPY_MISALIGNED_PHASE_1_LOOP
L_COPY_MISALIGNED_PHASE_1_DONE:
// =============================
// Phase 2: Byte copy up to end.
// =============================
// Compute phase source address.
v_mov_b32 v3, s9
v_add_co_u32 v2, vcc, v0, s8
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
// Compute phase destination address.
v_mov_b32 v5, s11
v_add_co_u32 v4, vcc, v0, s10
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
L_COPY_MISALIGNED_PHASE_2_LOOP:
// Mask off lanes (or branch out) after phase end.
v_cmp_lt_u64 vcc, v[2:3], s[12:13]
s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE
s_and_b64 exec, exec, vcc
// Load from/advance the source address.
flat_load_ubyte v1, v[2:3] sc0:1 sc1:1
v_add_co_u32 v2, vcc, v2, s16
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
s_waitcnt vmcnt(0)
// Write to/advance the destination address.
flat_store_byte v[4:5], v1 sc0:1 sc1:1
v_add_co_u32 v4, vcc, v4, s16
v_addc_co_u32 v5, vcc, v5, 0x0, vcc
// Repeat until branched out.
s_branch L_COPY_MISALIGNED_PHASE_2_LOOP
L_COPY_MISALIGNED_PHASE_2_DONE:
s_endpgm
end
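The CopyMisaligned_940 shader removed above is the fallback when source and destination alignments do not allow wide transfers: it copies bytes only, first in an unrolled loop over whole blocks and then one byte at a time to the end. Sketched the same way (hypothetical helper; the fixed unroll factor stands in for kCopyMisalignedUnroll).

#include <cstddef>
#include <cstdint>

void copy_misaligned_sketch(uint8_t* dst, const uint8_t* src, size_t size) {
  constexpr size_t kUnroll = 4;  // illustrative stand-in for kCopyMisalignedUnroll
  size_t i = 0;
  // Phase 1: unrolled byte copy up to the last whole block.
  for (; i + kUnroll <= size; i += kUnroll) {
    for (size_t j = 0; j < kUnroll; ++j) {
      dst[i + j] = src[i + j];
    }
  }
  // Phase 2: byte copy up to the end.
  for (; i < size; ++i) {
    dst[i] = src[i];
  }
}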
shader Fill_940
type(CS)
user_sgpr_count(2)
sgpr_count(19)
vgpr_count(8)
// Retrieve kernel arguments.
s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0)
s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10)
s_waitcnt lgkmcnt(0)
// Compute workitem id.
s_lshl_b32 s2, s2, 0x6
v_add_co_u32 v0, vcc, s2, v0
// Copy fill pattern into VGPRs.
for var i = 0; i < kFillVecWidth; i ++
v_mov_b32 v[4 + i], s10
end
// ========================================================
// Phase 1: Unrolled dword[x4] fill up to last whole block.
// ========================================================
// Compute unrolled dword[x4] stride across all threads.
if kFillVecWidth == 4
s_lshl_b32 s12, s11, 0x4
else
s_lshl_b32 s12, s11, 0x2
end
// Compute phase destination address.
if kFillVecWidth == 4
v_lshlrev_b32 v1, 0x4, v0
else
v_lshlrev_b32 v1, 0x2, v0
end
v_mov_b32 v3, s5
v_add_co_u32 v2, vcc, v1, s4
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
L_FILL_PHASE_1_LOOP:
// Branch out after phase end.
v_cmp_lt_u64 vcc, v[2:3], s[6:7]
s_cbranch_vccz L_FILL_PHASE_1_DONE
// Write to/advance the destination address.
for var i = 0; i < kFillUnroll; i ++
if kFillVecWidth == 4
flat_store_dwordx4 v[2:3], v[4:7] sc0:1 sc1:1
else
flat_store_dword v[2:3], v4 sc0:1 sc1:1
end
v_add_co_u32 v2, vcc, v2, s12
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
end
// Repeat until branched out.
s_branch L_FILL_PHASE_1_LOOP
L_FILL_PHASE_1_DONE:
// ==============================
// Phase 2: Dword fill up to end.
// ==============================
// Compute dword stride across all threads.
s_lshl_b32 s12, s11, 0x2
// Compute phase destination address.
v_lshlrev_b32 v1, 0x2, v0
v_mov_b32 v3, s7
v_add_co_u32 v2, vcc, v1, s6
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
L_FILL_PHASE_2_LOOP:
// Mask off lanes (or branch out) after phase end.
v_cmp_lt_u64 vcc, v[2:3], s[8:9]
s_cbranch_vccz L_FILL_PHASE_2_DONE
s_and_b64 exec, exec, vcc
// Write to/advance the destination address.
flat_store_dword v[2:3], v4 sc0:1 sc1:1
v_add_co_u32 v2, vcc, v2, s12
v_addc_co_u32 v3, vcc, v3, 0x0, vcc
// Repeat until branched out.
s_branch L_FILL_PHASE_2_LOOP
L_FILL_PHASE_2_DONE:
s_endpgm
end
)");
return kBlitKernelSource_;
}
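Fill_940, the last of the removed shaders, broadcasts a 32-bit pattern into VGPRs and writes it out in two phases: unrolled dword[x4] stores over whole blocks, then single-dword stores to the end (the phases operate at dword granularity, so there is no byte tail). A scalar C++ sketch under the same simplifications (hypothetical helper, not runtime code):

#include <cstddef>
#include <cstdint>

void fill_sketch(uint32_t* dst, uint32_t pattern, size_t num_dwords) {
  size_t i = 0;
  // Phase 1: unrolled dword[x4]-style fill over whole 4-dword blocks.
  for (; i + 4 <= num_dwords; i += 4) {
    dst[i] = dst[i + 1] = dst[i + 2] = dst[i + 3] = pattern;
  }
  // Phase 2: single-dword fill up to the end.
  for (; i < num_dwords; ++i) {
    dst[i] = pattern;
  }
}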
(Diff truncated: the remaining 12 changed files are not shown.)
