Description
The suboptimal code generation I'm seeing relates to the llvm.masked.load and llvm.masked.store intrinsics when the mask is uniform across the vector but not a constant true or false; that is, cases where each lane either performs the whole masked operation or skips it entirely. This should compile to a single guarded vector operation, but does not.
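For concreteness: when the mask operand is a splat of a single i1, the masked load is equivalent to a scalar branch around an ordinary vector load that falls back to the passthru value. A minimal hand-written sketch of that equivalence (illustrative only, not part of the reproducer below):
; Illustrative only: a splat-mask masked load rewritten as a scalar branch
; around a plain vector load, falling through to the passthru value.
define <4 x i32> @splat_mask_load_equiv(ptr addrspace(1) %p, i1 %c, <4 x i32> %passthru) {
entry:
  br i1 %c, label %do.load, label %merge
do.load:
  %v = load <4 x i32>, ptr addrspace(1) %p, align 16
  br label %merge
merge:
  %res = phi <4 x i32> [ %v, %do.load ], [ %passthru, %entry ]
  ret <4 x i32> %res
}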
That is, given the input IR
; ModuleID = './masked-ops.ll'
source_filename = "./masked-ops.ll"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define protected amdgpu_kernel void @masked_intrinsics(ptr addrspace(1) nocapture noundef readonly align 16 %x, ptr addrspace(1) nocapture noundef writeonly align 16 %y) local_unnamed_addr #0 !reqd_work_group_size !0 {
entry:
%id = tail call noundef range(i32 0, 128) i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext nneg i32 %id to i64
%load.cond = icmp ult i32 %id, 16
%load.cond.singleton.vec = insertelement <4 x i1> poison, i1 %load.cond, i64 0
%load.cond.vec = shufflevector <4 x i1> %load.cond.singleton.vec, <4 x i1> poison, <4 x i32> zeroinitializer
%load.ptr = getelementptr <4 x i32>, ptr addrspace(1) %x, i64 %id.zext
%value = tail call <4 x i32> @llvm.masked.load.v4i32.p1(ptr addrspace(1) %load.ptr, i32 16, <4 x i1> %load.cond.vec, <4 x i32> zeroinitializer)
%store.cond = icmp ult i32 %id, 32
%store.cond.singleton.vec = insertelement <4 x i1> poison, i1 %store.cond, i64 0
%store.cond.vec = shufflevector <4 x i1> %store.cond.singleton.vec, <4 x i1> poison, <4 x i32> zeroinitializer
%store.ptr = getelementptr <4 x i32>, ptr addrspace(1) %y, i64 %id.zext
tail call void @llvm.masked.store.v4i32.p1(<4 x i32> %value, ptr addrspace(1) %store.ptr, i32 16, <4 x i1> %store.cond.vec)
ret void
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
declare <4 x i32> @llvm.masked.load.v4i32.p1(ptr addrspace(1) nocapture, i32 immarg, <4 x i1>, <4 x i32>) #2
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write)
declare void @llvm.masked.store.v4i32.p1(<4 x i32>, ptr addrspace(1) nocapture, i32 immarg, <4 x i1>) #3
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) }
!0 = !{i32 128, i32 1, i32 1}
I get assembly (at -O3) like
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx90a"
.amdhsa_code_object_version 5
.protected masked_intrinsics ; -- Begin function masked_intrinsics
.globl masked_intrinsics
.p2align 8
.type masked_intrinsics,@function
masked_intrinsics: ; @masked_intrinsics
; %bb.0: ; %entry
v_cmp_gt_u32_e32 vcc, 16, v0
s_load_dwordx4 s[0:3], s[4:5], 0x0
v_cndmask_b32_e64 v1, 0, 1, vcc
v_lshlrev_b16_e32 v2, 1, v1
v_or_b32_e32 v1, v1, v2
v_lshlrev_b16_e32 v2, 2, v1
v_and_b32_e32 v1, 3, v1
s_mov_b32 s4, 0
v_or_b32_e32 v8, v1, v2
v_lshlrev_b32_e32 v1, 4, v0
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v2, s1
v_add_co_u32_e64 v6, s[0:1], s0, v1
s_mov_b32 s5, s4
v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1]
s_mov_b32 s6, s4
s_mov_b32 s7, s4
v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1]
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_2
; %bb.1: ; %cond.load
global_load_dword v2, v[6:7], off
v_mov_b32_e32 v3, 0
v_mov_b32_e32 v4, v3
v_mov_b32_e32 v5, v3
.LBB0_2: ; %else
s_or_b64 exec, exec, s[0:1]
v_lshrrev_b16_e32 v9, 1, v8
v_and_b32_e32 v9, 1, v9
v_cmp_eq_u32_e32 vcc, 1, v9
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_4
; %bb.3: ; %cond.load1
global_load_dword v3, v[6:7], off offset:4
.LBB0_4: ; %else2
s_or_b64 exec, exec, s[0:1]
v_lshrrev_b16_e32 v9, 2, v8
v_and_b32_e32 v9, 1, v9
v_cmp_eq_u32_e32 vcc, 1, v9
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_6
; %bb.5: ; %cond.load4
global_load_dword v4, v[6:7], off offset:8
.LBB0_6: ; %else5
s_or_b64 exec, exec, s[0:1]
v_lshrrev_b16_e32 v8, 3, v8
v_and_b32_e32 v8, 1, v8
v_cmp_eq_u32_e32 vcc, 1, v8
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_8
; %bb.7: ; %cond.load7
global_load_dword v5, v[6:7], off offset:12
.LBB0_8: ; %else8
s_or_b64 exec, exec, s[0:1]
v_cmp_gt_u32_e32 vcc, 32, v0
v_cndmask_b32_e64 v0, 0, 1, vcc
v_lshlrev_b16_e32 v6, 1, v0
v_or_b32_e32 v0, v0, v6
v_lshlrev_b16_e32 v6, 2, v0
v_and_b32_e32 v0, 3, v0
v_or_b32_e32 v6, v0, v6
v_mov_b32_e32 v7, s3
v_add_co_u32_e64 v0, s[0:1], s2, v1
v_addc_co_u32_e64 v1, s[0:1], 0, v7, s[0:1]
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_10
; %bb.9: ; %cond.store
s_waitcnt vmcnt(0)
global_store_dword v[0:1], v2, off
.LBB0_10: ; %else11
s_or_b64 exec, exec, s[0:1]
s_waitcnt vmcnt(0)
v_lshrrev_b16_e32 v2, 1, v6
v_and_b32_e32 v2, 1, v2
v_cmp_eq_u32_e32 vcc, 1, v2
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_12
; %bb.11: ; %cond.store12
global_store_dword v[0:1], v3, off offset:4
.LBB0_12: ; %else13
s_or_b64 exec, exec, s[0:1]
v_lshrrev_b16_e32 v2, 2, v6
v_and_b32_e32 v2, 1, v2
v_cmp_eq_u32_e32 vcc, 1, v2
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_14
; %bb.13: ; %cond.store14
global_store_dword v[0:1], v4, off offset:8
.LBB0_14: ; %else15
s_or_b64 exec, exec, s[0:1]
v_lshrrev_b16_e32 v2, 3, v6
v_and_b32_e32 v2, 1, v2
v_cmp_eq_u32_e32 vcc, 1, v2
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_16
; %bb.15: ; %cond.store16
global_store_dword v[0:1], v5, off offset:12
.LBB0_16: ; %else17
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel masked_intrinsics
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 16
.amdhsa_user_sgpr_count 6
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_flat_scratch_init 0
.amdhsa_user_sgpr_kernarg_preload_length 0
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 0
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 10
.amdhsa_next_free_sgpr 8
.amdhsa_accum_offset 12
.amdhsa_reserve_vcc 1
.amdhsa_reserve_flat_scratch 0
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size masked_intrinsics, .Lfunc_end0-masked_intrinsics
; -- End function
.section .AMDGPU.csdata,"",@progbits
; Kernel info:
; codeLenInByte = 404
; NumSgprs: 12
; NumVgprs: 10
; NumAgprs: 0
; TotalNumVgprs: 10
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 12
; NumVGPRsForWavesPerEU: 10
; AccumOffset: 12
; Occupancy: 8
; WaveLimiterHint : 0
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2
; COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0
.text
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .address_space: global
.name: x
.offset: 0
.size: 8
.value_kind: global_buffer
- .address_space: global
.name: !str y
.offset: 8
.size: 8
.value_kind: global_buffer
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 16
.max_flat_workgroup_size: 128
.name: masked_intrinsics
.private_segment_fixed_size: 0
.reqd_workgroup_size:
- 128
- 1
- 1
.sgpr_count: 12
.sgpr_spill_count: 0
.symbol: masked_intrinsics.kd
.uniform_work_group_size: 1
.uses_dynamic_stack: false
.vgpr_count: 10
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx90a
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
(-global-isel acts similarly)
Expected code
I expected that, with optimizations on, there would be one conditional guarding a global_load_dwordx4 and one guarding a global_store_dwordx4. Instead, neither the masked load nor the masked store is emitted as a vector memory operation; each is scalarized into four per-lane conditional dword accesses, as in the assembly above. A sketch of the structure I expected follows.
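For reference, this is roughly the IR-level structure I expected the lowering to behave like (hand-written sketch, not compiler output; the function name and block names are illustrative): each splat mask becomes one scalar branch guarding a full <4 x i32> access, which should then select to global_load_dwordx4 / global_store_dwordx4.
declare i32 @llvm.amdgcn.workitem.id.x()

; Hand-written sketch of the expected structure: one branch per uniform mask,
; each guarding a whole-vector load or store.
define protected amdgpu_kernel void @masked_intrinsics_expected(ptr addrspace(1) %x, ptr addrspace(1) %y) {
entry:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %id.zext = zext i32 %id to i64
  %load.cond = icmp ult i32 %id, 16
  %load.ptr = getelementptr <4 x i32>, ptr addrspace(1) %x, i64 %id.zext
  br i1 %load.cond, label %vec.load, label %load.done
vec.load:
  ; whole-vector load, expected to become global_load_dwordx4
  %loaded = load <4 x i32>, ptr addrspace(1) %load.ptr, align 16
  br label %load.done
load.done:
  %value = phi <4 x i32> [ %loaded, %vec.load ], [ zeroinitializer, %entry ]
  %store.cond = icmp ult i32 %id, 32
  %store.ptr = getelementptr <4 x i32>, ptr addrspace(1) %y, i64 %id.zext
  br i1 %store.cond, label %vec.store, label %exit
vec.store:
  ; whole-vector store, expected to become global_store_dwordx4
  store <4 x i32> %value, ptr addrspace(1) %store.ptr, align 16
  br label %exit
exit:
  ret void
}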