[AMDGPU] Masked load/store with a uniform but non-trivial mask don't simplify #104520

Closed
@krzysz00

Description

godbolt link

The suboptimal code generation I'm seeing relates to llvm.masked.load and llvm.masked.store intrinsics where the mask is uniform across the vector but is not a constant true or false - that is, cases where the masked operation will either be performed in its entirety or skipped entirely, depending on a single scalar condition. This should compile to a plain vector operation guarded by that condition, but does not.

That is, given the input IR

; ModuleID = './masked-ops.ll'
source_filename = "./masked-ops.ll"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"

; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define protected amdgpu_kernel void @masked_intrinsics(ptr addrspace(1) nocapture noundef readonly align 16 %x, ptr addrspace(1) nocapture noundef writeonly align 16 %y) local_unnamed_addr #0 !reqd_work_group_size !0 {
entry:
  %id = tail call noundef range(i32 0, 128) i32 @llvm.amdgcn.workitem.id.x()
  %id.zext = zext nneg i32 %id to i64
  %load.cond = icmp ult i32 %id, 16
  %load.cond.singleton.vec = insertelement <4 x i1> poison, i1 %load.cond, i64 0
  %load.cond.vec = shufflevector <4 x i1> %load.cond.singleton.vec, <4 x i1> poison, <4 x i32> zeroinitializer
  %load.ptr = getelementptr <4 x i32>, ptr addrspace(1) %x, i64 %id.zext
  %value = tail call <4 x i32> @llvm.masked.load.v4i32.p1(ptr addrspace(1) %load.ptr, i32 16, <4 x i1> %load.cond.vec, <4 x i32> zeroinitializer)
  %store.cond = icmp ult i32 %id, 32
  %store.cond.singleton.vec = insertelement <4 x i1> poison, i1 %store.cond, i64 0
  %store.cond.vec = shufflevector <4 x i1> %store.cond.singleton.vec, <4 x i1> poison, <4 x i32> zeroinitializer
  %store.ptr = getelementptr <4 x i32>, ptr addrspace(1) %y, i64 %id.zext
  tail call void @llvm.masked.store.v4i32.p1(<4 x i32> %value, ptr addrspace(1) %store.ptr, i32 16, <4 x i1> %store.cond.vec)
  ret void
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.amdgcn.workitem.id.x() #1

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
declare <4 x i32> @llvm.masked.load.v4i32.p1(ptr addrspace(1) nocapture, i32 immarg, <4 x i1>, <4 x i32>) #2

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write)
declare void @llvm.masked.store.v4i32.p1(<4 x i32>, ptr addrspace(1) nocapture, i32 immarg, <4 x i1>) #3

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) }

!0 = !{i32 128, i32 1, i32 1}

I get assembly (at `-O3`) like

	.text
	.amdgcn_target "amdgcn-amd-amdhsa--gfx90a"
	.amdhsa_code_object_version 5
	.protected	masked_intrinsics       ; -- Begin function masked_intrinsics
	.globl	masked_intrinsics
	.p2align	8
	.type	masked_intrinsics,@function
masked_intrinsics:                      ; @masked_intrinsics
; %bb.0:                                ; %entry
	v_cmp_gt_u32_e32 vcc, 16, v0
	s_load_dwordx4 s[0:3], s[4:5], 0x0
	v_cndmask_b32_e64 v1, 0, 1, vcc
	v_lshlrev_b16_e32 v2, 1, v1
	v_or_b32_e32 v1, v1, v2
	v_lshlrev_b16_e32 v2, 2, v1
	v_and_b32_e32 v1, 3, v1
	s_mov_b32 s4, 0
	v_or_b32_e32 v8, v1, v2
	v_lshlrev_b32_e32 v1, 4, v0
	s_waitcnt lgkmcnt(0)
	v_mov_b32_e32 v2, s1
	v_add_co_u32_e64 v6, s[0:1], s0, v1
	s_mov_b32 s5, s4
	v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1]
	s_mov_b32 s6, s4
	s_mov_b32 s7, s4
	v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
	v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1]
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_2
; %bb.1:                                ; %cond.load
	global_load_dword v2, v[6:7], off
	v_mov_b32_e32 v3, 0
	v_mov_b32_e32 v4, v3
	v_mov_b32_e32 v5, v3
.LBB0_2:                                ; %else
	s_or_b64 exec, exec, s[0:1]
	v_lshrrev_b16_e32 v9, 1, v8
	v_and_b32_e32 v9, 1, v9
	v_cmp_eq_u32_e32 vcc, 1, v9
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_4
; %bb.3:                                ; %cond.load1
	global_load_dword v3, v[6:7], off offset:4
.LBB0_4:                                ; %else2
	s_or_b64 exec, exec, s[0:1]
	v_lshrrev_b16_e32 v9, 2, v8
	v_and_b32_e32 v9, 1, v9
	v_cmp_eq_u32_e32 vcc, 1, v9
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_6
; %bb.5:                                ; %cond.load4
	global_load_dword v4, v[6:7], off offset:8
.LBB0_6:                                ; %else5
	s_or_b64 exec, exec, s[0:1]
	v_lshrrev_b16_e32 v8, 3, v8
	v_and_b32_e32 v8, 1, v8
	v_cmp_eq_u32_e32 vcc, 1, v8
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_8
; %bb.7:                                ; %cond.load7
	global_load_dword v5, v[6:7], off offset:12
.LBB0_8:                                ; %else8
	s_or_b64 exec, exec, s[0:1]
	v_cmp_gt_u32_e32 vcc, 32, v0
	v_cndmask_b32_e64 v0, 0, 1, vcc
	v_lshlrev_b16_e32 v6, 1, v0
	v_or_b32_e32 v0, v0, v6
	v_lshlrev_b16_e32 v6, 2, v0
	v_and_b32_e32 v0, 3, v0
	v_or_b32_e32 v6, v0, v6
	v_mov_b32_e32 v7, s3
	v_add_co_u32_e64 v0, s[0:1], s2, v1
	v_addc_co_u32_e64 v1, s[0:1], 0, v7, s[0:1]
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_10
; %bb.9:                                ; %cond.store
	s_waitcnt vmcnt(0)
	global_store_dword v[0:1], v2, off
.LBB0_10:                               ; %else11
	s_or_b64 exec, exec, s[0:1]
	s_waitcnt vmcnt(0)
	v_lshrrev_b16_e32 v2, 1, v6
	v_and_b32_e32 v2, 1, v2
	v_cmp_eq_u32_e32 vcc, 1, v2
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_12
; %bb.11:                               ; %cond.store12
	global_store_dword v[0:1], v3, off offset:4
.LBB0_12:                               ; %else13
	s_or_b64 exec, exec, s[0:1]
	v_lshrrev_b16_e32 v2, 2, v6
	v_and_b32_e32 v2, 1, v2
	v_cmp_eq_u32_e32 vcc, 1, v2
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_14
; %bb.13:                               ; %cond.store14
	global_store_dword v[0:1], v4, off offset:8
.LBB0_14:                               ; %else15
	s_or_b64 exec, exec, s[0:1]
	v_lshrrev_b16_e32 v2, 3, v6
	v_and_b32_e32 v2, 1, v2
	v_cmp_eq_u32_e32 vcc, 1, v2
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_16
; %bb.15:                               ; %cond.store16
	global_store_dword v[0:1], v5, off offset:12
.LBB0_16:                               ; %else17
	s_endpgm
	.section	.rodata,"a",@progbits
	.p2align	6, 0x0
	.amdhsa_kernel masked_intrinsics
		.amdhsa_group_segment_fixed_size 0
		.amdhsa_private_segment_fixed_size 0
		.amdhsa_kernarg_size 16
		.amdhsa_user_sgpr_count 6
		.amdhsa_user_sgpr_private_segment_buffer 1
		.amdhsa_user_sgpr_dispatch_ptr 0
		.amdhsa_user_sgpr_queue_ptr 0
		.amdhsa_user_sgpr_kernarg_segment_ptr 1
		.amdhsa_user_sgpr_dispatch_id 0
		.amdhsa_user_sgpr_flat_scratch_init 0
		.amdhsa_user_sgpr_kernarg_preload_length 0
		.amdhsa_user_sgpr_kernarg_preload_offset 0
		.amdhsa_user_sgpr_private_segment_size 0
		.amdhsa_uses_dynamic_stack 0
		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
		.amdhsa_system_sgpr_workgroup_id_x 1
		.amdhsa_system_sgpr_workgroup_id_y 0
		.amdhsa_system_sgpr_workgroup_id_z 0
		.amdhsa_system_sgpr_workgroup_info 0
		.amdhsa_system_vgpr_workitem_id 0
		.amdhsa_next_free_vgpr 10
		.amdhsa_next_free_sgpr 8
		.amdhsa_accum_offset 12
		.amdhsa_reserve_vcc 1
		.amdhsa_reserve_flat_scratch 0
		.amdhsa_reserve_xnack_mask 1
		.amdhsa_float_round_mode_32 0
		.amdhsa_float_round_mode_16_64 0
		.amdhsa_float_denorm_mode_32 3
		.amdhsa_float_denorm_mode_16_64 3
		.amdhsa_dx10_clamp 1
		.amdhsa_ieee_mode 1
		.amdhsa_fp16_overflow 0
		.amdhsa_tg_split 0
		.amdhsa_exception_fp_ieee_invalid_op 0
		.amdhsa_exception_fp_denorm_src 0
		.amdhsa_exception_fp_ieee_div_zero 0
		.amdhsa_exception_fp_ieee_overflow 0
		.amdhsa_exception_fp_ieee_underflow 0
		.amdhsa_exception_fp_ieee_inexact 0
		.amdhsa_exception_int_div_zero 0
	.end_amdhsa_kernel
	.text
.Lfunc_end0:
	.size	masked_intrinsics, .Lfunc_end0-masked_intrinsics
                                        ; -- End function
	.section	.AMDGPU.csdata,"",@progbits
; Kernel info:
; codeLenInByte = 404
; NumSgprs: 12
; NumVgprs: 10
; NumAgprs: 0
; TotalNumVgprs: 10
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 12
; NumVGPRsForWavesPerEU: 10
; AccumOffset: 12
; Occupancy: 8
; WaveLimiterHint : 0
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2
; COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0
	.text
	.p2alignl 6, 3212836864
	.fill 256, 4, 3212836864
	.section	".note.GNU-stack","",@progbits
	.amdgpu_metadata
---
amdhsa.kernels:
  - .agpr_count:     0
    .args:
      - .address_space:  global
        .name:           x
        .offset:         0
        .size:           8
        .value_kind:     global_buffer
      - .address_space:  global
        .name:           !str y
        .offset:         8
        .size:           8
        .value_kind:     global_buffer
    .group_segment_fixed_size: 0
    .kernarg_segment_align: 8
    .kernarg_segment_size: 16
    .max_flat_workgroup_size: 128
    .name:           masked_intrinsics
    .private_segment_fixed_size: 0
    .reqd_workgroup_size:
      - 128
      - 1
      - 1
    .sgpr_count:     12
    .sgpr_spill_count: 0
    .symbol:         masked_intrinsics.kd
    .uniform_work_group_size: 1
    .uses_dynamic_stack: false
    .vgpr_count:     10
    .vgpr_spill_count: 0
    .wavefront_size: 64
amdhsa.target:   amdgcn-amd-amdhsa--gfx90a
amdhsa.version:
  - 1
  - 2
...

	.end_amdgpu_metadata

(`-global-isel` acts similarly)

Expected code

I expected that, with optimizations on, there would be one conditional guarding a `global_load_dwordx4` and one guarding a `global_store_dwordx4`. Instead, the masked load and store are scalarized into per-lane conditional operations.
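For reference, here is a hand-written sketch (illustrative only, not compiler output) of the simplified IR I would expect the uniform-mask case to be rewritten to: the splatted condition becomes a scalar branch around a full `<4 x i32>` load/store, which should then lower to a single exec-mask branch around each vector memory operation.

; Hand-written sketch of the expected simplification (not compiler output):
; the uniform mask condition becomes a scalar branch around a full
; <4 x i32> load/store instead of four per-lane conditional operations.
define protected amdgpu_kernel void @masked_intrinsics_expected(ptr addrspace(1) %x, ptr addrspace(1) %y) {
entry:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %id.zext = zext i32 %id to i64
  %load.cond = icmp ult i32 %id, 16
  %load.ptr = getelementptr <4 x i32>, ptr addrspace(1) %x, i64 %id.zext
  br i1 %load.cond, label %do.load, label %after.load

do.load:                                          ; whole-vector load, guarded once
  %loaded = load <4 x i32>, ptr addrspace(1) %load.ptr, align 16
  br label %after.load

after.load:                                       ; passthrough value when the mask is false
  %value = phi <4 x i32> [ %loaded, %do.load ], [ zeroinitializer, %entry ]
  %store.cond = icmp ult i32 %id, 32
  %store.ptr = getelementptr <4 x i32>, ptr addrspace(1) %y, i64 %id.zext
  br i1 %store.cond, label %do.store, label %exit

do.store:                                         ; whole-vector store, guarded once
  store <4 x i32> %value, ptr addrspace(1) %store.ptr, align 16
  br label %exit

exit:
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()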
