@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
674674  * outf8  =  __builtin_amdgcn_cvt_scale_pk8_f32_fp4 (src1 , scale , 7 );
675675}
676676
677+ // CHECK-LABEL: @test_cvt_scalef32_pk( 
678+ // CHECK-NEXT:  entry: 
679+ // CHECK-NEXT:    [[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) 
680+ // CHECK-NEXT:    [[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5) 
681+ // CHECK-NEXT:    [[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) 
682+ // CHECK-NEXT:    [[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5) 
683+ // CHECK-NEXT:    [[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) 
684+ // CHECK-NEXT:    [[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5) 
685+ // CHECK-NEXT:    [[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) 
686+ // CHECK-NEXT:    [[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5) 
687+ // CHECK-NEXT:    [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) 
688+ // CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) 
689+ // CHECK-NEXT:    [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr 
690+ // CHECK-NEXT:    [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr 
691+ // CHECK-NEXT:    [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr 
692+ // CHECK-NEXT:    [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr 
693+ // CHECK-NEXT:    [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr 
694+ // CHECK-NEXT:    [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr 
695+ // CHECK-NEXT:    [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr 
696+ // CHECK-NEXT:    [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr 
697+ // CHECK-NEXT:    [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr 
698+ // CHECK-NEXT:    [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr 
699+ // CHECK-NEXT:    store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8 
700+ // CHECK-NEXT:    store <8 x bfloat> [[SRCBF8:%.*]], ptr [[SRCBF8_ADDR_ASCAST]], align 16 
701+ // CHECK-NEXT:    store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16 
702+ // CHECK-NEXT:    store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32 
703+ // CHECK-NEXT:    store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8 
704+ // CHECK-NEXT:    store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32 
705+ // CHECK-NEXT:    store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32 
706+ // CHECK-NEXT:    store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64 
707+ // CHECK-NEXT:    store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8 
708+ // CHECK-NEXT:    store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 
709+ // CHECK-NEXT:    [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 
710+ // CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 
711+ // CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], float [[TMP1]]) 
712+ // CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 
713+ // CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8 
714+ // CHECK-NEXT:    [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 
715+ // CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 
716+ // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> [[TMP4]], float [[TMP5]]) 
717+ // CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 
718+ // CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 
719+ // CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 
720+ // CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 
721+ // CHECK-NEXT:    [[TMP10:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> [[TMP8]], float [[TMP9]]) 
722+ // CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 
723+ // CHECK-NEXT:    store <2 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8 
724+ // CHECK-NEXT:    [[TMP12:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 
725+ // CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 
726+ // CHECK-NEXT:    [[TMP14:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> [[TMP12]], float [[TMP13]]) 
727+ // CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 
728+ // CHECK-NEXT:    store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 
729+ // CHECK-NEXT:    [[TMP16:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 
730+ // CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 
731+ // CHECK-NEXT:    [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> [[TMP16]], float [[TMP17]]) 
732+ // CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 
733+ // CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8 
734+ // CHECK-NEXT:    [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 
735+ // CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 
736+ // CHECK-NEXT:    [[TMP22:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> [[TMP20]], float [[TMP21]]) 
737+ // CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 
738+ // CHECK-NEXT:    store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 
739+ // CHECK-NEXT:    [[TMP24:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 
740+ // CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 
741+ // CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> [[TMP24]], float [[TMP25]]) 
742+ // CHECK-NEXT:    [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 
743+ // CHECK-NEXT:    store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 
744+ // CHECK-NEXT:    [[TMP28:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 
745+ // CHECK-NEXT:    [[TMP29:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 
746+ // CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> [[TMP28]], float [[TMP29]]) 
747+ // CHECK-NEXT:    [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 
748+ // CHECK-NEXT:    store i32 [[TMP30]], ptr addrspace(1) [[TMP31]], align 4 
749+ // CHECK-NEXT:    [[TMP32:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 
750+ // CHECK-NEXT:    [[TMP33:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 
751+ // CHECK-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> [[TMP32]], float [[TMP33]]) 
752+ // CHECK-NEXT:    [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 
753+ // CHECK-NEXT:    store i32 [[TMP34]], ptr addrspace(1) [[TMP35]], align 4 
754+ // CHECK-NEXT:    ret void 
755+ // 
// Codegen test for the __builtin_amdgcn_cvt_scalef32_pk8_* builtins.
// Each call passes one 8-element source vector (bfloat8, half8 or float8)
// together with `scale`. The fp8/bf8 results are stored through out2 and
// the fp4 results through out1.
//
// The CHECK-NEXT lines above match the emitted IR statement by statement, so
// the order of the assignments below must stay in sync with them.
//
// out3, srcbf16, srch16 and srcf16 are not referenced in the body, yet their
// allocas and stores are matched by the CHECK lines above; dropping them means
// regenerating those checks.
// NOTE(review): presumably reserved for 16-element variants - confirm.
756+ void  test_cvt_scalef32_pk (global  uint2  * out2 , bfloat8  srcbf8 , half8  srch8 , float8  srcf8 ,
757+                           global  uint3  * out3 , bfloat16  srcbf16 , half16  srch16 , float16  srcf16 ,
758+                           global  uint  * out1 , float  scale )
759+ {
  // fp8 / bf8 destinations: result stored through out2 (checked as <2 x i32>).
760+   * out2  =  __builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16 (srcbf8 , scale );
761+   * out2  =  __builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16 (srcbf8 , scale );
762+   * out2  =  __builtin_amdgcn_cvt_scalef32_pk8_fp8_f16 (srch8 , scale );
763+   * out2  =  __builtin_amdgcn_cvt_scalef32_pk8_bf8_f16 (srch8 , scale );
764+   * out2  =  __builtin_amdgcn_cvt_scalef32_pk8_fp8_f32 (srcf8 , scale );
765+   * out2  =  __builtin_amdgcn_cvt_scalef32_pk8_bf8_f32 (srcf8 , scale );
  // fp4 destinations: result stored through out1 (checked as i32).
766+   * out1  =  __builtin_amdgcn_cvt_scalef32_pk8_fp4_f32 (srcf8 , scale );
767+   * out1  =  __builtin_amdgcn_cvt_scalef32_pk8_fp4_f16 (srch8 , scale );
768+   * out1  =  __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16 (srcbf8 , scale );
769+ }
770+ 
677771// CHECK-LABEL: @test_sat_pk4_i4_i8( 
678772// CHECK-NEXT:  entry: 
679773// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) 
0 commit comments