11; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 
22; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX-942 %s 
33; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s 
4+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s 
45
56; TODO: Add global-isel when it can support bf16 
67
@@ -9,6 +10,11 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
910; GCN:       ; %bb.0: 
1011; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0 
1112; GCN-NEXT:    ; return to shader part epilog 
13+ ; 
14+ ; GFX1250-LABEL: v_test_cvt_bf16_f32_v: 
15+ ; GFX1250:       ; %bb.0: 
16+ ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0 
17+ ; GFX1250-NEXT:    ; return to shader part epilog 
1218  %cvt  = fpext  bfloat %v  to  float 
1319  ret  float  %cvt 
1420}
@@ -19,6 +25,13 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
1925; GCN-NEXT:    s_lshl_b32 s0, s0, 16 
2026; GCN-NEXT:    v_mov_b32_e32 v0, s0 
2127; GCN-NEXT:    ; return to shader part epilog 
28+ ; 
29+ ; GFX1250-LABEL: v_test_cvt_bf16_f32_s: 
30+ ; GFX1250:       ; %bb.0: 
31+ ; GFX1250-NEXT:    s_lshl_b32 s0, s0, 16 
32+ ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) 
33+ ; GFX1250-NEXT:    v_mov_b32_e32 v0, s0 
34+ ; GFX1250-NEXT:    ; return to shader part epilog 
2235  %cvt  = fpext  bfloat %v  to  float 
2336  ret  float  %cvt 
2437}
@@ -47,6 +60,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
4760; GFX-950:       ; %bb.0: 
4861; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1 
4962; GFX-950-NEXT:    ; return to shader part epilog 
63+ ; 
64+ ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_v: 
65+ ; GFX1250:       ; %bb.0: 
66+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1 
67+ ; GFX1250-NEXT:    ; return to shader part epilog 
5068  %res  = fptrunc  <2  x float > %src  to  <2  x bfloat>
5169  %cast  = bitcast  <2  x bfloat> %res  to  float 
5270  ret  float  %cast 
@@ -80,6 +98,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
8098; GFX-950-NEXT:    v_mov_b32_e32 v0, s1 
8199; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, s0, v0 
82100; GFX-950-NEXT:    ; return to shader part epilog 
101+ ; 
102+ ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_s: 
103+ ; GFX1250:       ; %bb.0: 
104+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, s0, s1 
105+ ; GFX1250-NEXT:    ; return to shader part epilog 
83106  %res  = fptrunc  <2  x float > %src  to  <2  x bfloat>
84107  %cast  = bitcast  <2  x bfloat> %res  to  float 
85108  ret  float  %cast 
@@ -103,6 +126,13 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
103126; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
104127; GFX-950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0 
105128; GFX-950-NEXT:    ; return to shader part epilog 
129+ ; 
130+ ; GFX1250-LABEL: v_test_cvt_f32_bf16_v: 
131+ ; GFX1250:       ; %bb.0: 
132+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
133+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) 
134+ ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0 
135+ ; GFX1250-NEXT:    ; return to shader part epilog 
106136  %trunc  = fptrunc  float  %src  to  bfloat
107137  %ext  = fpext  bfloat %trunc  to  float 
108138  ret  float  %ext 
@@ -172,6 +202,38 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
172202; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc 
173203; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v4 
174204; GFX-950-NEXT:    ; return to shader part epilog 
205+ ; 
206+ ; GFX1250-LABEL: v_test_cvt_v2f64_v2bf16_v: 
207+ ; GFX1250:       ; %bb.0: 
208+ ; GFX1250-NEXT:    v_cvt_f32_f64_e32 v8, v[2:3] 
209+ ; GFX1250-NEXT:    v_cvt_f32_f64_e32 v9, v[0:1] 
210+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
211+ ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8 
212+ ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[6:7], v9 
213+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 
214+ ; GFX1250-NEXT:    v_cmp_gt_f64_e64 s1, |v[2:3]|, |v[4:5]| 
215+ ; GFX1250-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, v[2:3], v[4:5] 
216+ ; GFX1250-NEXT:    v_cmp_nlg_f64_e64 s0, v[0:1], v[6:7] 
217+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 
218+ ; GFX1250-NEXT:    v_cndmask_b32_e64 v2, -1, 1, s1 
219+ ; GFX1250-NEXT:    v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[6:7]| 
220+ ; GFX1250-NEXT:    v_dual_add_nc_u32 v1, v8, v2 :: v_dual_bitop2_b32 v10, 1, v8 bitop3:0x40 
221+ ; GFX1250-NEXT:    s_wait_alu 0xf1ff 
222+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 
223+ ; GFX1250-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s1 
224+ ; GFX1250-NEXT:    v_and_b32_e32 v11, 1, v9 
225+ ; GFX1250-NEXT:    v_cmp_eq_u32_e64 s1, 1, v10 
226+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 
227+ ; GFX1250-NEXT:    v_add_nc_u32_e32 v0, v9, v0 
228+ ; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, 1, v11 
229+ ; GFX1250-NEXT:    s_or_b32 vcc_lo, s1, vcc_lo 
230+ ; GFX1250-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc_lo 
231+ ; GFX1250-NEXT:    s_or_b32 vcc_lo, s2, s0 
232+ ; GFX1250-NEXT:    s_wait_alu 0xfffe 
233+ ; GFX1250-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo 
234+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) 
235+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1 
236+ ; GFX1250-NEXT:    ; return to shader part epilog 
175237  %res  = fptrunc  <2  x double > %src  to  <2  x bfloat>
176238  %cast  = bitcast  <2  x bfloat> %res  to  float 
177239  ret  float  %cast 
@@ -201,6 +263,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
201263; GFX-950:       ; %bb.0: ; %entry 
202264; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1 
203265; GFX-950-NEXT:    ; return to shader part epilog 
266+ ; 
267+ ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16: 
268+ ; GFX1250:       ; %bb.0: ; %entry 
269+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1 
270+ ; GFX1250-NEXT:    ; return to shader part epilog 
204271entry:
205272  %a.cvt  = fptrunc  float  %a  to  bfloat
206273  %b.cvt  = fptrunc  float  %b  to  bfloat
@@ -236,6 +303,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
236303; GFX-950:       ; %bb.0: ; %entry 
237304; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, |v1| 
238305; GFX-950-NEXT:    ; return to shader part epilog 
306+ ; 
307+ ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16_mods: 
308+ ; GFX1250:       ; %bb.0: ; %entry 
309+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, |v1| 
310+ ; GFX1250-NEXT:    ; return to shader part epilog 
239311entry:
240312  %a.neg  = fneg float  %a 
241313  %a.cvt  = fptrunc  float  %a.neg  to  bfloat
@@ -269,6 +341,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
269341; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
270342; GFX-950-NEXT:    flat_store_short v[2:3], v0 
271343; GFX-950-NEXT:    s_endpgm 
344+ ; 
345+ ; GFX1250-LABEL: fptrunc_f32_to_bf16: 
346+ ; GFX1250:       ; %bb.0: ; %entry 
347+ ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 
348+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
349+ ; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 
350+ ; GFX1250-NEXT:    s_endpgm 
272351entry:
273352  %a.cvt  = fptrunc  float  %a  to  bfloat
274353  store  bfloat %a.cvt , ptr  %out 
@@ -298,6 +377,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
298377; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, |v0|, s0 
299378; GFX-950-NEXT:    flat_store_short v[2:3], v0 
300379; GFX-950-NEXT:    s_endpgm 
380+ ; 
381+ ; GFX1250-LABEL: fptrunc_f32_to_bf16_abs: 
382+ ; GFX1250:       ; %bb.0: ; %entry 
383+ ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 
384+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, |v0|, s0 
385+ ; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 
386+ ; GFX1250-NEXT:    s_endpgm 
301387entry:
302388  %a.abs  = call  float  @llvm.fabs.f32 (float  %a )
303389  %a.cvt  = fptrunc  float  %a.abs  to  bfloat
@@ -328,6 +414,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
328414; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, s0 
329415; GFX-950-NEXT:    flat_store_short v[2:3], v0 
330416; GFX-950-NEXT:    s_endpgm 
417+ ; 
418+ ; GFX1250-LABEL: fptrunc_f32_to_bf16_neg: 
419+ ; GFX1250:       ; %bb.0: ; %entry 
420+ ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 
421+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, s0 
422+ ; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 
423+ ; GFX1250-NEXT:    s_endpgm 
331424entry:
332425  %a.neg  = fneg float  %a 
333426  %a.cvt  = fptrunc  float  %a.neg  to  bfloat
@@ -373,6 +466,24 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
373466; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
374467; GFX-950-NEXT:    flat_store_short v[2:3], v0 
375468; GFX-950-NEXT:    s_endpgm 
469+ ; 
470+ ; GFX1250-LABEL: fptrunc_f64_to_bf16: 
471+ ; GFX1250:       ; %bb.0: ; %entry 
472+ ; GFX1250-NEXT:    v_cvt_f32_f64_e32 v6, v[0:1] 
473+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
474+ ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6 
475+ ; GFX1250-NEXT:    v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]| 
476+ ; GFX1250-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5] 
477+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 
478+ ; GFX1250-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s0 
479+ ; GFX1250-NEXT:    v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40 
480+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 
481+ ; GFX1250-NEXT:    v_cmp_eq_u32_e64 s0, 1, v7 
482+ ; GFX1250-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0 
483+ ; GFX1250-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo 
484+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
485+ ; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 
486+ ; GFX1250-NEXT:    s_endpgm 
376487entry:
377488  %a.cvt  = fptrunc  double  %a  to  bfloat
378489  store  bfloat %a.cvt , ptr  %out 
@@ -417,6 +528,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
417528; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
418529; GFX-950-NEXT:    flat_store_short v[2:3], v0 
419530; GFX-950-NEXT:    s_endpgm 
531+ ; 
532+ ; GFX1250-LABEL: fptrunc_f64_to_bf16_neg: 
533+ ; GFX1250:       ; %bb.0: ; %entry 
534+ ; GFX1250-NEXT:    v_cvt_f32_f64_e64 v6, -v[0:1] 
535+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
536+ ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6 
537+ ; GFX1250-NEXT:    v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]| 
538+ ; GFX1250-NEXT:    v_cmp_nlg_f64_e64 s0, -v[0:1], v[4:5] 
539+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 
540+ ; GFX1250-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s1 
541+ ; GFX1250-NEXT:    v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40 
542+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 
543+ ; GFX1250-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7 
544+ ; GFX1250-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo 
545+ ; GFX1250-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo 
546+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) 
547+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
548+ ; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 
549+ ; GFX1250-NEXT:    s_endpgm 
420550entry:
421551  %a.neg  = fneg double  %a 
422552  %a.cvt  = fptrunc  double  %a.neg  to  bfloat
@@ -462,6 +592,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
462592; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
463593; GFX-950-NEXT:    flat_store_short v[2:3], v0 
464594; GFX-950-NEXT:    s_endpgm 
595+ ; 
596+ ; GFX1250-LABEL: fptrunc_f64_to_bf16_abs: 
597+ ; GFX1250:       ; %bb.0: ; %entry 
598+ ; GFX1250-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]| 
599+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
600+ ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6 
601+ ; GFX1250-NEXT:    v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]| 
602+ ; GFX1250-NEXT:    v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5] 
603+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 
604+ ; GFX1250-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s1 
605+ ; GFX1250-NEXT:    v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40 
606+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 
607+ ; GFX1250-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7 
608+ ; GFX1250-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo 
609+ ; GFX1250-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo 
610+ ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) 
611+ ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0 
612+ ; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 
613+ ; GFX1250-NEXT:    s_endpgm 
465614entry:
466615  %a.abs  = call  double  @llvm.fabs.f64 (double  %a )
467616  %a.cvt  = fptrunc  double  %a.abs  to  bfloat
0 commit comments