Skip to content

Commit 90cbd4a

Browse files
authored
[NVPTX] Add folding for cvt.rn.bf16x2.f32 (#116109)
1 parent 7b7ae72 commit 90cbd4a

File tree

5 files changed

+247
-145
lines changed

5 files changed

+247
-145
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,20 @@ let hasSideEffects = false in {
739739
def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">;
740740
}
741741

742+
// Matches an fpround node only when that node has exactly one use, so that
// folding it into a larger instruction does not leave a second consumer
// needing the scalar conversion as well.
def fpround_oneuse
    : PatFrag<(ops node:$src),
              (fpround node:$src),
              [{ return N->hasOneUse(); }]>;
745+
746+
// Fold two single-use f32 -> bf16 roundings that feed a build_vector into a
// single packed cvt.rn.bf16x2.f32.
//
// Lane order matters: build_vector operand 0 is the low 16-bit lane of the
// packed register (the unfused sequence emitted `mov.b32 %r, {lo, hi}`), but
// PTX `cvt.rn.bf16x2.f32 d, a, b` stores the conversion of `a` in the UPPER
// half of d and the conversion of `b` in the LOWER half. The high-lane source
// must therefore be passed first; passing the operands in build_vector order
// silently swaps the two result lanes.
//
// NOTE: FileCheck expectations generated with the previous (swapped) operand
// order need to be regenerated after this change.
def : Pat<(v2bf16 (build_vector (bf16 (fpround_oneuse Float32Regs:$lo)),
                                (bf16 (fpround_oneuse Float32Regs:$hi)))),
          (CVT_bf16x2_f32 Float32Regs:$hi, Float32Regs:$lo, CvtRN)>,
      Requires<[hasPTX<70>, hasSM<80>, hasBF16Math]>;
750+
751+
// Fold two single-use f32 -> f16 roundings that feed a build_vector into a
// single packed cvt.rn.f16x2.f32.
//
// Lane order matters: build_vector operand 0 is the low 16-bit lane of the
// packed register, but PTX `cvt.rn.f16x2.f32 d, a, b` stores the conversion
// of `a` in the UPPER half of d and the conversion of `b` in the LOWER half.
// The high-lane source must therefore be passed first; passing the operands
// in build_vector order silently swaps the two result lanes.
//
// NOTE: FileCheck expectations generated with the previous (swapped) operand
// order need to be regenerated after this change.
def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse Float32Regs:$lo)),
                               (f16 (fpround_oneuse Float32Regs:$hi)))),
          (CVT_f16x2_f32 Float32Regs:$hi, Float32Regs:$lo, CvtRN)>,
      Requires<[hasPTX<70>, hasSM<80>, useFP16Math]>;
755+
742756
//-----------------------------------
743757
// Selection instructions (selp)
744758
//-----------------------------------

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 18 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
204204
;
205205
; SM80-LABEL: test_faddx2(
206206
; SM80: {
207-
; SM80-NEXT: .reg .b16 %rs<7>;
207+
; SM80-NEXT: .reg .b16 %rs<5>;
208208
; SM80-NEXT: .reg .b32 %r<4>;
209209
; SM80-NEXT: .reg .f32 %f<7>;
210210
; SM80-EMPTY:
@@ -216,18 +216,16 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
216216
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
217217
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
218218
; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
219-
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
220219
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
221220
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
222221
; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
223-
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
224-
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
222+
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
225223
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
226224
; SM80-NEXT: ret;
227225
;
228226
; SM80-FTZ-LABEL: test_faddx2(
229227
; SM80-FTZ: {
230-
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
228+
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
231229
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
232230
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
233231
; SM80-FTZ-EMPTY:
@@ -239,12 +237,10 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
239237
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
240238
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
241239
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
242-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
243240
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
244241
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
245242
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4;
246-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
247-
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
243+
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
248244
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
249245
; SM80-FTZ-NEXT: ret;
250246
;
@@ -311,7 +307,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
311307
;
312308
; SM80-LABEL: test_fsubx2(
313309
; SM80: {
314-
; SM80-NEXT: .reg .b16 %rs<7>;
310+
; SM80-NEXT: .reg .b16 %rs<5>;
315311
; SM80-NEXT: .reg .b32 %r<4>;
316312
; SM80-NEXT: .reg .f32 %f<7>;
317313
; SM80-EMPTY:
@@ -323,18 +319,16 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
323319
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
324320
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
325321
; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
326-
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
327322
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
328323
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
329324
; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
330-
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
331-
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
325+
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
332326
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
333327
; SM80-NEXT: ret;
334328
;
335329
; SM80-FTZ-LABEL: test_fsubx2(
336330
; SM80-FTZ: {
337-
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
331+
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
338332
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
339333
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
340334
; SM80-FTZ-EMPTY:
@@ -346,12 +340,10 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
346340
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
347341
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
348342
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
349-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
350343
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
351344
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
352345
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4;
353-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
354-
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
346+
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
355347
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
356348
; SM80-FTZ-NEXT: ret;
357349
;
@@ -418,7 +410,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
418410
;
419411
; SM80-LABEL: test_fmulx2(
420412
; SM80: {
421-
; SM80-NEXT: .reg .b16 %rs<7>;
413+
; SM80-NEXT: .reg .b16 %rs<5>;
422414
; SM80-NEXT: .reg .b32 %r<4>;
423415
; SM80-NEXT: .reg .f32 %f<7>;
424416
; SM80-EMPTY:
@@ -430,18 +422,16 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
430422
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
431423
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
432424
; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
433-
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
434425
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
435426
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
436427
; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
437-
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
438-
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
428+
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
439429
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
440430
; SM80-NEXT: ret;
441431
;
442432
; SM80-FTZ-LABEL: test_fmulx2(
443433
; SM80-FTZ: {
444-
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
434+
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
445435
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
446436
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
447437
; SM80-FTZ-EMPTY:
@@ -453,12 +443,10 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
453443
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
454444
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
455445
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1;
456-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
457446
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
458447
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
459448
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4;
460-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
461-
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
449+
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
462450
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
463451
; SM80-FTZ-NEXT: ret;
464452
;
@@ -525,7 +513,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
525513
;
526514
; SM80-LABEL: test_fdiv(
527515
; SM80: {
528-
; SM80-NEXT: .reg .b16 %rs<7>;
516+
; SM80-NEXT: .reg .b16 %rs<5>;
529517
; SM80-NEXT: .reg .b32 %r<4>;
530518
; SM80-NEXT: .reg .f32 %f<7>;
531519
; SM80-EMPTY:
@@ -537,18 +525,16 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
537525
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
538526
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
539527
; SM80-NEXT: div.rn.f32 %f3, %f2, %f1;
540-
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
541528
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
542529
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
543530
; SM80-NEXT: div.rn.f32 %f6, %f5, %f4;
544-
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
545-
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
531+
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
546532
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
547533
; SM80-NEXT: ret;
548534
;
549535
; SM80-FTZ-LABEL: test_fdiv(
550536
; SM80-FTZ: {
551-
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
537+
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
552538
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
553539
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
554540
; SM80-FTZ-EMPTY:
@@ -560,18 +546,16 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
560546
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
561547
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
562548
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1;
563-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
564549
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
565550
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
566551
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4;
567-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
568-
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
552+
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
569553
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
570554
; SM80-FTZ-NEXT: ret;
571555
;
572556
; SM90-LABEL: test_fdiv(
573557
; SM90: {
574-
; SM90-NEXT: .reg .b16 %rs<7>;
558+
; SM90-NEXT: .reg .b16 %rs<5>;
575559
; SM90-NEXT: .reg .b32 %r<4>;
576560
; SM90-NEXT: .reg .f32 %f<7>;
577561
; SM90-EMPTY:
@@ -583,12 +567,10 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
583567
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
584568
; SM90-NEXT: cvt.f32.bf16 %f2, %rs4;
585569
; SM90-NEXT: div.rn.f32 %f3, %f2, %f1;
586-
; SM90-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
587570
; SM90-NEXT: cvt.f32.bf16 %f4, %rs1;
588571
; SM90-NEXT: cvt.f32.bf16 %f5, %rs3;
589572
; SM90-NEXT: div.rn.f32 %f6, %f5, %f4;
590-
; SM90-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
591-
; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5};
573+
; SM90-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
592574
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
593575
; SM90-NEXT: ret;
594576
%r = fdiv <2 x bfloat> %a, %b

llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@ declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
1313
; CHECK-DAG: cvt.f32.bf16 [[AF1:%f[0-9]+]], [[A1]];
1414
; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
1515
; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
16-
; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]];
17-
; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]];
18-
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
16+
; CHECK: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[RF0]], [[RF1]]
1917
; CHECK: st.param.b32 [func_retval0], [[R]];
2018
; CHECK: ret;
2119
define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
@@ -30,9 +28,7 @@ define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
3028
; CHECK-DAG: cvt.f32.bf16 [[AF1:%f[0-9]+]], [[A1]];
3129
; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
3230
; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
33-
; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]];
34-
; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]];
35-
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
31+
; CHECK: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[RF0]], [[RF1]]
3632
; CHECK: st.param.b32 [func_retval0], [[R]];
3733
; CHECK: ret;
3834
define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 {

llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,7 @@ define <2 x bfloat> @test_ret_const() #0 {
2626
; SM80-DAG: cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]]
2727
; SM80-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
2828
; SM80-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
29-
; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]]
30-
; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]
31-
; SM80-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
29+
; SM80-DAG: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[FR0]], [[FR1]];
3230
;
3331
; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
3432
; CHECK-NEXT: ret;
@@ -68,9 +66,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
6866
; SM80-DAG: cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
6967
; SM80-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
7068
; SM80-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
71-
; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]];
72-
; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]];
73-
; SM80: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]};
69+
; SM80-DAG: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[FR0]], [[FR1]];
7470

7571
; CHECK: st.param.b32 [func_retval0], [[R]];
7672
; CHECK: ret;
@@ -93,9 +89,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
9389
; SM80-DAG: cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
9490
; SM80-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
9591
; SM80-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
96-
; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]];
97-
; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]];
98-
; SM80: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]};
92+
; SM80-DAG: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[FR0]], [[FR1]];
9993

10094
; CHECK: st.param.b32 [func_retval0], [[R]];
10195
; CHECK: ret;
@@ -116,9 +110,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
116110
; CHECK-DAG: cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
117111
; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
118112
; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
119-
; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]];
120-
; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]];
121-
; CHECK-NEXT: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
113+
; CHECK: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[FR0]], [[FR1]];
122114
; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
123115
; CHECK-NEXT: ret;
124116

@@ -287,9 +279,7 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
287279

288280
; CHECK-LABEL: test_fptrunc_2xfloat(
289281
; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
290-
; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[A0]];
291-
; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[A1]];
292-
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
282+
; CHECK: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[A0]], [[A1]];
293283
; CHECK: st.param.b32 [func_retval0], [[R]];
294284
; CHECK: ret;
295285
define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
@@ -359,9 +349,7 @@ declare <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bf
359349
; CHECK-DAG: cvt.f32.bf16 [[AF1:%f[0-9]+]], [[A1]];
360350
; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]];
361351
; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]];
362-
; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]];
363-
; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]];
364-
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
352+
; CHECK-DAG: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[RF0]], [[RF1]];
365353
; CHECK: st.param.b32 [func_retval0], [[R]];
366354
; CHECK: ret;
367355
define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 {
@@ -436,9 +424,7 @@ define <2 x bfloat> @test_maxnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
436424
; SM80-DAG: cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
437425
; SM80-DAG: cvt.rmi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]];
438426
; SM80-DAG: cvt.rmi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]];
439-
; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]];
440-
; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]];
441-
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
427+
; SM80: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[RF0]], [[RF1]];
442428
; CHECK: st.param.b32 [func_retval0], [[R]];
443429
; CHECK: ret;
444430
define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 {
@@ -455,9 +441,7 @@ define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 {
455441
; SM80-DAG: cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
456442
; SM80-DAG: cvt.rpi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]];
457443
; SM80-DAG: cvt.rpi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]];
458-
; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]];
459-
; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]];
460-
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
444+
; SM80: cvt.rn.bf16x2.f32 [[R:%r[0-9]+]], [[RF0]], [[RF1]];
461445
; CHECK: st.param.b32 [func_retval0], [[R]];
462446
; CHECK: ret;
463447
define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 {
@@ -470,7 +454,7 @@ define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 {
470454
; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
471455
; SM90: cvt.rzi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
472456
; SM90: cvt.rzi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
473-
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
457+
; SM90: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
474458
; CHECK: st.param.b32 [func_retval0], [[R]];
475459
; CHECK: ret;
476460
define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 {
@@ -483,7 +467,7 @@ define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 {
483467
; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
484468
; SM90: cvt.rni.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
485469
; SM90: cvt.rni.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
486-
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
470+
; SM90: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
487471
; CHECK: st.param.b32 [func_retval0], [[R]];
488472
; CHECK: ret;
489473
define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 {

0 commit comments

Comments
 (0)