@@ -176,12 +176,7 @@ define amdgpu_kernel void @s_fabs_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
176
176
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
177
177
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
178
178
; VI-NEXT: s_waitcnt lgkmcnt(0)
179
- ; VI-NEXT: s_and_b32 s3, s2, 0x7fff
180
- ; VI-NEXT: s_lshr_b32 s2, s2, 16
181
- ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
182
- ; VI-NEXT: s_and_b32 s3, 0xffff, s3
183
- ; VI-NEXT: s_lshl_b32 s2, s2, 16
184
- ; VI-NEXT: s_or_b32 s2, s3, s2
179
+ ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
185
180
; VI-NEXT: v_mov_b32_e32 v0, s0
186
181
; VI-NEXT: v_mov_b32_e32 v1, s1
187
182
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -194,44 +189,22 @@ define amdgpu_kernel void @s_fabs_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
194
189
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
195
190
; GFX9-NEXT: v_mov_b32_e32 v0, 0
196
191
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
197
- ; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff
198
- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
199
- ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
200
- ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
192
+ ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
201
193
; GFX9-NEXT: v_mov_b32_e32 v1, s2
202
194
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
203
195
; GFX9-NEXT: s_endpgm
204
196
;
205
- ; GFX11-TRUE16-LABEL: s_fabs_v2bf16:
206
- ; GFX11-TRUE16: ; %bb.0:
207
- ; GFX11-TRUE16-NEXT: s_clause 0x1
208
- ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
209
- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
210
- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
211
- ; GFX11-TRUE16-NEXT: s_mov_b32 s3, s2
212
- ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
213
- ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0x7fff
214
- ; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
215
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
216
- ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
217
- ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
218
- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
219
- ; GFX11-TRUE16-NEXT: s_endpgm
220
- ;
221
- ; GFX11-FAKE16-LABEL: s_fabs_v2bf16:
222
- ; GFX11-FAKE16: ; %bb.0:
223
- ; GFX11-FAKE16-NEXT: s_clause 0x1
224
- ; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
225
- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
226
- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
227
- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
228
- ; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0x7fff
229
- ; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0x7fff
230
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
231
- ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
232
- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
233
- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
234
- ; GFX11-FAKE16-NEXT: s_endpgm
197
+ ; GFX11-LABEL: s_fabs_v2bf16:
198
+ ; GFX11: ; %bb.0:
199
+ ; GFX11-NEXT: s_clause 0x1
200
+ ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
201
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
202
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
203
+ ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
204
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
205
+ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
206
+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
207
+ ; GFX11-NEXT: s_endpgm
235
208
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16 (<2 x bfloat> %in )
236
209
store <2 x bfloat> %fabs , ptr addrspace (1 ) %out
237
210
ret void
@@ -492,59 +465,34 @@ define amdgpu_kernel void @v_fabs_v2bf16(ptr addrspace(1) %out, ptr addrspace(1)
492
465
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
493
466
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
494
467
; VI-NEXT: flat_load_dword v2, v[0:1]
495
- ; VI-NEXT: v_mov_b32_e32 v3, 0x7fff
496
468
; VI-NEXT: s_waitcnt vmcnt(0)
497
- ; VI-NEXT: v_and_b32_e32 v4, 0x7fff, v2
498
- ; VI-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
499
- ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
469
+ ; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
500
470
; VI-NEXT: flat_store_dword v[0:1], v2
501
471
; VI-NEXT: s_endpgm
502
472
;
503
473
; GFX9-LABEL: v_fabs_v2bf16:
504
474
; GFX9: ; %bb.0:
505
475
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
506
476
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
507
- ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
508
477
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
509
478
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
510
479
; GFX9-NEXT: s_waitcnt vmcnt(0)
511
- ; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff, v1
512
- ; GFX9-NEXT: v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
513
- ; GFX9-NEXT: s_mov_b32 s2, 0x5040100
514
- ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s2
480
+ ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
515
481
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
516
482
; GFX9-NEXT: s_endpgm
517
483
;
518
- ; GFX11-TRUE16-LABEL: v_fabs_v2bf16:
519
- ; GFX11-TRUE16: ; %bb.0:
520
- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
521
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
522
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
523
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
524
- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
525
- ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
526
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
527
- ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v1.l
528
- ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v1.h
529
- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
530
- ; GFX11-TRUE16-NEXT: s_endpgm
531
- ;
532
- ; GFX11-FAKE16-LABEL: v_fabs_v2bf16:
533
- ; GFX11-FAKE16: ; %bb.0:
534
- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
535
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
536
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
537
- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
538
- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
539
- ; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
540
- ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
541
- ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
542
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
543
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
544
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff, v2
545
- ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
546
- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
547
- ; GFX11-FAKE16-NEXT: s_endpgm
484
+ ; GFX11-LABEL: v_fabs_v2bf16:
485
+ ; GFX11: ; %bb.0:
486
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
487
+ ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
488
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
489
+ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
490
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
491
+ ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
492
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
493
+ ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
494
+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
495
+ ; GFX11-NEXT: s_endpgm
548
496
%tid = call i32 @llvm.amdgcn.workitem.id.x ()
549
497
%gep.in = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
550
498
%gep.out = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
@@ -661,12 +609,12 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad
661
609
; VI-NEXT: v_mov_b32_e32 v0, s0
662
610
; VI-NEXT: v_mov_b32_e32 v1, s1
663
611
; VI-NEXT: s_waitcnt vmcnt(0)
664
- ; VI-NEXT: v_and_b32_sdwa v4, v3 , v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
665
- ; VI-NEXT: v_and_b32_sdwa v3 , v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
666
- ; VI-NEXT: v_lshlrev_b32_e32 v5, 16 , v2
667
- ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000 , v2
668
- ; VI-NEXT: v_mul_f32_e32 v3, v3, v5
669
- ; VI-NEXT: v_mul_f32_e32 v2, v4, v2
612
+ ; VI-NEXT: v_lshlrev_b32_e32 v4, 16 , v2
613
+ ; VI-NEXT: v_and_b32_sdwa v5 , v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
614
+ ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000 , v2
615
+ ; VI-NEXT: v_and_b32_sdwa v2, v3 , v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
616
+ ; VI-NEXT: v_mul_f32_e32 v3, v5, v4
617
+ ; VI-NEXT: v_mul_f32_e32 v2, v2, v6
670
618
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
671
619
; VI-NEXT: v_bfe_u32 v6, v2, 16, 1
672
620
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
@@ -693,20 +641,20 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad
693
641
; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
694
642
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
695
643
; GFX9-NEXT: s_waitcnt vmcnt(0)
644
+ ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
696
645
; GFX9-NEXT: v_and_b32_sdwa v3, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
697
- ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
698
- ; GFX9-NEXT: v_and_b32_sdwa v2, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
699
- ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
700
- ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4
701
- ; GFX9-NEXT: v_mul_f32_e32 v0, v2, v0
702
- ; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1
703
- ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3
646
+ ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
647
+ ; GFX9-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
648
+ ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
649
+ ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
650
+ ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
651
+ ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
704
652
; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1
705
- ; GFX9-NEXT: v_add3_u32 v2, v2, v3 , s2
706
- ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
653
+ ; GFX9-NEXT: v_add3_u32 v3, v3, v2 , s2
654
+ ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
707
655
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0
708
656
; GFX9-NEXT: v_add3_u32 v5, v5, v0, s2
709
- ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2 , v4, vcc
657
+ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3 , v4, vcc
710
658
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
711
659
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
712
660
; GFX9-NEXT: s_mov_b32 s2, 0x7060302
@@ -846,24 +794,24 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
846
794
; VI-NEXT: s_and_b32 s1, s4, 0xffff0000
847
795
; VI-NEXT: s_movk_i32 s2, 0x7fff
848
796
; VI-NEXT: s_waitcnt vmcnt(0)
849
- ; VI-NEXT: v_and_b32_sdwa v4, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
850
- ; VI-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
851
- ; VI-NEXT: v_mul_f32_e32 v2 , s0, v2
852
- ; VI-NEXT: v_mul_f32_e32 v3 , s1, v4
853
- ; VI-NEXT: v_bfe_u32 v4, v2 , 16, 1
854
- ; VI-NEXT: v_bfe_u32 v6, v3 , 16, 1
855
- ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
856
- ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
797
+ ; VI-NEXT: v_and_b32_sdwa v4, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
798
+ ; VI-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
799
+ ; VI-NEXT: v_mul_f32_e32 v3 , s0, v4
800
+ ; VI-NEXT: v_mul_f32_e32 v2 , s1, v2
801
+ ; VI-NEXT: v_bfe_u32 v4, v3 , 16, 1
802
+ ; VI-NEXT: v_bfe_u32 v6, v2 , 16, 1
803
+ ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
804
+ ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
857
805
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v4
858
806
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
859
- ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
860
- ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
861
- ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
862
- ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
807
+ ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
863
808
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
864
- ; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
865
- ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
866
- ; VI-NEXT: v_alignbit_b32 v2, v3, v2, 16
809
+ ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
810
+ ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
811
+ ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
812
+ ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
813
+ ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
814
+ ; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16
867
815
; VI-NEXT: flat_store_dword v[0:1], v2
868
816
; VI-NEXT: s_endpgm
869
817
;
@@ -879,22 +827,22 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
879
827
; GFX9-NEXT: s_lshl_b32 s3, s4, 16
880
828
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff0000
881
829
; GFX9-NEXT: s_waitcnt vmcnt(0)
882
- ; GFX9-NEXT: v_and_b32_sdwa v2, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
883
- ; GFX9-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
884
- ; GFX9-NEXT: v_mul_f32_e32 v0, s3, v0
885
- ; GFX9-NEXT: v_mul_f32_e32 v2, s4, v2
886
- ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
887
- ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
888
- ; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1
889
- ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s2
890
- ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
891
- ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2
892
- ; GFX9-NEXT: v_add3_u32 v5, v5, v2, s2
893
- ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
830
+ ; GFX9-NEXT: v_and_b32_sdwa v2, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
831
+ ; GFX9-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
832
+ ; GFX9-NEXT: v_mul_f32_e32 v2, s3, v2
833
+ ; GFX9-NEXT: v_mul_f32_e32 v0, s4, v0
834
+ ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
835
+ ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
836
+ ; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1
837
+ ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s2
894
838
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
895
- ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
839
+ ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0
840
+ ; GFX9-NEXT: v_add3_u32 v5, v5, v0, s2
841
+ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
842
+ ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
843
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
896
844
; GFX9-NEXT: s_mov_b32 s2, 0x7060302
897
- ; GFX9-NEXT: v_perm_b32 v0, v2, v0 , s2
845
+ ; GFX9-NEXT: v_perm_b32 v0, v0, v2 , s2
898
846
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
899
847
; GFX9-NEXT: s_endpgm
900
848
;
@@ -1194,10 +1142,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2bf16(ptr addrspace(1) %in) #
1194
1142
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1195
1143
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1196
1144
; VI-NEXT: flat_load_dword v0, v[0:1]
1197
- ; VI-NEXT: v_mov_b32_e32 v1, 0x7fff
1198
1145
; VI-NEXT: s_waitcnt vmcnt(0)
1199
- ; VI-NEXT: v_and_b32_sdwa v1, v1 , v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1200
- ; VI-NEXT: v_and_b32_e32 v0, 0x7fff , v0
1146
+ ; VI-NEXT: v_and_b32_e32 v0, 0x7fff7fff , v0
1147
+ ; VI-NEXT: v_lshrrev_b32_e32 v1, 16 , v0
1201
1148
; VI-NEXT: flat_store_short v[0:1], v0
1202
1149
; VI-NEXT: s_waitcnt vmcnt(0)
1203
1150
; VI-NEXT: flat_store_short v[0:1], v1
@@ -1210,51 +1157,29 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2bf16(ptr addrspace(1) %in) #
1210
1157
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1211
1158
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1212
1159
; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
1213
- ; GFX9-NEXT: s_movk_i32 s0, 0x7fff
1214
1160
; GFX9-NEXT: s_waitcnt vmcnt(0)
1215
- ; GFX9-NEXT: v_and_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1216
- ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
1161
+ ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1217
1162
; GFX9-NEXT: global_store_short v[0:1], v0, off
1218
1163
; GFX9-NEXT: s_waitcnt vmcnt(0)
1219
- ; GFX9-NEXT: global_store_short v[0:1], v1 , off
1164
+ ; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0 , off
1220
1165
; GFX9-NEXT: s_waitcnt vmcnt(0)
1221
1166
; GFX9-NEXT: s_endpgm
1222
1167
;
1223
- ; GFX11-TRUE16-LABEL: v_extract_fabs_no_fold_v2bf16:
1224
- ; GFX11-TRUE16: ; %bb.0:
1225
- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
1226
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1227
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1228
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1229
- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1230
- ; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[0:1]
1231
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1232
- ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
1233
- ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.h
1234
- ; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off dlc
1235
- ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
1236
- ; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
1237
- ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
1238
- ; GFX11-TRUE16-NEXT: s_endpgm
1239
- ;
1240
- ; GFX11-FAKE16-LABEL: v_extract_fabs_no_fold_v2bf16:
1241
- ; GFX11-FAKE16: ; %bb.0:
1242
- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
1243
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1244
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1245
- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1246
- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1247
- ; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[0:1]
1248
- ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1249
- ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1250
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
1251
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1252
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
1253
- ; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off dlc
1254
- ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
1255
- ; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v1, off dlc
1256
- ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
1257
- ; GFX11-FAKE16-NEXT: s_endpgm
1168
+ ; GFX11-LABEL: v_extract_fabs_no_fold_v2bf16:
1169
+ ; GFX11: ; %bb.0:
1170
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
1171
+ ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1172
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1173
+ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1174
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1175
+ ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1176
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1177
+ ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1178
+ ; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
1179
+ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1180
+ ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
1181
+ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1182
+ ; GFX11-NEXT: s_endpgm
1258
1183
%tid = call i32 @llvm.amdgcn.workitem.id.x ()
1259
1184
%gep.in = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
1260
1185
%val = load <2 x bfloat>, ptr addrspace (1 ) %gep.in
0 commit comments