Commit 04d10f1

[RISCV] Expand zvqdotq partial.reduce test variants
Make sure to cover all the scalable types which are legal, plus splitting, and make sure to cover all the instructions. Not duplicating the vx testing at this time.
1 parent 4042a00 commit 04d10f1
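
For context, every variant in this diff exercises the same IR shape: sign- or zero-extend two i8 vectors to i32, multiply, and feed the product into llvm.experimental.vector.partial.reduce.add with an accumulator that has fewer lanes than the product, so groups of product lanes are folded into the accumulator lanes. Below is a minimal sketch copied from the partial_reduce_nf2 variant added in this commit; the @sketch name is illustrative only.

define <vscale x 1 x i32> @sketch(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
entry:
  ; widen i8 -> i32 before multiplying
  %a.ext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
  %b.ext = sext <vscale x 4 x i8> %b to <vscale x 4 x i32>
  %mul = mul nuw nsw <vscale x 4 x i32> %a.ext, %b.ext
  ; the nxv4i32 product is reduced into an nxv1i32 accumulator; which product
  ; lanes land in which accumulator lane is left to the target
  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
  ret <vscale x 1 x i32> %res
}

The CHECK lines in the diff show the current expansion (widening multiplies, slides, and adds); a zvqdotq-aware lowering would presumably select vqdot/vqdotu/vqdotsu-style instructions for these shapes, as the new test names suggest.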

llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll

Lines changed: 271 additions & 6 deletions
@@ -523,8 +523,53 @@ entry:
 }
 
 
-define <vscale x 4 x i32> @vqdot_vv_partial_reduce(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: vqdot_vv_partial_reduce:
+define <vscale x 1 x i32> @partial_reduce_nf2(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
+; CHECK-LABEL: partial_reduce_nf2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsext.vf2 v10, v8
+; CHECK-NEXT: vsext.vf2 v11, v9
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vwmul.vv v8, v10, v11
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v10, v9, a0
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: vadd.vv v9, v11, v9
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+entry:
+%a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
+%b.sext = sext <vscale x 4 x i8> %b to <vscale x 4 x i32>
+%mul = mul nuw nsw <vscale x 4 x i32> %a.sext, %b.sext
+%res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
+ret <vscale x 1 x i32> %res
+}
+
+define <vscale x 2 x i32> @partial_reduce_m1(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v12, v8
+; CHECK-NEXT: vsext.vf2 v14, v9
+; CHECK-NEXT: vwmul.vv v8, v12, v14
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v11, v8
+; CHECK-NEXT: vadd.vv v9, v9, v10
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+entry:
+%a.sext = sext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+%b.sext = sext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+%mul = mul nuw nsw <vscale x 8 x i32> %a.sext, %b.sext
+%res = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 2 x i32> zeroinitializer, <vscale x 8 x i32> %mul)
+ret <vscale x 2 x i32> %res
+}
+
+define <vscale x 4 x i32> @partial_reduce_m2(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m2:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vsext.vf2 v16, v8
@@ -543,8 +588,178 @@ entry:
 ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 4 x i32> @vqdot_vv_partial_reduce2(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i32> %accum) {
-; CHECK-LABEL: vqdot_vv_partial_reduce2:
+define <vscale x 8 x i32> @partial_reduce_m4(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsext.vf2 v24, v8
+; CHECK-NEXT: vsext.vf2 v16, v10
+; CHECK-NEXT: vsext.vf2 v28, v12
+; CHECK-NEXT: vsext.vf2 v20, v14
+; CHECK-NEXT: vwmul.vv v8, v16, v20
+; CHECK-NEXT: vwmul.vv v16, v24, v28
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: vadd.vv v16, v20, v16
+; CHECK-NEXT: vadd.vv v8, v12, v8
+; CHECK-NEXT: vadd.vv v8, v8, v16
+; CHECK-NEXT: ret
+entry:
+%a.sext = sext <vscale x 32 x i8> %a to <vscale x 32 x i32>
+%b.sext = sext <vscale x 32 x i8> %b to <vscale x 32 x i32>
+%mul = mul nuw nsw <vscale x 32 x i32> %a.sext, %b.sext
+%res = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 8 x i32> zeroinitializer, <vscale x 32 x i32> %mul)
+ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 16 x i32> @partial_reduce_m8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsext.vf2 v24, v10
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v24, (a0) # vscale x 32-byte Folded Spill
+; CHECK-NEXT: vsext.vf2 v0, v8
+; CHECK-NEXT: vsext.vf2 v8, v18
+; CHECK-NEXT: vsext.vf2 v4, v16
+; CHECK-NEXT: vwmul.vv v24, v0, v4
+; CHECK-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; CHECK-NEXT: vwmacc.vv v24, v16, v8
+; CHECK-NEXT: vsext.vf2 v8, v12
+; CHECK-NEXT: vsext.vf2 v16, v20
+; CHECK-NEXT: vwmacc.vv v24, v8, v16
+; CHECK-NEXT: vsext.vf2 v8, v14
+; CHECK-NEXT: vsext.vf2 v12, v22
+; CHECK-NEXT: vwmacc.vv v24, v8, v12
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+entry:
+%a.sext = sext <vscale x 64 x i8> %a to <vscale x 64 x i32>
+%b.sext = sext <vscale x 64 x i8> %b to <vscale x 64 x i32>
+%mul = mul nuw nsw <vscale x 64 x i32> %a.sext, %b.sext
+%res = call <vscale x 16 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 16 x i32> zeroinitializer, <vscale x 64 x i32> %mul)
+ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 32 x i32> @partial_reduce_m16(<vscale x 128 x i8> %a, <vscale x 128 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vl8r.v v16, (a0)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsext.vf2 v4, v8
+; CHECK-NEXT: vsext.vf2 v0, v16
+; CHECK-NEXT: vwmul.vv v24, v4, v0
+; CHECK-NEXT: vsext.vf2 v4, v10
+; CHECK-NEXT: vsext.vf2 v8, v18
+; CHECK-NEXT: vwmacc.vv v24, v4, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vsext.vf2 v0, v12
+; CHECK-NEXT: vl8r.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v4, v20
+; CHECK-NEXT: vwmacc.vv v24, v0, v4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v20, v0
+; CHECK-NEXT: vsext.vf2 v16, v8
+; CHECK-NEXT: vwmul.vv v0, v20, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v20, v18
+; CHECK-NEXT: vsext.vf2 v16, v10
+; CHECK-NEXT: vwmacc.vv v0, v20, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v8, v20
+; CHECK-NEXT: vsext.vf2 v16, v12
+; CHECK-NEXT: vwmacc.vv v0, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v8, v22
+; CHECK-NEXT: vsext.vf2 v16, v14
+; CHECK-NEXT: vwmacc.vv v0, v8, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v8, v14
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v12, v22
+; CHECK-NEXT: vwmacc.vv v24, v8, v12
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: vmv8r.v v16, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+entry:
+%a.sext = sext <vscale x 128 x i8> %a to <vscale x 128 x i32>
+%b.sext = sext <vscale x 128 x i8> %b to <vscale x 128 x i32>
+%mul = mul nuw nsw <vscale x 128 x i32> %a.sext, %b.sext
+%res = call <vscale x 32 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 32 x i32> zeroinitializer, <vscale x 128 x i32> %mul)
+ret <vscale x 32 x i32> %res
+}
+
+define <vscale x 4 x i32> @partial_reduce_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i32> %accum) {
+; CHECK-LABEL: partial_reduce_accum:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vsext.vf2 v24, v8
@@ -564,8 +779,8 @@ entry:
 ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 16 x i32> @vqdot_vv_partial_reduce3(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: vqdot_vv_partial_reduce3:
+define <vscale x 16 x i32> @partial_reduce_via_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: partial_reduce_via_accum:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vsext.vf2 v16, v8
@@ -579,3 +794,53 @@ entry:
 %res = call <vscale x 16 x i32> @llvm.experimental.vector.partial.reduce.add.nvx16i32.nvx16i32(<vscale x 16 x i32> %mul, <vscale x 16 x i32> zeroinitializer)
 ret <vscale x 16 x i32> %res
 }
+
+define <vscale x 1 x i32> @partial_reduce_vqdotu(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
+; CHECK-LABEL: partial_reduce_vqdotu:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vwmulu.vv v10, v8, v9
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v8, v10
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v10, v9, a0
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: vadd.vv v9, v11, v9
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+entry:
+%a.sext = zext <vscale x 4 x i8> %a to <vscale x 4 x i32>
+%b.sext = zext <vscale x 4 x i8> %b to <vscale x 4 x i32>
+%mul = mul nuw nsw <vscale x 4 x i32> %a.sext, %b.sext
+%res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
+ret <vscale x 1 x i32> %res
+}
+
+define <vscale x 1 x i32> @partial_reduce_vqdotsu(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
+; CHECK-LABEL: partial_reduce_vqdotsu:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsext.vf2 v10, v8
+; CHECK-NEXT: vzext.vf2 v11, v9
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vwmulsu.vv v8, v10, v11
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v10, v9, a0
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: vadd.vv v9, v11, v9
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+entry:
+%a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
+%b.sext = zext <vscale x 4 x i8> %b to <vscale x 4 x i32>
+%mul = mul nuw nsw <vscale x 4 x i32> %a.sext, %b.sext
+%res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
+ret <vscale x 1 x i32> %res
+}
