Skip to content

Fix BLAS and LAPACK tests for RVV 1.0 target, update to 0.12.0 intrinsics #4456

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions kernel/riscv64/axpby_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
{
FLOAT_V_T vx, vy;

if ( n < 0 ) return(0);
if ( n <= 0 ) return(0);

if ( beta == 0.0 ) {
if ( alpha == 0.0 ) {
Expand All @@ -63,7 +63,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
BLASLONG stride_y = inc_y * sizeof(FLOAT);
size_t vl = VSETVL(n);
vy = VFMVVF_FLOAT(0.0, vl);
for ( ; n > 0; n -= vl, y += vl*stride_y) {
for ( ; n > 0; n -= vl, y += vl*inc_y) {
vl = VSETVL(n);
VSSEV_FLOAT(y, stride_y, vy, vl);
}
Expand Down Expand Up @@ -126,10 +126,12 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *

} else {
if ((1 == inc_x) && (1 == inc_y)) {
for (size_t vl; n > 0; n -= vl, y += vl) {
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLEV_FLOAT(y, vl);
vy = VFMULVF_FLOAT(vy, beta, vl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
VSEV_FLOAT (y, vy, vl);
}
} else if (1 == inc_x) {
Expand Down
2 changes: 1 addition & 1 deletion kernel/riscv64/copy_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
if(n < 0) return(0);
if(n <= 0) return(0);

FLOAT_V_T v0;

Expand Down
51 changes: 42 additions & 9 deletions kernel/riscv64/gemm_ncopy_8_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define VSETVL(n) __riscv_vsetvl_e32m1(n)
#define FLOAT_V_T vfloat32m1_t
#define FLOAT_VX2_T vfloat32m1x2_t
#define FLOAT_VX4_T vfloat32m1x4_t
#define FLOAT_VX8_T vfloat32m1x8_t
#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2
#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4
#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8
#define VLEV_FLOAT __riscv_vle32_v_f32m1
#define VSEV_FLOAT __riscv_vse32_v_f32m1
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
#else
#define VSETVL(n) __riscv_vsetvl_e64m1(n)
#define FLOAT_V_T vfloat64m1_t
#define FLOAT_VX2_T vfloat64m1x2_t
#define FLOAT_VX4_T vfloat64m1x4_t
#define FLOAT_VX8_T vfloat64m1x8_t
#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2
#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4
#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8
#define VLEV_FLOAT __riscv_vle64_v_f64m1
#define VSEV_FLOAT __riscv_vse64_v_f64m1
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
#endif

// Optimizes the implementation in ../generic/gemm_ncopy_8.c
Expand All @@ -57,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
FLOAT *b_offset;

FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8;
FLOAT_VX2_T vx2;
FLOAT_VX4_T vx4;
FLOAT_VX8_T vx8;

size_t vl;

//fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
Expand Down Expand Up @@ -87,7 +103,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
v7 = VLEV_FLOAT(a_offset7, vl);
v8 = VLEV_FLOAT(a_offset8, vl);

VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl);
vx8 = VSET_VX8(vx8, 0, v1);
vx8 = VSET_VX8(vx8, 1, v2);
vx8 = VSET_VX8(vx8, 2, v3);
vx8 = VSET_VX8(vx8, 3, v4);
vx8 = VSET_VX8(vx8, 4, v5);
vx8 = VSET_VX8(vx8, 5, v6);
vx8 = VSET_VX8(vx8, 6, v7);
vx8 = VSET_VX8(vx8, 7, v8);

VSSEG8_FLOAT(b_offset, vx8, vl);

a_offset1 += vl;
a_offset2 += vl;
Expand Down Expand Up @@ -116,7 +141,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
v3 = VLEV_FLOAT(a_offset3, vl);
v4 = VLEV_FLOAT(a_offset4, vl);

VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl);
vx4 = VSET_VX4(vx4, 0, v1);
vx4 = VSET_VX4(vx4, 1, v2);
vx4 = VSET_VX4(vx4, 2, v3);
vx4 = VSET_VX4(vx4, 3, v4);

VSSEG4_FLOAT(b_offset, vx4, vl);

a_offset1 += vl;
a_offset2 += vl;
Expand All @@ -137,7 +167,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
v1 = VLEV_FLOAT(a_offset1, vl);
v2 = VLEV_FLOAT(a_offset2, vl);

VSSEG2_FLOAT(b_offset, v1, v2, vl);
vx2 = VSET_VX2(vx2, 0, v1);
vx2 = VSET_VX2(vx2, 1, v2);

VSSEG2_FLOAT(b_offset, vx2, vl);

a_offset1 += vl;
a_offset2 += vl;
Expand Down
71 changes: 40 additions & 31 deletions kernel/riscv64/gemm_tcopy_8_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define VSETVL(n) __riscv_vsetvl_e32m1(n)
#define FLOAT_V_T vfloat32m1_t
#define FLOAT_VX2_T vfloat32m1x2_t
#define FLOAT_VX4_T vfloat32m1x4_t
#define FLOAT_VX8_T vfloat32m1x8_t
#define VLEV_FLOAT __riscv_vle32_v_f32m1
#define VLSEV_FLOAT __riscv_vlse32_v_f32m1
#define VSEV_FLOAT __riscv_vse32_v_f32m1
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1
#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1
#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
#else
#define VSETVL(n) __riscv_vsetvl_e64m1(n)
#define FLOAT_V_T vfloat64m1_t
#define FLOAT_VX2_T vfloat64m1x2_t
#define FLOAT_VX4_T vfloat64m1x4_t
#define FLOAT_VX8_T vfloat64m1x8_t
#define VLEV_FLOAT __riscv_vle64_v_f64m1
#define VLSEV_FLOAT __riscv_vlse64_v_f64m1
#define VSEV_FLOAT __riscv_vse64_v_f64m1
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1
#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1
#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
#endif

int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
Expand All @@ -62,7 +68,10 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)

IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;

FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7;
FLOAT_V_T v0;
FLOAT_VX2_T vx2;
FLOAT_VX4_T vx4;
FLOAT_VX8_T vx8;

// fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);

Expand All @@ -83,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
for(i = (n >> 3); i > 0; i--) {
size_t vl = 8;

VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, vx8, vl);

aoffset1 += 8;
boffset1 += m * 8;
Expand All @@ -93,8 +102,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 4) {
size_t vl = 8;

VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, vx4, vl);

aoffset1 += 4;
boffset2 += 32;
Expand All @@ -103,8 +112,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 2) {
size_t vl = 8;

VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, v0, v1, vl);
vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, vx2, vl);

aoffset1 += 2;
boffset3 += 16;
Expand Down Expand Up @@ -133,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
for(i = (n >> 3); i > 0; i--) {
size_t vl = 4;

VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, vx8, vl);

aoffset1 += 8;
boffset1 += m * 8;
Expand All @@ -143,8 +152,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 4) {
size_t vl = 4;

VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, vx4, vl);

aoffset1 += 4;
boffset2 += 16;
Expand All @@ -153,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 2) {
size_t vl = 4;

VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, v0, v1, vl);
vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, vx2, vl);

aoffset1 += 2;
boffset3 += 8;
Expand All @@ -181,8 +190,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
for(i = (n >> 3); i > 0; i--) {
size_t vl = 2;

VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, vx8, vl);

aoffset1 += 8;
boffset1 += m * 8;
Expand All @@ -191,8 +200,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 4) {
size_t vl = 2;

VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, vx4, vl);

aoffset1 += 4;
boffset2 += 8;
Expand All @@ -201,8 +210,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 2) {
size_t vl = 2;

VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, v0, v1, vl);
vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, vx2, vl);

aoffset1 += 2;
boffset3 += 4;
Expand Down
23 changes: 17 additions & 6 deletions kernel/riscv64/izamax_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX __riscv_vsetvlmax_e64m4()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define FLOAT_VX2_T vfloat64m4x2_t
#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4
#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2
#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1
#define MASK_T vbool16_t
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16
Expand All @@ -61,10 +63,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX __riscv_vsetvlmax_e32m4()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define FLOAT_VX2_T vfloat32m4x2_t
#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4
#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4
#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2
#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8
Expand Down Expand Up @@ -93,6 +97,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return(max_index);

FLOAT_V_T vx0, vx1, v_max;
FLOAT_VX2_T vxx2;
UINT_V_T v_max_index;
MASK_T mask;

Expand All @@ -107,7 +112,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
vl = VSETVL(n);

VLSEG_FLOAT(&vx0, &vx1, x, vl);
vxx2 = VLSEG_FLOAT(x, vl);

vx0 = VGET_VX2(vxx2, 0);
vx1 = VGET_VX2(vxx2, 1);

vx0 = VFABSV_FLOAT(vx0, vl);
vx1 = VFABSV_FLOAT(vx1, vl);
Expand All @@ -129,7 +137,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
vl = VSETVL(n);

VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
vxx2 = VLSSEG_FLOAT(x, stride_x, vl);

vx0 = VGET_VX2(vxx2, 0);
vx1 = VGET_VX2(vxx2, 1);

vx0 = VFABSV_FLOAT(vx0, vl);
vx1 = VFABSV_FLOAT(vx1, vl);
Expand Down
Loading