JishinMaster
diff --git a/mysincosf.h b/mysincosf.h
+1-22
diff --git a/simd_test.c b/simd_test.c
+122-3
diff --git a/simd_utils.h b/simd_utils.h
+37-12
@@ -7,27 +7,6 @@
 
 #pragma once
 
-static float FOPI = 1.27323954473516;
-static float PIO4F = 0.7853981633974483096;
-
-/* Note, these constants are for a 32-bit significand: */
-static float DP1 = 0.7853851318359375;
-static float DP2 = 1.30315311253070831298828125e-5;
-static float DP3 = 3.03855025325309630e-11;
-static float lossth = 65536.;
-
-/* These are for a 24-bit significand: */
-/*static float DP1 = 0.78515625;
-static float DP2 = 2.4187564849853515625e-4;
-static float DP3 = 3.77489497744594108e-8;
-static float lossth = 8192.;*/
-
-static float T24M1 = 16777215.;
-
-static float sincof[] = {-1.9515295891E-4, 8.3321608736E-3, -1.6666654611E-1};
-static float coscof[] = {2.443315711809948E-005, -1.388731625493765E-003,
-                         4.166664568298827E-002};
-
 static inline int mysincosf(float xx, float *s, float *c)
 {
     float x, y, y1, y2, z;
@@ -66,7 +45,7 @@ static inline int mysincosf(float xx, float *s, float *c)
         x = x - y * PIO4F;
     } else {
         /* Extended precision modular arithmetic */
-        x = ((x - y * DP1) - y * DP2) - y * DP3;
+        x = ((x + y * minus_cephes_DP1) + y * minus_cephes_DP2) + y * minus_cephes_DP3;
     }
     /*einits();*/
     z = x * x;
 
@@ -38,6 +38,7 @@
 #include <mkl_vml.h>
 #endif
 
+#ifndef RISCV
 typedef ALIGN16_BEG union {
     float f[4];
     int i[4];
@@ -53,6 +54,8 @@ typedef ALIGN32_BEG union {
 
 #endif
 
+#endif
+
 float l2_err(float *test, float *ref, int len)
 {
     float l2_err = 0.0f;
@@ -1140,6 +1143,26 @@ int main(int argc, char **argv)
     l2_err(inout3, inout_ref, len);
 #endif
 
+#ifdef RISCV
+    clock_gettime(CLOCK_REALTIME, &start);
+    maxeveryf_vec(inout, inout2, inout3, len);
+    clock_gettime(CLOCK_REALTIME, &stop);
+
+    elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
+    printf("maxeveryf_vec %d %lf\n", len, elapsed);
+
+    clock_gettime(CLOCK_REALTIME, &start);
+    for (l = 0; l < loop; l++)
+        maxeveryf_vec(inout, inout2, inout3, len);
+
+    clock_gettime(CLOCK_REALTIME, &stop);
+
+    elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
+    printf("maxeveryf_vec %d %lf %0.3lf GFlops/s\n", len, elapsed, flops / (elapsed * 1e3));
+
+    l2_err(inout3, inout_ref, len);
+#endif
+
     printf("\n");
     /////////////////////////////////////////////////////////// MINEVERY //////////////////////////////////////////////////////////////////////////////
     printf("MINEVERY\n");
@@ -1234,6 +1257,26 @@ int main(int argc, char **argv)
     l2_err(inout3, inout_ref, len);
 #endif
 
+#ifdef RISCV
+    clock_gettime(CLOCK_REALTIME, &start);
+    mineveryf_vec(inout, inout2, inout3, len);
+    clock_gettime(CLOCK_REALTIME, &stop);
+
+    elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
+    printf("mineveryf_vec %d %lf\n", len, elapsed);
+
+    clock_gettime(CLOCK_REALTIME, &start);
+    for (l = 0; l < loop; l++)
+        mineveryf_vec(inout, inout2, inout3, len);
+
+    clock_gettime(CLOCK_REALTIME, &stop);
+
+    elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
+    printf("mineveryf_vec %d %lf %0.3lf GFlops/s\n", len, elapsed, flops / (elapsed * 1e3));
+
+    l2_err(inout3, inout_ref, len);
+#endif
+
     /*for (int i = 0; i < len; i++)
 {
 	printf("%f %f %f\n",inout[i],inout2[i],inout2_ref[i]);
@@ -1634,6 +1677,26 @@ printf("\n");
     printf("mean %f ref %f\n", mean, mean_ref);
 #endif
 
+#ifdef RISCV
+    clock_gettime(CLOCK_REALTIME, &start);
+    meanf_vec(inout, &mean, len);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
+    printf("meanf_vec %d %lf\n", len, elapsed);
+    printf("mean %f ref %f\n", mean, mean_ref);
+
+    clock_gettime(CLOCK_REALTIME, &start);
+    for (l = 0; l < loop; l++) {
+        meanf_vec(inout, &mean, len);
+    }
+    clock_gettime(CLOCK_REALTIME, &stop);
+    elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
+    printf("meanf_vec %d %lf\n", len, elapsed);
+
+    printf("mean %f ref %f\n", mean, mean_ref);
+#endif
+
+
     printf("\n");
     /////////////////////////////////////////////////////////// MAGNITUDE_SPLIT //////////////////////////////////////////////////////////////////////////////
     printf("MAGNITUDE_SPLIT\n");
@@ -1728,6 +1791,23 @@ printf("\n");
     l2_err(inout_ref, inout2_ref, len);
 #endif
 
+#ifdef RISCV
+    clock_gettime(CLOCK_REALTIME, &start);
+    magnitudef_split_vec(inout, inout2, inout2_ref, len);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
+    printf("magnitudef_split_vec %d %lf\n", len, elapsed);
+
+    clock_gettime(CLOCK_REALTIME, &start);
+    for (l = 0; l < loop; l++)
+        magnitudef_split_vec(inout, inout2, inout2_ref, len);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
+    printf("magnitudef_split_vec %d %lf\n", len, elapsed);
+
+    l2_err(inout_ref, inout2_ref, len);
+#endif
+
     printf("\n");
     /////////////////////////////////////////////////////////// MAGNITUDE_INTERLEAVE //////////////////////////////////////////////////////////////////////////////
     printf("MAGNITUDE_INTERLEAVE\n");
@@ -2042,7 +2122,7 @@ printf("\n");
     elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
     printf("cplxconjvecmul128f %d %lf %0.3lf GFlops/s\n", len, elapsed, flops / (elapsed * 1e3));
 
-    l2_err(inout_ref, inout2_ref, 2*len);
+    l2_err(inout_ref, inout2_ref, 2 * len);
 #endif
 
 #ifdef AVX
@@ -2059,7 +2139,7 @@ printf("\n");
     elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
     printf("cplxconjvecmul256f %d %lf %0.3lf GFlops/s\n", len, elapsed, flops / (elapsed * 1e3));
 
-    l2_err(inout_ref, inout2_ref, 2*len);
+    l2_err(inout_ref, inout2_ref, 2 * len);
 #endif
 
 #ifdef AVX512
@@ -2076,7 +2156,7 @@ printf("\n");
     elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
     printf("cplxconjvecmul512f %d %lf %0.3lf GFlops/s\n", len, elapsed, flops / (elapsed * 1e3));
 
-    l2_err(inout_ref, inout2_ref, 2*len);
+    l2_err(inout_ref, inout2_ref, 2 * len);
 #endif
     printf("\n");
 
@@ -2532,6 +2612,10 @@ printf("\n");
 
     flops = 34 * len;  //TODO : check the right theoretical value
 
+    for(int i = 0; i < len; i++){
+      inout[i] = -(float)len/16.0f + 0.1f*(float)i;
+    }
+
     clock_gettime(CLOCK_REALTIME, &start);
     sinf_C(inout, inout2_ref, len);
     clock_gettime(CLOCK_REALTIME, &stop);
@@ -2659,6 +2743,23 @@ printf("\n");
     l2_err(inout2_ref, inout2, len);
 #endif
 
+#ifdef RISCV
+    clock_gettime(CLOCK_REALTIME, &start);
+    sinf_vec(inout, inout2, len);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
+    printf("sinf_vec %d %lf\n", len, elapsed);
+
+    clock_gettime(CLOCK_REALTIME, &start);
+    for (l = 0; l < loop; l++)
+        sinf_vec(inout, inout2, len);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
+    printf("sinf_vec %d %lf %0.3lf GFlops/s\n", len, elapsed, flops / (elapsed * 1e3));
+    l2_err(inout2_ref, inout2, len);
+
+    //for(int i = 0; i < len; i++) printf("%f %f %f\n",inout[i], inout2[i], inout2_ref[i]);
+#endif
 
     printf("\n");
     /////////////////////////////////////////////////////////// COS //////////////////////////////////////////////////////////////////////////////
@@ -2935,6 +3036,24 @@ printf("\n");
     l2_err(inout2_ref, inout3, len);
 #endif
 
+/*
+#ifdef RISCV
+    clock_gettime(CLOCK_REALTIME, &start);
+    sincosf_vec(inout, inout2, inout3, len);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
+    printf("sincosf_vec %d %lf\n", len, elapsed);
+
+    clock_gettime(CLOCK_REALTIME, &start);
+    for (l = 0; l < loop; l++)
+        sincosf_vec(inout, inout2, inout3, len);
+    clock_gettime(CLOCK_REALTIME, &stop);
+    elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
+    printf("sincosf_vec %d %lf\n", len, elapsed);
+    l2_err(inout_ref, inout2, len);
+    l2_err(inout2_ref, inout3, len);
+#endif
+*/
     printf("\n");
     /////////////////////////////////////////////////////////// SINCOSD //////////////////////////////////////////////////////////////////////////////
     printf("SINCOSD\n");
 
@@ -22,6 +22,31 @@ extern "C" {
 #include <math.h>
 #include <stdint.h>
 
+static const float FOPI = 1.27323954473516f;       /* 4/pi */
+static const float PIO4F = 0.7853981633974483096f; /* pi/4 */
+
+/* Note, these constants are for a 32-bit significand: */
+/*
+static const float DP1 = 0.7853851318359375f;
+static const float DP2 = 1.30315311253070831298828125e-5f;
+static const float DP3 = 3.03855025325309630e-11f;
+static const float lossth = 65536.f;
+*/
+
+/* These are for a 24-bit significand: pi/4 split in three parts, stored negated so mysincosf() can add y * DPn */
+static const float minus_cephes_DP1 = -0.78515625f;
+static const float minus_cephes_DP2 = -2.4187564849853515625e-4f;
+static const float minus_cephes_DP3 = -3.77489497744594108e-8f;
+static const float lossth = 8192.f; /* NOTE(review): now const like its neighbours - confirm nothing assigns to it */
+
+static const float T24M1 = 16777215.f; /* 2^24 - 1 */
+
+static const float sincof[] = {-1.9515295891E-4f, 8.3321608736E-3f, -1.6666654611E-1f}; /* presumably sine polynomial coefficients - usage not shown here */
+static const float coscof[] = {2.443315711809948E-5f, -1.388731625493765E-3f,
+                               4.166664568298827E-2f}; /* presumably cosine polynomial coefficients - usage not shown here */
+static const int32_t sign_mask = -0x7FFFFFFF - 1;  /* bit pattern 0x80000000 (only bit 31 set), written without an out-of-range conversion */
+static const int32_t inv_sign_mask = 0x7FFFFFFF;   /* ~sign_mask, spelled as a literal: a static initializer must be a constant expression in C */
+
 #include "mysincosf.h"
 
 #define INVLN10 0.4342944819032518f     //0.4342944819f
@@ -89,7 +114,7 @@ typedef enum {
 static inline int isAligned(uintptr_t ptr, size_t alignment)
 {
 #ifndef ALWAYS_ALIGNED
-    if (((uintptr_t)(ptr) % alignment) == 0)
+    if (((uintptr_t) (ptr) % alignment) == 0)
         return 1;
     return 0;
 #else
@@ -100,8 +125,8 @@ static inline int isAligned(uintptr_t ptr, size_t alignment)
 static inline int areAligned2(uintptr_t ptr1, uintptr_t ptr2, size_t alignment)
 {
 #ifndef ALWAYS_ALIGNED
-    if (((uintptr_t)(ptr1) % alignment) == 0)
-        if (((uintptr_t)(ptr2) % alignment) == 0)
+    if (((uintptr_t) (ptr1) % alignment) == 0)
+        if (((uintptr_t) (ptr2) % alignment) == 0)
             return 1;
     return 0;
 #else
@@ -112,9 +137,9 @@ static inline int areAligned2(uintptr_t ptr1, uintptr_t ptr2, size_t alignment)
 static inline int areAligned3(uintptr_t ptr1, uintptr_t ptr2, uintptr_t ptr3, size_t alignment)
 {
 #ifndef ALWAYS_ALIGNED
-    if (((uintptr_t)(ptr1) % alignment) == 0)
-        if (((uintptr_t)(ptr2) % alignment) == 0)
-            if (((uintptr_t)(ptr3) % alignment) == 0)
+    if (((uintptr_t) (ptr1) % alignment) == 0)
+        if (((uintptr_t) (ptr2) % alignment) == 0)
+            if (((uintptr_t) (ptr3) % alignment) == 0)
                 return 1;
     return 0;
 #else
@@ -681,7 +706,7 @@ static inline int posix_memalign(void **pointer, size_t len, int alignement)
     void *p, *p0 = malloc(len + alignement);
     if (!p0)
         return (void *) NULL;
-    p = (void *) (((size_t) p0 + alignement) & (~((size_t)(alignement - 1))));
+    p = (void *) (((size_t) p0 + alignement) & (~((size_t) (alignement - 1))));
     *((void **) p - 1) = p0;
 
     *pointer = p;
@@ -694,7 +719,7 @@ static inline void *aligned_malloc(size_t len, int alignement)
     void *p, *p0 = malloc(len + alignement);
     if (!p0)
         return (void *) NULL;
-    p = (void *) (((size_t) p0 + alignement) & (~((size_t)(alignement - 1))));
+    p = (void *) (((size_t) p0 + alignement) & (~((size_t) (alignement - 1))));
     *((void **) p - 1) = p0;
     return p;
 }
@@ -944,31 +969,31 @@ static inline void convertFloat32ToU8_C(float *src, uint8_t *dst, int len, int r
 #endif
         for (int i = 0; i < len; i++) {
             float tmp = floorf(src[i] * scale_fact_mult);
-            dst[i] = (uint8_t)(tmp > 255.0f ? 255.0f : tmp);
+            dst[i] = (uint8_t) (tmp > 255.0f ? 255.0f : tmp);
         }
     } else if (rounding_mode == RndNear) {
 #ifdef OMP
 #pragma omp simd
 #endif
         for (int i = 0; i < len; i++) {
             float tmp = roundf(src[i] * scale_fact_mult);
-            dst[i] = (uint8_t)(tmp > 255.0f ? 255.0f : tmp);
+            dst[i] = (uint8_t) (tmp > 255.0f ? 255.0f : tmp);
         }
     } else if (rounding_mode == RndFinancial) {
 #ifdef OMP
 #pragma omp simd
 #endif
         for (int i = 0; i < len; i++) {
             float tmp = (roundf(src[i] * scale_fact_mult * 0.5f) / 2.0f);
-            dst[i] = (uint8_t)(tmp > 255.0f ? 255.0f : tmp);
+            dst[i] = (uint8_t) (tmp > 255.0f ? 255.0f : tmp);
         }
     } else {
 #ifdef OMP
 #pragma omp simd
 #endif
         for (int i = 0; i < len; i++) {
             float tmp = src[i] * scale_fact_mult;
-            dst[i] = (uint8_t)(tmp > 255.0f ? 255.0f : tmp);
+            dst[i] = (uint8_t) (tmp > 255.0f ? 255.0f : tmp);
         }
     }
 }