version 0.2.3 : added _precise C functions for cplxvecmul/cplxconjvecmul/cplxvecdiv/powerspect/magnitude

JishinMaster · JishinMaster · commit c6646ba3bf87 · 2022-10-25T12:47:37.000+02:00
diff --git a/README.md b/README.md
@@ -144,8 +144,8 @@ The following table is a work in progress, "X" means there is not yet an impleme
 | ceil128f                   | ceil256f                   | ceil512f                   | ceilf_C                   | ippsCeil_32f                 | X                           | X                    |
 | floor128f                  | floor256f                  | floor512f                  | floorf_C                  | ippsFloor_32f                | X                           | X                    |
 | trunc128f                  | trunc256f                  | trunc512f                  | truncf_C                  | ippsTrunc_32f                | X                           | X                    |
-| cplxvecmul128f             | cplxvecmul256f             | cplxvecmul512f             | cplxvecmul_C              | ippsMul_32fc_A24             | cplxvecmul_vec              | X                    |
-| cplxvecmul128f_split       | cplxvecmul256f_split       | cplxvecmul512f_split       | cplxvecmul_C_split        | X                            | cplxvecmul_vec_split        | X                    |
+| cplxvecmul128f             | cplxvecmul256f             | cplxvecmul512f             | cplxvecmul_C/precise      | ippsMul_32fc_A11/24          | cplxvecmul_vec              | X                    |
+| cplxvecmul128f_split       | cplxvecmul256f_split       | cplxvecmul512f_split       | cplxvecmul_C_split/precise| X                            | cplxvecmul_vec_split        | X                    |
 | cplxconjvecmul128f         | cplxconjvecmul256f         | cplxconjvecmul512f         | cplxconjvecmul_C          | ippsMulByConj_32fc_A24       | X                           | X                    |
 | cplxconjvecmul128f_split   | cplxconjvecmul256f_split   | cplxconjvecmul512f_split   | cplxconjvecmul_C_split    | X                            | X                           | X                    |
 | cplxconj128f               | cplxconj256f               | cplxconj512f               | cplxconj_C                | ippsConj_32fc_A24            | X                           | X                    |
diff --git a/avx512_mathfun.h b/avx512_mathfun.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/mysincosf.h b/mysincosf.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_test.c b/simd_test.c
diff --git a/simd_test_opencl.c b/simd_test_opencl.c
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils.h b/simd_utils.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
@@ -932,7 +932,19 @@ static inline void magnitudef_C_interleaved(complex32_t *src, float *dst, int le
 #pragma omp simd
 #endif
     for (int i = 0; i < len; i++) {
-        dst[i] = sqrtf(src[i].re * src[i].re + (src[i].im * src[i].im));
+        dst[i] = sqrtf((src[i].re * src[i].re) + src[i].im * src[i].im);
+    }
+}
+
+static inline void magnitudef_C_interleaved_precise(complex32_t *src, float *dst, int len)
+{
+#ifdef OMP
+#pragma omp simd
+#endif
+    for (int i = 0; i < len; i++) {
+        double srcRe_64 = (double) src[i].re;
+        double srcIm_64 = (double) src[i].im;
+        dst[i] = (float) (sqrt((srcRe_64 * srcRe_64) + srcIm_64 * srcIm_64));
     }
 }
 
@@ -942,18 +954,41 @@ static inline void magnitudef_C_split(float *srcRe, float *srcIm, float *dst, in
 #pragma omp simd
 #endif
     for (int i = 0; i < len; i++) {
-        dst[i] = sqrtf(srcRe[i] * srcRe[i] + (srcIm[i] * srcIm[i]));
+        dst[i] = sqrtf((srcRe[i] * srcRe[i]) + srcIm[i] * srcIm[i]);
     }
 }
 
+static inline void magnitudef_C_split_precise(float *srcRe, float *srcIm, float *dst, int len)
+{
+#ifdef OMP
+#pragma omp simd
+#endif
+    for (int i = 0; i < len; i++) {
+        double srcRe_64 = (double) srcRe[i];
+        double srcIm_64 = (double) srcIm[i];
+        dst[i] = (float) (sqrt((srcRe_64 * srcRe_64) + srcIm_64 * srcIm_64));
+    }
+}
 
 static inline void powerspectf_C_split(float *srcRe, float *srcIm, float *dst, int len)
 {
 #ifdef OMP
 #pragma omp simd
 #endif
     for (int i = 0; i < len; i++) {
-        dst[i] = srcRe[i] * srcRe[i] + (srcIm[i] * srcIm[i]);
+        dst[i] = (srcRe[i] * srcRe[i]) + srcIm[i] * srcIm[i];
+    }
+}
+
+static inline void powerspectf_C_split_precise(float *srcRe, float *srcIm, float *dst, int len)
+{
+#ifdef OMP
+#pragma omp simd
+#endif
+    for (int i = 0; i < len; i++) {
+        double srcRe_64 = (double) srcRe[i];
+        double srcIm_64 = (double) srcIm[i];
+        dst[i] = (float) ((srcRe_64 * srcRe_64) + srcIm_64 * srcIm_64);
     }
 }
 
@@ -963,7 +998,19 @@ static inline void powerspectf_C_interleaved(complex32_t *src, float *dst, int l
 #pragma omp simd
 #endif
     for (int i = 0; i < len; i++) {
-        dst[i] = src[i].re * src[i].re + (src[i].im * src[i].im);
+        dst[i] = (src[i].re * src[i].re) + src[i].im * src[i].im;
+    }
+}
+
+static inline void powerspectf_C_interleaved_precise(complex32_t *src, float *dst, int len)
+{
+#ifdef OMP
+#pragma omp simd
+#endif
+    for (int i = 0; i < len; i++) {
+        double srcRe_64 = (double) src[i].re;
+        double srcIm_64 = (double) src[i].im;
+        dst[i] = (float) ((srcRe_64 * srcRe_64) + srcIm_64 * srcIm_64);
     }
 }
 
@@ -1305,6 +1352,22 @@ static inline void cplxvecdiv_C(complex32_t *src1, complex32_t *src2, complex32_
     }
 }
 
+static inline void cplxvecdiv_C_precise(complex32_t *src1, complex32_t *src2, complex32_t *dst, int len)
+{
+#ifdef OMP
+#pragma omp simd
+#endif
+    for (int i = 0; i < len; i++) {
+        double src1Re_64 = (double) src1[i].re;
+        double src1Im_64 = (double) src1[i].im;
+        double src2Re_64 = (double) src2[i].re;
+        double src2Im_64 = (double) src2[i].im;
+        double c2d2 = src2Re_64 * src2Re_64 + src2Im_64 * src2Im_64;
+        dst[i].re = (float) ((src1Re_64 * src2Re_64 + (src1Im_64 * src2Im_64)) / c2d2);
+        dst[i].im = (float) ((-src1Re_64 * src2Im_64 + (src2Re_64 * src1Im_64)) / c2d2);
+    }
+}
+
 
 static inline void cplxvecdiv_C_split(float *src1Re, float *src1Im, float *src2Re, float *src2Im, float *dstRe, float *dstIm, int len)
 {
@@ -1318,6 +1381,22 @@ static inline void cplxvecdiv_C_split(float *src1Re, float *src1Im, float *src2R
     }
 }
 
+static inline void cplxvecdiv_C_split_precise(float *src1Re, float *src1Im, float *src2Re, float *src2Im, float *dstRe, float *dstIm, int len)
+{
+#ifdef OMP
+#pragma omp simd
+#endif
+    for (int i = 0; i < len; i++) {
+        double src1Re_64 = (double) src1Re[i];
+        double src1Im_64 = (double) src1Im[i];
+        double src2Re_64 = (double) src2Re[i];
+        double src2Im_64 = (double) src2Im[i];
+        double c2d2 = src2Re_64 * src2Re_64 + src2Im_64 * src2Im_64;
+        dstRe[i] = (float) ((src1Re_64 * src2Re_64 + (src1Im_64 * src2Im_64)) / c2d2);
+        dstIm[i] = (float) ((-src1Re_64 * src2Im_64 + (src2Re_64 * src1Im_64)) / c2d2);
+    }
+}
+
 static inline void cplxvecmul_C(complex32_t *src1, complex32_t *src2, complex32_t *dst, int len)
 {
 #ifdef OMP
@@ -1329,6 +1408,21 @@ static inline void cplxvecmul_C(complex32_t *src1, complex32_t *src2, complex32_
     }
 }
 
+static inline void cplxvecmul_C_precise(complex32_t *src1, complex32_t *src2, complex32_t *dst, int len)
+{
+#ifdef OMP
+#pragma omp simd
+#endif
+    for (int i = 0; i < len; i++) {
+        double src1Re_64 = (double) src1[i].re;
+        double src1Im_64 = (double) src1[i].im;
+        double src2Re_64 = (double) src2[i].re;
+        double src2Im_64 = (double) src2[i].im;
+        dst[i].re = (float) ((src1Re_64 * src2Re_64) - src1Im_64 * src2Im_64);
+        dst[i].im = (float) (src1Re_64 * src2Im_64 + (src2Re_64 * src1Im_64));
+    }
+}
+
 static inline void cplxvecmul_C_unrolled8(complex32_t *src1, complex32_t *src2, complex32_t *dst, int len)
 {
     int stop_len = len / 8;
@@ -1361,29 +1455,32 @@ static inline void cplxvecmul_C_unrolled8(complex32_t *src1, complex32_t *src2,
     }
 }
 
-static inline void cplxvecmul_C2(complex32_t *src1, complex32_t *src2, complex32_t *dst, int len)
+static inline void cplxvecmul_C_split(float *src1Re, float *src1Im, float *src2Re, float *src2Im, float *dstRe, float *dstIm, int len)
 {
 #ifdef OMP
 #pragma omp simd
 #endif
     for (int i = 0; i < len; i++) {
-        dst[i].re = (float) ((double) src1[i].re * (double) src2[i].re - (double) src1[i].im * (double) src2[i].im);
-        dst[i].im = (float) ((double) src1[i].re * (double) src2[i].im + (double) src2[i].re * (double) src1[i].im);
+        dstRe[i] = (src1Re[i] * src2Re[i]) - src1Im[i] * src2Im[i];
+        dstIm[i] = src1Re[i] * src2Im[i] + (src2Re[i] * src1Im[i]);
     }
 }
 
-static inline void cplxvecmul_C_split(float *src1Re, float *src1Im, float *src2Re, float *src2Im, float *dstRe, float *dstIm, int len)
+static inline void cplxvecmul_C_split_precise(float *src1Re, float *src1Im, float *src2Re, float *src2Im, float *dstRe, float *dstIm, int len)
 {
 #ifdef OMP
 #pragma omp simd
 #endif
     for (int i = 0; i < len; i++) {
-        dstRe[i] = (src1Re[i] * src2Re[i]) - src1Im[i] * src2Im[i];
-        dstIm[i] = src1Re[i] * src2Im[i] + (src2Re[i] * src1Im[i]);
+        double src1Re_64 = (double) src1Re[i];
+        double src1Im_64 = (double) src1Im[i];
+        double src2Re_64 = (double) src2Re[i];
+        double src2Im_64 = (double) src2Im[i];
+        dstRe[i] = (float) ((src1Re_64 * src2Re_64) - src1Im_64 * src2Im_64);
+        dstIm[i] = (float) (src1Re_64 * src2Im_64 + (src2Re_64 * src1Im_64));
     }
 }
 
-
 static inline void cplxconjvecmul_C(complex32_t *src1, complex32_t *src2, complex32_t *dst, int len)
 {
 #ifdef OMP
@@ -1395,7 +1492,7 @@ static inline void cplxconjvecmul_C(complex32_t *src1, complex32_t *src2, comple
     }
 }
 
-static inline void cplxconjvecmul_C2(complex32_t *src1, complex32_t *src2, complex32_t *dst, int len)
+static inline void cplxconjvecmul_C_precise(complex32_t *src1, complex32_t *src2, complex32_t *dst, int len)
 {
 #ifdef OMP
 #pragma omp simd
@@ -1417,7 +1514,7 @@ static inline void cplxconjvecmul_C_split(float *src1Re, float *src1Im, float *s
     }
 }
 
-static inline void cplxconjvecmul_C_split2(float *src1Re, float *src1Im, float *src2Re, float *src2Im, float *dstRe, float *dstIm, int len)
+static inline void cplxconjvecmul_C_split_precise(float *src1Re, float *src1Im, float *src2Re, float *src2Im, float *dstRe, float *dstIm, int len)
 {
 #ifdef OMP
 #pragma omp simd
diff --git a/simd_utils_altivec_float.h b/simd_utils_altivec_float.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_avx512_double.h b/simd_utils_avx512_double.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_avx512_float.h b/simd_utils_avx512_float.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_avx512_int32.h b/simd_utils_avx512_int32.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_avx_double.h b/simd_utils_avx_double.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_avx_float.h b/simd_utils_avx_float.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_avx_int32.h b/simd_utils_avx_int32.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_constants.h b/simd_utils_constants.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_kernel.cl b/simd_utils_kernel.cl
@@ -2,7 +2,7 @@
 
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_riscv.h b/simd_utils_riscv.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_sse_double.h b/simd_utils_sse_double.h
@@ -1,6 +1,6 @@
 /*
  * Project : SIMD_Utils
- * Version : 0.2.2
+ * Version : 0.2.3
  * Author  : JishinMaster
  * Licence : BSD-2
  */
diff --git a/simd_utils_sse_float.h b/simd_utils_sse_float.h
diff --git a/simd_utils_sse_int32.h b/simd_utils_sse_int32.h
diff --git a/simd_utils_svml.h b/simd_utils_svml.h