@@ -314,8 +314,7 @@ SINT Vec<2*N,T> join(const Vec<N,T>& lo, const Vec<N,T>& hi) {
314314 }
315315#endif
316316
317- // Some operations we want are not expressible with Clang/GCC vector
318- // extensions, so we implement them using the recursive approach.
317+ // Some operations we want are not expressible with Clang/GCC vector extensions.
319318
320319// N == 1 scalar implementations.
321320SIT Vec<1 ,T> if_then_else (const Vec<1 ,M<T>>& cond, const Vec<1 ,T>& t, const Vec<1 ,T>& e) {
@@ -324,8 +323,6 @@ SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<
324323 (~cond & bit_pun<Vec<1 , M<T>>>(e)) );
325324}
326325
327- SIT Vec<1 ,T> pow (const Vec<1 ,T>& x, const Vec<1 ,T>& y) { return std::pow (x.val , y.val ); }
328-
329326// All default N != 1 implementations just recurse on lo and hi halves.
330327
331328// Clang can reason about naive_if_then_else() and optimize through it better
@@ -403,9 +400,6 @@ SINT bool all(const Vec<N,T>& x) {
403400 && all (x.hi );
404401}
405402
406- SINT Vec<N,T> pow (const Vec<N,T>& x, const Vec<N,T>& y) {
407- return join (pow (x.lo , y.lo ), pow (x.hi , y.hi ));
408- }
409403
410404// Scalar/vector operations just splat the scalar to a vector...
411405SINTU Vec<N,T> operator + (U x, const Vec<N,T>& y) { return Vec<N,T>(x) + y; }
@@ -421,8 +415,6 @@ SINTU Vec<N,M<T>> operator<=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) <= y;
421415SINTU Vec<N,M<T>> operator >=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) >= y; }
422416SINTU Vec<N,M<T>> operator < (U x, const Vec<N,T>& y) { return Vec<N,T>(x) < y; }
423417SINTU Vec<N,M<T>> operator > (U x, const Vec<N,T>& y) { return Vec<N,T>(x) > y; }
424- SINTU Vec<N,T> pow (U x, const Vec<N,T>& y) { return pow (Vec<N,T>(x), y); }
425-
426418// ... and same deal for vector/scalar operations.
427419SINTU Vec<N,T> operator + (const Vec<N,T>& x, U y) { return x + Vec<N,T>(y); }
428420SINTU Vec<N,T> operator - (const Vec<N,T>& x, U y) { return x - Vec<N,T>(y); }
@@ -437,7 +429,7 @@ SINTU Vec<N,M<T>> operator<=(const Vec<N,T>& x, U y) { return x <= Vec<N,T>(y);
437429SINTU Vec<N,M<T>> operator >=(const Vec<N,T>& x, U y) { return x >= Vec<N,T>(y); }
438430SINTU Vec<N,M<T>> operator < (const Vec<N,T>& x, U y) { return x < Vec<N,T>(y); }
439431SINTU Vec<N,M<T>> operator > (const Vec<N,T>& x, U y) { return x > Vec<N,T>(y); }
440- SINTU Vec<N,T> pow ( const Vec<N,T>& x, U y) { return pow (x, Vec<N,T>(y)); }
432+
441433
442434// The various op= operators, for vectors...
443435SINT Vec<N,T>& operator +=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x + y); }
@@ -505,16 +497,10 @@ SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>& x) {
505497#endif
506498}
507499
508- // fma() delivers a fused mul-add, even if that's really expensive.
509- SI Vec<1 ,float > fma (const Vec<1 ,float >& x, const Vec<1 ,float >& y, const Vec<1 ,float >& z) {
510- return std::fma (x.val , y.val , z.val );
511- }
512- SIN Vec<N,float > fma (const Vec<N,float >& x, const Vec<N,float >& y, const Vec<N,float >& z) {
513- return join (fma (x.lo , y.lo , z.lo ),
514- fma (x.hi , y.hi , z.hi ));
515- }
500+ // Call map(fn, x) for a vector with fn() applied to each lane of x, { fn(x[0]), fn(x[1]), ... },
501+ // or map(fn, x,y) for a vector of fn(x[i], y[i]), etc.
516502
// map() worker overload, shown as a diff: '-' lines are the old form map(x, fn, index_sequence),
// '+' lines are the new form map(index_sequence, fn, args...).
// New form: returns a Vec with sizeof...(I) lanes; the lane for each index I is fn(args[I]...),
// and the element type is taken from decltype(fn(args[0]...)).
517- template <int N, typename T , typename Fn, std:: size_t ... I>
503+ template <typename Fn , typename ... Args, size_t ... I>
518504#if defined(__clang__)
519505// CFI, specifically -fsanitize=cfi-icall, seems to give a false positive here,
520506// with errors like "control flow integrity check for type 'float (float)
@@ -523,25 +509,37 @@ template <int N, typename T, typename Fn, std::size_t... I>
523509// So, stifle CFI in this function.
// NOTE(review): the attribute below is attached to map() itself, but in the '+' version the call
// to fn happens inside the `lane` lambda, not directly in map() -- confirm the attribute still
// suppresses the check for that call. Also confirm the sanitizer name: it reads " cfi" with a
// leading space here, which may only be a formatting artifact of this listing.
524510__attribute__ ((no_sanitize(" cfi" )))
525511#endif
526- SI auto map (const skvx::Vec<N,T>& x, Fn&& fn,
527- std::index_sequence<I...> ix = {}) -> skvx::Vec<N, decltype(fn(x[0 ]))> {
528- if /* constexpr*/ (sizeof ...(I) == 0 ) {
529- // When called as map(x, fn), bootstrap the index_sequence we want: 0,1,...,N-1.
530- return map (x, fn, std::make_index_sequence<N>{});
531- }
532- return { fn (x[I])... };
// The index_sequence parameter is unnamed: it exists only so that I... can be deduced.
512+ SI auto map (std::index_sequence<I...>,
513+ Fn&& fn, const Args&... args) -> skvx::Vec<sizeof...(I), decltype(fn(args[0 ]...))> {
// lane(i) calls fn with element i of every argument; expanding lane(I)... fills the result lanes.
514+ auto lane = [&](size_t i) { return fn (args[i]...); };
515+ return { lane (I)... };
516+ }
517+
// Entry-point overload: map(fn, first, rest...).
// N is deduced from the first Vec argument only; the Rest types are unconstrained in this
// signature, so nothing here checks that the remaining arguments also have N lanes.
// fn is passed on by name (as an lvalue) to the index_sequence overload.
518+ template <typename Fn, int N, typename T, typename ... Rest>
519+ auto map (Fn&& fn, const Vec<N,T>& first, const Rest&... rest) {
520+ // Derive an {0...N-1} index_sequence from the size of the first arg: N lanes in, N lanes out.
521+ return map (std::make_index_sequence<N>{}, fn, first,rest...);
533522}
534523
// Unary float math wrappers: each one hands a C float math function (atanf, ceilf, ...) to map()
// together with the input vector and returns a Vec<N,float>.
// The '-' lines use the old argument order map(x, fn); the '+' lines use the new order map(fn, x).
// Note that abs() is implemented with fabsf.
535- SIN Vec<N,float > atan (const Vec<N,float >& x) { return map (x, atanf); }
536- SIN Vec<N,float > ceil (const Vec<N,float >& x) { return map (x, ceilf); }
537- SIN Vec<N,float > floor (const Vec<N,float >& x) { return map (x, floorf); }
538- SIN Vec<N,float > trunc (const Vec<N,float >& x) { return map (x, truncf); }
539- SIN Vec<N,float > round (const Vec<N,float >& x) { return map (x, roundf); }
540- SIN Vec<N,float > sqrt (const Vec<N,float >& x) { return map (x, sqrtf); }
541- SIN Vec<N,float > abs (const Vec<N,float >& x) { return map (x, fabsf); }
542- SIN Vec<N,float > sin (const Vec<N,float >& x) { return map (x, sinf); }
543- SIN Vec<N,float > cos (const Vec<N,float >& x) { return map (x, cosf); }
544- SIN Vec<N,float > tan (const Vec<N,float >& x) { return map (x, tanf); }
524+ SIN Vec<N,float > atan (const Vec<N,float >& x) { return map ( atanf, x); }
525+ SIN Vec<N,float > ceil (const Vec<N,float >& x) { return map ( ceilf, x); }
526+ SIN Vec<N,float > floor (const Vec<N,float >& x) { return map (floorf, x); }
527+ SIN Vec<N,float > trunc (const Vec<N,float >& x) { return map (truncf, x); }
528+ SIN Vec<N,float > round (const Vec<N,float >& x) { return map (roundf, x); }
529+ SIN Vec<N,float > sqrt (const Vec<N,float >& x) { return map ( sqrtf, x); }
530+ SIN Vec<N,float > abs (const Vec<N,float >& x) { return map ( fabsf, x); }
531+ SIN Vec<N,float > sin (const Vec<N,float >& x) { return map ( sinf, x); }
532+ SIN Vec<N,float > cos (const Vec<N,float >& x) { return map ( cosf, x); }
533+ SIN Vec<N,float > tan (const Vec<N,float >& x) { return map ( tanf, x); }
// pow(x, y) for float vectors: passes powf and both vectors to the two-input form of map().
// NOTE(review): this overload takes Vec<N,float> only, and both operands must be vectors. If any
// caller used pow() with a non-float element type or with a scalar operand, confirm it still
// compiles against this signature.
534+ SIN Vec<N,float > pow (const Vec<N,float >& x,
535+ const Vec<N,float >& y) { return map (powf, x,y); }
// fma(x, y, z) for float vectors: passes a lambda that calls fmaf(x,y,z), plus the three vectors,
// to the three-input form of map().
// NOTE(review): this is a generic path through map() with no target-specific intrinsics in it;
// if fused multiply-add throughput matters on AVX2 or aarch64, confirm the generated code.
536+ SIN Vec<N,float > fma (const Vec<N,float >& x,
537+ const Vec<N,float >& y,
538+ const Vec<N,float >& z) {
539+ // I don't understand why Clang's codegen is terrible if we write map(fmaf, x,y,z) directly.
// The lambda parameters x, y, z are scalars that shadow the vector parameters of fma().
540+ auto fn = [](float x, float y, float z) { return fmaf (x,y,z); };
541+ return map (fn, x,y,z);
542+ }
545543
546544SI Vec<1 ,int > lrint (const Vec<1 ,float >& x) {
547545 return (int )lrintf (x.val );
@@ -704,27 +702,6 @@ SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y
704702 }
705703 #endif
706704
707- #if defined(__AVX2__)
708- SI Vec<4 ,float > fma (const Vec<4 ,float >& x, const Vec<4 ,float >& y, const Vec<4 ,float >& z) {
709- return bit_pun<Vec<4 ,float >>(_mm_fmadd_ps (bit_pun<__m128>(x),
710- bit_pun<__m128>(y),
711- bit_pun<__m128>(z)));
712- }
713-
714- SI Vec<8 ,float > fma (const Vec<8 ,float >& x, const Vec<8 ,float >& y, const Vec<8 ,float >& z) {
715- return bit_pun<Vec<8 ,float >>(_mm256_fmadd_ps (bit_pun<__m256>(x),
716- bit_pun<__m256>(y),
717- bit_pun<__m256>(z)));
718- }
719- #elif defined(__aarch64__)
720- SI Vec<4 ,float > fma (const Vec<4 ,float >& x, const Vec<4 ,float >& y, const Vec<4 ,float >& z) {
721- // These instructions tend to work like z += xy, so the order here is z,x,y.
722- return bit_pun<Vec<4 ,float >>(vfmaq_f32 (bit_pun<float32x4_t >(z),
723- bit_pun<float32x4_t >(x),
724- bit_pun<float32x4_t >(y)));
725- }
726- #endif
727-
728705#endif // !defined(SKNX_NO_SIMD)
729706
730707} // namespace skvx
0 commit comments