@@ -2832,7 +2832,7 @@ long_add_would_overflow(long a, long b)
28322832}
28332833
28342834/*
2835- Double length extended precision floating point arithmetic
2835+ Double and triple length extended precision floating point arithmetic
28362836based on ideas from three sources:
28372837
28382838 Improved Kahan–Babuška algorithm by Arnold Neumaier
@@ -2845,22 +2845,22 @@ based on ideas from three sources:
28452845 Ultimately Fast Accurate Summation by Siegfried M. Rump
28462846 https://www.tuhh.de/ti3/paper/rump/Ru08b.pdf
28472847
2848- The double length routines allow for quite a bit of instruction
2849- level parallelism. On a 3.22 Ghz Apple M1 Max, the incremental
2850- cost of increasing the input vector size by one is 6.0 nsec .
2848+ Double length functions:
2849+ * dl_split() exact split of a C double into two half precision components.
2850+ * dl_mul() exact multiplication of two C doubles .
28512851
2852- dl_zero() returns an extended precision zero
2853- dl_split() exactly splits a double into two half precision components.
2854- dl_add() performs compensated summation to keep a running total.
2855- dl_mul() implements lossless multiplication of doubles.
2856- dl_fma() implements an extended precision fused-multiply-add.
2857- dl_to_d() converts from extended precision to double precision.
2852+ Triple length functions and constant:
2853+ * tl_zero is a triple length zero for starting or resetting an accumulation.
2854+ * tl_add() compensated addition of a C double to a triple length number.
2855+ * tl_fma() performs a triple length fused-multiply-add.
2856+ * tl_to_d() converts from triple length number back to a C double.
28582857
28592858*/
28602859
28612860typedef struct { double hi ; double lo ; } DoubleLength ;
2861+ typedef struct { double hi ; double lo ; double tiny ; } TripleLength ;
28622862
2863- static const DoubleLength dl_zero = {0.0 , 0.0 };
2863+ static const TripleLength tl_zero = {0.0 , 0.0 , 0.0 };
28642864
28652865static inline DoubleLength
28662866twosum (double a , double b )
@@ -2874,11 +2874,20 @@ twosum(double a, double b)
28742874 return (DoubleLength ) {s , t };
28752875}
28762876
2877- static inline DoubleLength
2878- dl_add ( DoubleLength total , double x )
2877+ static inline TripleLength
2878+ tl_add ( TripleLength total , double x )
28792879{
2880- DoubleLength s = twosum (total .hi , x );
2881- return (DoubleLength ) {s .hi , total .lo + s .lo };
2880+ /* Input: x total.hi total.lo total.tiny
2881+ |--- twosum ---|
2882+ s.hi s.lo
2883+ |--- twosum ---|
2884+ t.hi t.lo
2885+ |--- single sum ---|
2886+ Output: s.hi t.hi tiny
2887+ */
2888+ DoubleLength s = twosum (x , total .hi );
2889+ DoubleLength t = twosum (s .lo , total .lo );
2890+ return (TripleLength ) {s .hi , t .hi , t .lo + total .tiny };
28822891}
28832892
28842893static inline DoubleLength
@@ -2902,18 +2911,18 @@ dl_mul(double x, double y)
29022911 return (DoubleLength ) {z , zz };
29032912}
29042913
2905- static inline DoubleLength
2906- dl_fma ( DoubleLength total , double p , double q )
2914+ static inline TripleLength
2915+ tl_fma ( TripleLength total , double p , double q )
29072916{
29082917 DoubleLength product = dl_mul (p , q );
2909- total = dl_add (total , product .hi );
2910- return dl_add (total , product .lo );
2918+ total = tl_add (total , product .hi );
2919+ return tl_add (total , product .lo );
29112920}
29122921
29132922static inline double
2914- dl_to_d ( DoubleLength total )
2923+ tl_to_d ( TripleLength total )
29152924{
2916- return total .hi + total .lo ;
2925+ return total .tiny + total .lo + total . hi ;
29172926}
29182927
29192928/*[clinic input]
@@ -2944,7 +2953,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
29442953 bool int_path_enabled = true, int_total_in_use = false;
29452954 bool flt_path_enabled = true, flt_total_in_use = false;
29462955 long int_total = 0 ;
2947- DoubleLength flt_total = dl_zero ;
2956+ TripleLength flt_total = tl_zero ;
29482957
29492958 p_it = PyObject_GetIter (p );
29502959 if (p_it == NULL ) {
@@ -3079,7 +3088,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
30793088 } else {
30803089 goto finalize_flt_path ;
30813090 }
3082- DoubleLength new_flt_total = dl_fma (flt_total , flt_p , flt_q );
3091+ TripleLength new_flt_total = tl_fma (flt_total , flt_p , flt_q );
30833092 if (isfinite (new_flt_total .hi )) {
30843093 flt_total = new_flt_total ;
30853094 flt_total_in_use = true;
@@ -3093,7 +3102,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
30933102 // We're finished, overflowed, have a non-float, or got a non-finite value
30943103 flt_path_enabled = false;
30953104 if (flt_total_in_use ) {
3096- term_i = PyFloat_FromDouble (dl_to_d (flt_total ));
3105+ term_i = PyFloat_FromDouble (tl_to_d (flt_total ));
30973106 if (term_i == NULL ) {
30983107 goto err_exit ;
30993108 }
@@ -3104,7 +3113,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
31043113 Py_SETREF (total , new_total );
31053114 new_total = NULL ;
31063115 Py_CLEAR (term_i );
3107- flt_total = dl_zero ;
3116+ flt_total = tl_zero ;
31083117 flt_total_in_use = false;
31093118 }
31103119 }
0 commit comments