@@ -2851,6 +2851,8 @@ dl_sum(double a, double b)
28512851 return (DoubleLength ) {x , y };
28522852}
28532853
2854+ #ifndef UNRELIABLE_FMA
2855+
28542856static DoubleLength
28552857dl_mul (double x , double y )
28562858{
@@ -2860,6 +2862,47 @@ dl_mul(double x, double y)
28602862 return (DoubleLength ) {z , zz };
28612863}
28622864
2865+ #else
2866+
2867+ /*
2868+ The default implementation of dl_mul() depends on the C math library
2869+ having an accurate fma() function as required by § 7.12.13.1 of the
2870+ C99 standard.
2871+
2872+ The UNRELIABLE_FMA option is provided as a slower but accurate
2873+ alternative for builds where the fma() function is found wanting.
2874+ The speed penalty may be modest (17% slower on an Apple M1 Max),
2875+ so don't hesitate to enable this build option.
2876+
2877+ The algorithms are from the T. J. Dekker paper:
2878+ A Floating-Point Technique for Extending the Available Precision
2879+ https://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf
2880+ */
2881+
2882+ static DoubleLength
2883+ dl_split (double x ) {
2884+ // Dekker (5.5) and (5.6).
2885+ double t = x * 134217729.0 ; // Veltkamp constant = 2.0 ** 27 + 1
2886+ double hi = t - (t - x );
2887+ double lo = x - hi ;
2888+ return (DoubleLength ) {hi , lo };
2889+ }
2890+
2891+ static DoubleLength
2892+ dl_mul (double x , double y )
2893+ {
2894+ // Dekker (5.12) and mul12()
2895+ DoubleLength xx = dl_split (x );
2896+ DoubleLength yy = dl_split (y );
2897+ double p = xx .hi * yy .hi ;
2898+ double q = xx .hi * yy .lo + xx .lo * yy .hi ;
2899+ double z = p + q ;
2900+ double zz = p - z + q + xx .lo * yy .lo ;
2901+ return (DoubleLength ) {z , zz };
2902+ }
2903+
2904+ #endif
2905+
28632906typedef struct { double hi ; double lo ; double tiny ; } TripleLength ;
28642907
28652908static const TripleLength tl_zero = {0.0 , 0.0 , 0.0 };
0 commit comments