Add COMPLEX_RETSTYLE_FNDA for Windows x64

Windows x64 automatically forces return values onto the stack if they are larger than 64 bits wide [0]. This causes return values from e.g. `zdotc` to be written through a hidden first argument, but not the return values from e.g. `cdotc`. To address this, we add a new complex return style, "Float Normal, Double Argument" (FNDA), to specify that `complex float`-returning functions use the normal return style, whereas `complex double`-returning functions use the argument return style. This should fix JuliaLinearAlgebra/BLISBLAS.jl#15.

[0] https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
JuliaLinearAlgebra · Jun 12, 2024 · 516e47b · 516e47b
1 parent 1a3c7f2
commit 516e47b
Show file tree

Hide file tree

Showing 7 changed files with 151 additions and 51 deletions.
diff --git a/src/Make.inc b/src/Make.inc
@@ -24,7 +24,7 @@ ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS))$(findstring CYGWIN,$(O
 endif
 
 LBT_SOVERSION_MAJOR := 5
-LBT_SOVERSION_MINOR := 4
+LBT_SOVERSION_MINOR := 5
 LBT_SOVERSION_PATCH := 0
 
 ifeq ($(OS), WINNT)

diff --git a/src/autodetection.c b/src/autodetection.c
@@ -214,6 +214,9 @@ int32_t autodetect_complex_return_style(void * handle, const char * suffix) {
     if (env_lowercase_match("LBT_FORCE_RETSTYLE", "argument")) {
         return LBT_COMPLEX_RETSTYLE_ARGUMENT;
     }
+    if (env_lowercase_match("LBT_FORCE_RETSTYLE", "fnda")) {
+        return LBT_COMPLEX_RETSTYLE_FNDA;
+    }
     char symbol_name[MAX_SYMBOL_LEN];
 
     build_symbol_name(symbol_name, "zdotc_", suffix);
@@ -222,37 +225,84 @@ int32_t autodetect_complex_return_style(void * handle, const char * suffix) {
         return LBT_COMPLEX_RETSTYLE_UNKNOWN;
     }
 
+    build_symbol_name(symbol_name, "cdotc_", suffix);
+    void * cdotc_addr = lookup_symbol(handle, symbol_name);
+    if (cdotc_addr == NULL) {
+        return LBT_COMPLEX_RETSTYLE_UNKNOWN;
+    }
+
     // Typecast to function pointer for easier usage below
     double complex (*zdotc_normal)(                  int64_t *, double complex *, int64_t *, double complex *, int64_t *) = zdotc_addr;
     void           (*zdotc_retarg)(double complex *, int64_t *, double complex *, int64_t *, double complex *, int64_t *) = zdotc_addr;
 
+    // Typecast to function pointer for easier usage below
+    float complex (*cdotc_normal)(                 int64_t *, float complex *, int64_t *, float complex *, int64_t *) = cdotc_addr;
+    void          (*cdotc_retarg)(float complex *, int64_t *, float complex *, int64_t *, float complex *, int64_t *) = cdotc_addr;
+
     /*
      * First, check to see if `zdotc` zeros out the first argument if all arguments are zero.
      * Supposedly, most well-behaved implementations will return `0 + 0*I` if the length of
      * the inputs is zero; so if it is using a "return argument", that's a good way to find out.
      * 
-     * We detect this by setting `retval` to an initial value of `0.0 + 1.0*I`.  This has the
-     * added benefit of being interpretable as `0` if looked at as an `int{32,64}_t *`, which
-     * makes this invocation safe across the full normal-return/argument-return vs. lp64/ilp64
-     * compatibility square.
+     * We detect this by seeding the return slot with a sentinel that will be written to:
+     * `0.0 + 1.0*I` for `zdotc` (which reads as `0` if interpreted as an `int{32,64}_t`)
+     * and `-1` typecast to a complex value for `cdotc` (which reads as a negative value,
+     * which is not allowed and should end the routine immediately).  This makes this
+     * invocation safe across the full normal/argument, lp64/ilp64, cdotc/zdotc cube.
      */
-    double complex retval = 0.0 + 1.0*I;
+    double complex retval_double = 0.0 + 1.0*I;
     int64_t zero = 0;
-    double complex zeroc = 0.0 + 0.0*I;
-    zdotc_retarg(&retval, &zero, &zeroc, &zero, &zeroc, &zero);
+    double complex zeroc_double = 0.0 + 0.0*I;
+    zdotc_retarg(&retval_double, &zero, &zeroc_double, &zero, &zeroc_double, &zero);
 
-    if (creal(retval) == 0.0 && cimag(retval) == 0.0) {
-        return LBT_COMPLEX_RETSTYLE_ARGUMENT;
+    /*
+     * Next, do the same with `cdotc`, in order to detect situations where the ABI is
+     * automatically inserting an extra argument to return 128-bit-wide values.
+     * We call this `FNDA` for "Float Normal, Double Argument" style.
+     */
+    int64_t neg1 = -1;
+    float complex retval_float = *(complex float *)(&neg1);
+    float complex zeroc_float = 0.0f + 0.0f*I;
+    cdotc_retarg(&retval_float, &zero, &zeroc_float, &zero, &zeroc_float, &zero);
+
+    if (creal(retval_double) == 0.0 && cimag(retval_double) == 0.0) {
+        // If the double values were reset, and the float values were also,
+        // this is easy, we're just always argument-style:
+        if (creal(retval_float) == 0.0f && cimag(retval_float) == 0.0f) {
+            return LBT_COMPLEX_RETSTYLE_ARGUMENT;
+        }
+
+        // If the float values were not, let's try the normal return style:
+        retval_float = 0.0f + 1.0f*I;
+        retval_float = cdotc_normal(&zero, &zeroc_float, &zero, &zeroc_float, &zero);
+
+
+        // If this works, we are in FNDA style (currently only observed on Windows x64)
+        if (creal(retval_float) == 0.0f && cimag(retval_float) == 0.0f) {
+            return LBT_COMPLEX_RETSTYLE_FNDA;
+        }
+
+        // Otherwise, cdotc is throwing a fit and we don't know what's up.
+        return LBT_COMPLEX_RETSTYLE_UNKNOWN;
     }
 
-    // If it was _not_ reset, let's hazard a guess that we're dealing with a normal return style:
-    retval = 0.0 + 1.0*I;
-    retval = zdotc_normal(&zero, &zeroc, &zero, &zeroc, &zero);
-    if (creal(retval) == 0.0 && cimag(retval) == 0.0) {
+    // If our double values were _not_ reset, let's hazard a guess that
+    // we're dealing with a normal return style and test both types again:
+    retval_double = 0.0 + 1.0*I;
+    retval_double = zdotc_normal(&zero, &zeroc_double, &zero, &zeroc_double, &zero);
+    retval_float = 0.0f + 1.0f*I;
+    retval_float = cdotc_normal(&zero, &zeroc_float, &zero, &zeroc_float, &zero);
+
+
+    // We only test for both working; we don't have a retstyle for float
+    // being argument style and double being normal style.
+    if ((creal(retval_double) == 0.0 && cimag(retval_double) == 0.0) &&
+        (creal(retval_float) == 0.0f && cimag(retval_float) == 0.0f)) {
         return LBT_COMPLEX_RETSTYLE_NORMAL;
     }
 
-    // If that was not reset either, we have no idea what's going on.
+    // If we get here, zdotc and cdotc are being uncooperative and we
+    // do not appreciate it at all, no we don't, my precious.
     return LBT_COMPLEX_RETSTYLE_UNKNOWN;
 }
 #endif // COMPLEX_RETSTYLE_AUTODETECTION

diff --git a/src/cblas_adapters.c b/src/cblas_adapters.c
@@ -73,8 +73,8 @@ void lbt_cblas_cdotc_sub(const int32_t N,
 }
 
 extern float complex cdotc_64_(const int64_t *,
-                                const float complex *, const int64_t *,
-                                const float complex *, const int64_t *);
+                               const float complex *, const int64_t *,
+                               const float complex *, const int64_t *);
 void lbt_cblas_cdotc_sub64_(const int64_t N,
                             const float complex *X, const int64_t incX,
                             const float complex *Y, const int64_t incY,

diff --git a/src/complex_return_style_adapters.c b/src/complex_return_style_adapters.c
@@ -72,8 +72,8 @@ extern void (*cmplxret_cdotc__addr)(float complex * z,
                                     const float complex *, const int32_t *,
                                     const float complex *, const int32_t *);
 float complex cmplxret_cdotc_(const int32_t * N,
-                               const float complex *X, const int32_t * incX,
-                               const float complex *Y, const int32_t * incY)
+                              const float complex *X, const int32_t * incX,
+                              const float complex *Y, const int32_t * incY)
 {
    float complex c;
    cmplxret_cdotc__addr(&c, N, X, incX, Y, incY);
@@ -85,8 +85,8 @@ extern void (*cmplxret_cdotc_64__addr)(float complex * z,
                                        const float complex *, const int64_t *,
                                        const float complex *, const int64_t *);
 float complex cmplxret_cdotc_64_(const int64_t * N,
-                                  const float complex *X, const int64_t * incX,
-                                  const float complex *Y, const int64_t * incY)
+                                 const float complex *X, const int64_t * incX,
+                                 const float complex *Y, const int64_t * incY)
 {
    float complex c;
    cmplxret_cdotc_64__addr(&c, N, X, incX, Y, incY);
@@ -100,8 +100,8 @@ extern void (*cmplxret_cdotu__addr)(float complex * z,
                                     const float complex *, const int32_t *,
                                     const float complex *, const int32_t *);
 float complex cmplxret_cdotu_(const int32_t * N,
-                               const float complex *X, const int32_t * incX,
-                               const float complex *Y, const int32_t * incY)
+                              const float complex *X, const int32_t * incX,
+                              const float complex *Y, const int32_t * incY)
 {
    float complex c;
    cmplxret_cdotu__addr(&c, N, X, incX, Y, incY);
@@ -113,8 +113,8 @@ extern void (*cmplxret_cdotu_64__addr)(float complex * z,
                                        const float complex *, const int64_t *,
                                        const float complex *, const int64_t *);
 float complex cmplxret_cdotu_64_(const int64_t * N,
-                                  const float complex *X, const int64_t * incX,
-                                  const float complex *Y, const int64_t * incY)
+                                 const float complex *X, const int64_t * incX,
+                                 const float complex *Y, const int64_t * incY)
 {
    float complex c;
    cmplxret_cdotu_64__addr(&c, N, X, incX, Y, incY);

diff --git a/src/libblastrampoline.c b/src/libblastrampoline.c
@@ -70,26 +70,30 @@ int32_t set_forward_by_index(int32_t symbol_idx, const void * addr, int32_t inte
     }
 
 #ifdef COMPLEX_RETSTYLE_AUTODETECTION
-    if (complex_retstyle == LBT_COMPLEX_RETSTYLE_ARGUMENT) {
-        // Check to see if this symbol is one of the complex-returning functions
-        for (int complex_symbol_idx=0; cmplxret_func_idxs[complex_symbol_idx] != -1; ++complex_symbol_idx) {
-            // Skip any symbols that aren't ours
-            if (cmplxret_func_idxs[complex_symbol_idx] != symbol_idx)
-                continue;
-
-            // Report to the user that we're cblas-wrapping this one
-            if (verbose) {
-                char exported_name[MAX_SYMBOL_LEN];
-                build_symbol_name(exported_name, exported_func_names[symbol_idx], interface == LBT_INTERFACE_ILP64 ? "64_" : "");
-                printf(" - [%04d] complex(%s)\n", symbol_idx, exported_name);
-            }
+    for (int array_idx=0; array_idx < sizeof(cmplxret_func_idxs)/sizeof(int *); ++array_idx) {
+        if ((complex_retstyle == LBT_COMPLEX_RETSTYLE_ARGUMENT) ||
+           ((complex_retstyle == LBT_COMPLEX_RETSTYLE_FNDA) && array_idx == 1)) {
+            // Check to see if this symbol is one of the complex-returning functions
+            for (int complex_symbol_idx=0; cmplxret_func_idxs[array_idx][complex_symbol_idx] != -1; ++complex_symbol_idx) {
+                printf("array_idx: %d\ncomplex_symbol_idx: %d\n", array_idx, complex_symbol_idx);
+                // Skip any symbols that aren't ours
+                if (cmplxret_func_idxs[array_idx][complex_symbol_idx] != symbol_idx)
+                    continue;
+
+                // Report to the user that we're cmplxret-wrapping this one
+                if (verbose) {
+                    char exported_name[MAX_SYMBOL_LEN];
+                    build_symbol_name(exported_name, exported_func_names[symbol_idx], interface == LBT_INTERFACE_ILP64 ? "64_" : "");
+                    printf(" - [%04d] complex(%s)\n", symbol_idx, exported_name);
+                }
 
-            if (interface == LBT_INTERFACE_LP64) {
-                (*cmplxret_func32_addrs[complex_symbol_idx]) = (*exported_func32_addrs[symbol_idx]);
-                (*exported_func32_addrs[symbol_idx]) = cmplxret32_func_wrappers[complex_symbol_idx];
-            } else {
-                (*cmplxret_func64_addrs[complex_symbol_idx]) = (*exported_func64_addrs[symbol_idx]);
-                (*exported_func64_addrs[symbol_idx]) = cmplxret64_func_wrappers[complex_symbol_idx];
+                if (interface == LBT_INTERFACE_LP64) {
+                    (*cmplxret_func32_addrs[array_idx][complex_symbol_idx]) = (*exported_func32_addrs[symbol_idx]);
+                    (*exported_func32_addrs[symbol_idx]) = cmplxret_func32_wrappers[array_idx][complex_symbol_idx];
+                } else {
+                    (*cmplxret_func64_addrs[array_idx][complex_symbol_idx]) = (*exported_func64_addrs[symbol_idx]);
+                    (*exported_func64_addrs[symbol_idx]) = cmplxret_func64_wrappers[array_idx][complex_symbol_idx];
+                }
             }
         }
     }

diff --git a/src/libblastrampoline.h b/src/libblastrampoline.h
@@ -85,8 +85,13 @@ typedef struct {
 // Possible values for `retstyle` in `lbt_library_info_t`
 // These describe whether a library is using "normal" return value passing (e.g. through
 // the `XMM{0,1}` registers on x86_64, or the `ST{0,1}` floating-point registers on i686)
+// This is further complicated by the fact that on certain platforms (such as Windows x64)
+// this is dependent on the size of the value being returned, e.g. a complex64 value will
+// be returned through registers, but a complex128 value will not.  We therefore have a
+// special value that denotes this situation.
 #define LBT_COMPLEX_RETSTYLE_NORMAL     0
 #define LBT_COMPLEX_RETSTYLE_ARGUMENT   1
+#define LBT_COMPLEX_RETSTYLE_FNDA       2 // "Float Normal, Double Argument"
 #define LBT_COMPLEX_RETSTYLE_UNKNOWN   -1
 
 // Possible values for `cblas` in `lbt_library_info_t`

diff --git a/src/libblastrampoline_complex_retdata.h b/src/libblastrampoline_complex_retdata.h
@@ -11,19 +11,35 @@ COMPLEX128_FUNCS(XX_64)
 // Build mapping from cmplxret-index to `_addr` instance
 #define XX(name, index)    &cmplxret_##name##_addr,
 #define XX_64(name, index) &cmplxret_##name##64__addr,
-const void ** cmplxret_func32_addrs[] = {
+const void ** cmplx64ret_func32_addrs[] = {
     COMPLEX64_FUNCS(XX)
+    NULL
+};
+const void ** cmplx128ret_func32_addrs[] = {
     COMPLEX128_FUNCS(XX)
     NULL
 };
-const void ** cmplxret_func64_addrs[] = {
+const void ** cmplx64ret_func64_addrs[] = {
     COMPLEX64_FUNCS(XX_64)
+    NULL
+};
+const void ** cmplx128ret_func64_addrs[] = {
     COMPLEX128_FUNCS(XX_64)
     NULL
 };
 #undef XX
 #undef XX_64
 
+const void *** cmplxret_func32_addrs[] = {
+    cmplx64ret_func32_addrs,
+    cmplx128ret_func32_addrs
+};
+const void *** cmplxret_func64_addrs[] = {
+    cmplx64ret_func64_addrs,
+    cmplx128ret_func64_addrs
+};
+
+
 
 // Forward-declare some functions
 #define XX(name, index)     extern const void * cmplxret_##name ;
@@ -40,24 +56,49 @@ COMPLEX128_FUNCS(XX_64)
 // locations, allowing a cblas index -> function lookup
 #define XX(name, index)    &cmplxret_##name,
 #define XX_64(name, index) &cmplxret_##name##64_,
-const void ** cmplxret32_func_wrappers[] = {
+const void ** cmplx64ret_func32_wrappers[] = {
     COMPLEX64_FUNCS(XX)
+    NULL
+};
+const void ** cmplx128ret_func32_wrappers[] = {
     COMPLEX128_FUNCS(XX)
     NULL
 };
-const void ** cmplxret64_func_wrappers[] = {
+const void ** cmplx64ret_func64_wrappers[] = {
     COMPLEX64_FUNCS(XX_64)
+    NULL
+};
+const void ** cmplx128ret_func64_wrappers[] = {
     COMPLEX128_FUNCS(XX_64)
     NULL
 };
 #undef XX
 #undef XX_64
 
-// Finally, an array that maps cblas index -> exported symbol index
+const void *** cmplxret_func32_wrappers[] = {
+    cmplx64ret_func32_wrappers,
+    cmplx128ret_func32_wrappers
+};
+const void *** cmplxret_func64_wrappers[] = {
+    cmplx64ret_func64_wrappers,
+    cmplx128ret_func64_wrappers
+};
+
+
+
+// Finally, an array that maps cmplxret index -> exported symbol index
 #define XX(name, index)    index,
-const int cmplxret_func_idxs[] = {
+const int cmplx64ret_func_idxs[] = {
     COMPLEX64_FUNCS(XX)
+    -1
+};
+const int cmplx128ret_func_idxs[] = {
     COMPLEX128_FUNCS(XX)
     -1
 };
-#undef XX
+#undef XX
+
+const int * cmplxret_func_idxs[] = {
+    cmplx64ret_func_idxs,
+    cmplx128ret_func_idxs
+};