Fix validation tolerance: use relative error instead of absolute

dbsanfte · dbsanfte · commit 13ed1772e840 · 2025-10-07T10:10:35.000Z
The validation used an absolute error tolerance (1e-8), which fails for large matrix multiplication results (magnitude ~1e4). This caused false failures: COSMA computed correct results but did not pass validation.

Changes:
- Switch from absolute error to relative error for validation
- Use 1e-5 tolerance for float32 (appropriate for single precision)
- Use 1e-8 tolerance for float64 (appropriate for double precision)
- Handle small values near zero with absolute error fallback

This fixes issue #153, where the K-split strategy was incorrectly reported as producing 93.6% errors when the actual relative errors were < 1e-6.

Tested with:
- 32x896x896 float32: now passes (was 93.8% false errors)
- 32x10000x896 float32: now passes (was 93.6% false errors)
- 32x32x32 float64: still passes (regression test)
diff --git a/utils/cosma_utils.hpp b/utils/cosma_utils.hpp
@@ -333,23 +333,38 @@ bool test_cosma(Strategy s,
         // Now Check result
         isOK = globCcheck.size() == globC.size();
         for (int i = 0; i < globC.size(); ++i) {
-            isOK = isOK && (std::abs(globC[i] - globCcheck[i]) < epsilon);
+            // Use relative error for large values, absolute error for small values
+            double abs_error = std::abs(globC[i] - globCcheck[i]);
+            double scale = std::max(std::abs(globC[i]), std::abs(globCcheck[i]));
+            double rel_error = (scale > 1e-10) ? abs_error / scale : abs_error;
+            // 4-byte scalars (float32) are checked against a 1e-5 tolerance;
+            // every other Scalar type keeps the existing epsilon tolerance.
+            double tolerance = (sizeof(Scalar) == 4) ? 1e-5 : epsilon;
+            isOK = isOK && (rel_error < tolerance);
         }
 
         if (!isOK) {
             std::cout << "Result is NOT OK" << std::endl;
+            int error_count = 0;
+            const int MAX_ERRORS_TO_PRINT = 20;
             for (int i = 0; i < m * n; i++) {
                 if (globCcheck[i] != globC[i]) {
-                    int x = i % m;
-                    int y = i / m;
-                    int locidx, rank;
-                    std::tie(locidx, rank) = C.local_coordinates(x, y);
-                    std::cout << "global(" << x << ", " << y
-                              << ") = (loc = " << locidx << ", rank = " << rank
-                              << ") = " << globC.at(i) << " and should be "
-                              << globCcheck.at(i) << std::endl;
+                    error_count++;
+                    if (error_count <= MAX_ERRORS_TO_PRINT) {
+                        int x = i % m;
+                        int y = i / m;
+                        int locidx, rank;
+                        std::tie(locidx, rank) = C.local_coordinates(x, y);
+                        std::cout << "global(" << x << ", " << y
+                                  << ") = (loc = " << locidx << ", rank = " << rank
+                                  << ") = " << globC.at(i) << " and should be "
+                                  << globCcheck.at(i) << " (diff = " 
+                                  << std::abs(globC.at(i) - globCcheck.at(i)) << ")" << std::endl;
+                    }
                 }
             }
+            std::cout << "Total errors: " << error_count << " out of " << (m * n) << " elements ("
+                      << (100.0 * error_count / (m * n)) << "%)" << std::endl;
         }
         else {
             std::cout <<"Result is OK"<<std::endl;
@@ -376,5 +391,9 @@ bool test_cosma(Strategy s,
         MPI_Barrier(comm);
     }
 #endif // DEBUG
+    
+    // Synchronize all ranks before returning to prevent hangs
+    MPI_Barrier(comm);
+    
     return rank > 0 || isOK;
 }