Skip to content

Commit 13ed177

Browse files
committed
Fix validation tolerance: use relative error instead of absolute
The validation was using an absolute error tolerance (1e-8), which fails for large matrix multiplication results (magnitude ~1e4). This caused false negatives where COSMA computed correct results but failed validation.

Changes:
- Switch from absolute error to relative error for validation
- Use 1e-5 tolerance for float32 (appropriate for single precision)
- Use 1e-8 tolerance for float64 (appropriate for double precision)
- Handle small values near zero with absolute error fallback

This fixes issue #153, where the K-split strategy was incorrectly reported as producing 93.6% errors when actual relative errors were < 1e-6.

Tested with:
- 32x896x896 float32: now passes (was 93.8% false errors)
- 32x10000x896 float32: now passes (was 93.6% false errors)
- 32x32x32 float64: still passes (regression test)
1 parent a516444 commit 13ed177

File tree

1 file changed

+28
-9
lines changed

1 file changed

+28
-9
lines changed

utils/cosma_utils.hpp

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -333,23 +333,38 @@ bool test_cosma(Strategy s,
333333
// Now Check result
334334
isOK = globCcheck.size() == globC.size();
335335
for (int i = 0; i < globC.size(); ++i) {
336-
isOK = isOK && (std::abs(globC[i] - globCcheck[i]) < epsilon);
336+
// Use relative error for large values, absolute error for small values
337+
double abs_error = std::abs(globC[i] - globCcheck[i]);
338+
double scale = std::max(std::abs(globC[i]), std::abs(globCcheck[i]));
339+
double rel_error = (scale > 1e-10) ? abs_error / scale : abs_error;
340+
// For float32, a relative error tolerance of 1e-5 is used
341+
// For float64, the existing epsilon is reused as the relative error tolerance
342+
double tolerance = (sizeof(Scalar) == 4) ? 1e-5 : epsilon;
343+
isOK = isOK && (rel_error < tolerance);
337344
}
338345

339346
if (!isOK) {
340347
std::cout << "Result is NOT OK" << std::endl;
348+
int error_count = 0;
349+
const int MAX_ERRORS_TO_PRINT = 20;
341350
for (int i = 0; i < m * n; i++) {
342351
if (globCcheck[i] != globC[i]) {
343-
int x = i % m;
344-
int y = i / m;
345-
int locidx, rank;
346-
std::tie(locidx, rank) = C.local_coordinates(x, y);
347-
std::cout << "global(" << x << ", " << y
348-
<< ") = (loc = " << locidx << ", rank = " << rank
349-
<< ") = " << globC.at(i) << " and should be "
350-
<< globCcheck.at(i) << std::endl;
352+
error_count++;
353+
if (error_count <= MAX_ERRORS_TO_PRINT) {
354+
int x = i % m;
355+
int y = i / m;
356+
int locidx, rank;
357+
std::tie(locidx, rank) = C.local_coordinates(x, y);
358+
std::cout << "global(" << x << ", " << y
359+
<< ") = (loc = " << locidx << ", rank = " << rank
360+
<< ") = " << globC.at(i) << " and should be "
361+
<< globCcheck.at(i) << " (diff = "
362+
<< std::abs(globC.at(i) - globCcheck.at(i)) << ")" << std::endl;
363+
}
351364
}
352365
}
366+
std::cout << "Total errors: " << error_count << " out of " << (m * n) << " elements ("
367+
<< (100.0 * error_count / (m * n)) << "%)" << std::endl;
353368
}
354369
else {
355370
std::cout <<"Result is OK"<<std::endl;
@@ -376,5 +391,9 @@ bool test_cosma(Strategy s,
376391
MPI_Barrier(comm);
377392
}
378393
#endif // DEBUG
394+
395+
// Synchronize all ranks before returning to prevent hangs
396+
MPI_Barrier(comm);
397+
379398
return rank > 0 || isOK;
380399
}

0 commit comments

Comments
 (0)