Closed
Description
The following code, compiled with `-O3 -march=znver4` (or any other `znver` target), runs around 25% slower on Zen hardware than when compiled with `-O3 -march=x86-64-v4` or the baseline `x86-64`.
/*
 * Trial-division check used as the benchmark kernel.
 *
 * Returns false as soon as some i in [2, lim) divides n, where
 * lim = ceil(n / 2.0); returns true otherwise.
 *
 * Note that this is not a strict primality test:
 *   - every n < 2 (including 0, 1 and negative values) returns true;
 *   - n == 4 returns true, because lim is 2 and the range [2, 2) is empty.
 */
bool check_prime(int64_t n) {
// Values below 2 skip the loop entirely and are reported as true.
if (n < 2) {
return true;
}
// Exclusive upper bound for candidate divisors, computed in floating point.
int64_t lim = (int64_t)ceil((double)n / 2.0);
// The loop body is a single signed 64-bit remainder plus a compare.
for (int64_t i = 2; i < lim; i++) {
if (n % i == 0) {
return false;
}
}
// No divisor found in [2, lim).
return true;
}
Full code
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
/*
 * Trial-division check used as the benchmark kernel.
 *
 * Returns false as soon as some i in [2, lim) divides n, where
 * lim = ceil(n / 2.0); returns true otherwise.
 *
 * Note that this is not a strict primality test:
 *   - every n < 2 (including 0, 1 and negative values) returns true;
 *   - n == 4 returns true, because lim is 2 and the range [2, 2) is empty.
 * The count printed by main therefore includes 0, 1 and 4.
 */
bool check_prime(int64_t n) {
// Values below 2 skip the loop entirely and are reported as true.
if (n < 2) {
return true;
}
// Exclusive upper bound for candidate divisors, computed in floating point.
int64_t lim = (int64_t)ceil((double)n / 2.0);
// The loop body is a single signed 64-bit remainder plus a compare.
for (int64_t i = 2; i < lim; i++) {
if (n % i == 0) {
return false;
}
}
// No divisor found in [2, lim).
return true;
}
/*
 * Benchmark driver: calls check_prime on every i in [0, 1000000), counts
 * how many calls return true, and prints the elapsed clock() time in
 * seconds followed by that count.
 */
int main() {
// Despite its name, `now` is the timestamp taken at the start of the run.
clock_t now = clock();
// Number of inputs for which check_prime returned true.
int sum = 0;
for (int i = 0; i < 1000000; i++) {
if (check_prime(i)) {
sum += 1;
}
}
// Elapsed clock ticks converted to seconds, then the count of true results.
printf("%f, %d\n", (double)(clock() - now) / CLOCKS_PER_SEC, sum);
return 0;
}
Running on a Ryzen 7950X:
> clang.exe -std=c11 -O3 -march=znver4 ./src/perf.c && ./a.exe
24.225000 seconds, 78501
> clang.exe -std=c11 -O3 -march=x86-64-v4 ./src/perf.c && ./a.exe
20.866000 seconds, 78501
> clang.exe -std=c11 -O3 ./src/perf.c && ./a.exe
20.819000 seconds, 78501
> clang.exe --version
clang version 18.1.4
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: C:\Program Files\LLVM\bin
Disassembly here: https://godbolt.org/z/orssnKP74
I originally noticed the issue with Rust: https://godbolt.org/z/Kh1v3G74K