-
Notifications
You must be signed in to change notification settings - Fork 96
Description
Hi,
While debugging severe performance issues in one of our Intel SGX workloads, I traced the issue to specific functions in IPP Cryptography Primitives. In short, there seems to be a severe bottleneck in the hash method family when executed in parallel by multiple threads.
First of all, here is sample code which reproduces this issue with ippcp v1.0.1:
#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <vector>
#include "ippcp.h"
uint64_t benchmark_sha256(uint64_t n) {
timespec start, end;
Ipp8u hash[32] = {0};
Ipp8u dest[32] = {0};
clock_gettime(CLOCK_MONOTONIC, &start);
for(uint64_t i = 0; i < n; i++) {
ippsHashMessage_rmf(hash, sizeof(hash), dest, ippsHashMethod_SHA256_TT());
memcpy(hash, dest, sizeof(hash));
}
clock_gettime(CLOCK_MONOTONIC, &end);
uint64_t time = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
printf("SHA256 result: ");
for(size_t i = 0; i < 32; i++) {
printf("%02x", hash[i]);
}
printf(" in %lu ns\n", time);
return time;
}
/**
 * Writes the command-line synopsis to stderr.
 *
 * @param argv0 Program name substituted into the synopsis.
 */
void usage(const char* argv0) {
    static const char kSynopsis[] =
        "Usage: %s <number of threads> <number of iterations>\n";
    std::fprintf(stderr, kSynopsis, argv0);
}
/**
 * Entry point: `<program> <number of threads> <number of iterations>`.
 *
 * Starts the requested number of threads, each running benchmark_sha256 with
 * the requested iteration count, and waits for all of them.
 *
 * @return 0 on success, -1 (after printing the usage text) when the arguments
 *         are missing or are not plain non-negative decimal numbers.
 */
int main(int argc, char** argv) {
    if (argc < 3) {
        usage(argv[0]);
        return -1;
    }

    // Strict decimal parser. atoi() would silently map garbage to 0 and turn a
    // negative count into a huge unsigned value once converted.
    const auto parse_count = [](const char* text, uint64_t& value) -> bool {
        const size_t length = strlen(text);
        // Up to 19 digits always fits in 64 bits (10^19 - 1 < 2^64).
        if (length == 0 || length > 19) {
            return false;
        }
        for (size_t i = 0; i < length; i++) {
            if (text[i] < '0' || text[i] > '9') {
                return false;
            }
        }
        value = strtoull(text, nullptr, 10);
        return true;
    };

    std::vector<std::thread> threads;
    uint64_t num_threads = 0;
    uint64_t num_iterations = 0;
    if (!parse_count(argv[1], num_threads) ||
        !parse_count(argv[2], num_iterations) ||
        num_threads > threads.max_size()) {
        usage(argv[0]);
        return -1;
    }

    threads.reserve(static_cast<size_t>(num_threads));
    for (uint64_t i = 0; i < num_threads; i++) {
        // Construct the thread in place instead of moving a temporary in.
        threads.emplace_back(benchmark_sha256, num_iterations);
    }
    for (auto& thread : threads) {
        thread.join();
    }
    return 0;
}
This code simply computes a set number of SHA-256 iterations on a set number of threads.
Executed on a machine with 2 Xeon Silver 4310 CPUs (24 physical cores total, HT disabled), we would expect 24 threads to take roughly the same amount of time as 1 thread (i.e. performance to scale linearly). However, what we observe is a severe per-thread slowdown as the number of threads increases:
$ time ./main 1 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 92855596 ns
real 0m0.095s
user 0m0.093s
sys 0m0.001s
$ time ./main 2 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 258089852 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 290812058 ns
real 0m0.292s
user 0m0.548s
sys 0m0.002s
$ time ./main 4 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 720734061 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 732083234 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 757913301 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 758453090 ns
real 0m0.760s
user 0m2.965s
sys 0m0.001s
$ time ./main 24 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 3543314846 ns
# many more similar lines
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 3750203957 ns
real 0m3.752s
user 1m27.931s
sys 0m0.002s
I've traced the issue to the change of the hashing function API that was made in the 2020 update: https://www.intel.com/content/www/us/en/docs/ipp-crypto/developer-guide-reference/2021-9/removed-functions.html#REMOVED-FROM-2020-UPDATE1
Below is a sample which can be compiled using IPP Cryptography Primitives 2021.12.1 (the same version used by the Intel SGX SDK) and which allows executing the same workload with either the now-removed ippsSHA256MessageDigest API or the new ippsHashMessage_rmf API:
#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <vector>
#include "ippcp.h"
uint64_t benchmark_sha256_old(uint64_t n) {
timespec start, end;
Ipp8u hash[32] = {0};
Ipp8u dest[32] = {0};
clock_gettime(CLOCK_MONOTONIC, &start);
for(uint64_t i = 0; i < n; i++) {
ippsSHA256MessageDigest(hash, sizeof(hash), dest);
memcpy(hash, dest, sizeof(hash));
}
clock_gettime(CLOCK_MONOTONIC, &end);
uint64_t time = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
printf("SHA256 result: ");
for(size_t i = 0; i < 32; i++) {
printf("%02x", hash[i]);
}
printf(" in %lu ns\n", time);
return time;
}
uint64_t benchmark_sha256_new(uint64_t n) {
timespec start, end;
Ipp8u hash[32] = {0};
Ipp8u dest[32] = {0};
clock_gettime(CLOCK_MONOTONIC, &start);
for(uint64_t i = 0; i < n; i++) {
ippsHashMessage_rmf(hash, sizeof(hash), dest, ippsHashMethod_SHA256_TT());
memcpy(hash, dest, sizeof(hash));
}
clock_gettime(CLOCK_MONOTONIC, &end);
uint64_t time = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
printf("SHA256 result: ");
for(size_t i = 0; i < 32; i++) {
printf("%02x", hash[i]);
}
printf(" in %lu ns\n", time);
return time;
}
/**
 * Writes the command-line synopsis to stderr.
 *
 * @param argv0 Program name substituted into the synopsis.
 */
void usage(const char* argv0) {
    static const char kSynopsis[] =
        "Usage: %s <old|new> <number of threads> <number of iterations>\n";
    std::fprintf(stderr, kSynopsis, argv0);
}
/**
 * Entry point: `<program> <old|new> <number of threads> <number of iterations>`.
 *
 * Starts the requested number of threads, each running either
 * benchmark_sha256_old or benchmark_sha256_new with the requested iteration
 * count, and waits for all of them.
 *
 * @return 0 on success, -1 (after printing the usage text) when the arguments
 *         are missing, the mode is neither "old" nor "new", or a count is not
 *         a plain non-negative decimal number.
 */
int main(int argc, char** argv) {
    if (argc < 4) {
        usage(argv[0]);
        return -1;
    }

    // Pick the benchmark once instead of branching for every spawned thread.
    uint64_t (*worker)(uint64_t) = nullptr;
    if (strcmp(argv[1], "new") == 0) {
        worker = benchmark_sha256_new;
    } else if (strcmp(argv[1], "old") == 0) {
        worker = benchmark_sha256_old;
    } else {
        usage(argv[0]);
        return -1;
    }

    // Strict decimal parser. atoi() would silently map garbage to 0 and turn a
    // negative count into a huge unsigned value once converted.
    const auto parse_count = [](const char* text, uint64_t& value) -> bool {
        const size_t length = strlen(text);
        // Up to 19 digits always fits in 64 bits (10^19 - 1 < 2^64).
        if (length == 0 || length > 19) {
            return false;
        }
        for (size_t i = 0; i < length; i++) {
            if (text[i] < '0' || text[i] > '9') {
                return false;
            }
        }
        value = strtoull(text, nullptr, 10);
        return true;
    };

    std::vector<std::thread> threads;
    uint64_t num_threads = 0;
    uint64_t num_iterations = 0;
    if (!parse_count(argv[2], num_threads) ||
        !parse_count(argv[3], num_iterations) ||
        num_threads > threads.max_size()) {
        usage(argv[0]);
        return -1;
    }

    threads.reserve(static_cast<size_t>(num_threads));
    for (uint64_t i = 0; i < num_threads; i++) {
        // Construct the thread in place instead of moving a temporary in.
        threads.emplace_back(worker, num_iterations);
    }
    for (auto& thread : threads) {
        thread.join();
    }
    return 0;
}
When executed with the old API, the workload scales without any issues. However, executing it with the new API yields the same result as with v1.0.1:
$ time ./main old 24 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 91790300 ns
# many more similar lines
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 106826749 ns
real 0m0.109s
user 0m2.206s
sys 0m0.012s
$ time ./main new 24 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 2468927587 ns
# many more similar lines
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 3432265027 ns
real 0m3.434s
user 1m10.501s
sys 0m0.002s
This leads me to believe there is a serious multi-threading bottleneck in the way the new hashing method was implemented.