Very Poor Multi-Threaded Performance of Hash Method APIs #93

@haxelion

Description

Hi,

While debugging severe performance issues in one of our Intel SGX workloads, I traced the problem to specific functions in IPP Cryptography Primitives. In short, there seems to be a severe bottleneck in the hash method family of functions when they are executed in parallel by multiple threads.

First of all, here's a code sample which reproduces the issue with ippcp v1.0.1:

#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <vector>
#include "ippcp.h"

uint64_t benchmark_sha256(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};


    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsHashMessage_rmf(hash, sizeof(hash), dest, ippsHashMethod_SHA256_TT());
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    uint64_t time = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
    printf("SHA256 result: ");
    for(size_t i = 0; i < 32; i++) {
        printf("%02x", hash[i]);
    }
    printf(" in %lu ns\n", time);
    return time;
}

void usage(const char* argv0) {
    fprintf(stderr, "Usage: %s <number of threads> <number of iterations>\n", argv0);
}

int main(int argc, char** argv) {
    std::vector<std::thread> threads;

    if(argc < 3) {
        usage(argv[0]);
        return -1;
    }

    size_t num_threads = atoi(argv[1]);
    uint64_t num_iterations = atoi(argv[2]);
    threads.reserve(num_threads);

    for(size_t i = 0; i < num_threads; i++) {
        threads.push_back(std::thread(benchmark_sha256, num_iterations));
    }

    for(auto& thread : threads) {
        thread.join();
    }

    return 0;
}

This code simply computes a set number of chained SHA-256 iterations on a set number of threads.
Executed on a machine with two Xeon Silver 4310 CPUs (24 physical cores total, HT disabled), we would expect 24 threads to take roughly the same amount of time as 1 thread (i.e. performance to scale linearly). However, what we observe is a severe per-thread slowdown as the number of threads increases:

$ time ./main 1 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 92855596 ns

real    0m0.095s
user    0m0.093s
sys     0m0.001s
$ time ./main 2 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 258089852 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 290812058 ns

real    0m0.292s
user    0m0.548s
sys     0m0.002s
$ time ./main 4 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 720734061 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 732083234 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 757913301 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 758453090 ns

real    0m0.760s
user    0m2.965s
sys     0m0.001s
$ time ./main 24 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 3543314846 ns
# many more similar lines
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 3750203957 ns

real    0m3.752s
user    1m27.931s
sys     0m0.002s
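
To help narrow this down, a useful variant (just a sketch, not something measured in the numbers above) is to look up the hash method pointer once per thread instead of once per iteration. If the per-call ippsHashMethod_SHA256_TT() dispatch is the contention point, this version should scale much better; if it still degrades, the contention is inside ippsHashMessage_rmf itself. It assumes the same includes as the sample above:

uint64_t benchmark_sha256_cached_method(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};
    // Assumption under test: resolve the CPU-feature-dispatched method once,
    // instead of calling ippsHashMethod_SHA256_TT() on every iteration.
    const IppsHashMethod* method = ippsHashMethod_SHA256_TT();

    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsHashMessage_rmf(hash, sizeof(hash), dest, method);
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    return (end.tv_sec - start.tv_sec) * 1000000000ull + (end.tv_nsec - start.tv_nsec);
}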

I've traced the issue down to the change of the hashing function API that happened in the 2020 Update 1 release: https://www.intel.com/content/www/us/en/docs/ipp-crypto/developer-guide-reference/2021-9/removed-functions.html#REMOVED-FROM-2020-UPDATE1

Below is a sample which can be compiled against IPP Cryptography Primitives 2021.12.1 (the same version used by the Intel SGX SDK) and allows running the same workload with either the now-removed ippsSHA256MessageDigest API or the new ippsHashMessage_rmf API:

#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <vector>
#include "ippcp.h"

uint64_t benchmark_sha256_old(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};


    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsSHA256MessageDigest(hash, sizeof(hash), dest);
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    uint64_t time = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
    printf("SHA256 result: ");
    for(size_t i = 0; i < 32; i++) {
        printf("%02x", hash[i]);
    }
    printf(" in %lu ns\n", time);
    return time;
}

uint64_t benchmark_sha256_new(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};


    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsHashMessage_rmf(hash, sizeof(hash), dest, ippsHashMethod_SHA256_TT());
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    uint64_t time = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
    printf("SHA256 result: ");
    for(size_t i = 0; i < 32; i++) {
        printf("%02x", hash[i]);
    }
    printf(" in %lu ns\n", time);
    return time;
}

void usage(const char* argv0) {
    fprintf(stderr, "Usage: %s <old|new> <number of threads> <number of iterations>\n", argv0);
}

int main(int argc, char** argv) {
    std::vector<std::thread> threads;
    bool old = false;

    if(argc < 4) {
        usage(argv[0]);
        return -1;
    }

    if(strcmp(argv[1], "new") == 0) {
        old = false;
    } else if(strcmp(argv[1], "old") == 0) {
        old = true;
    } else {
        usage(argv[0]);
        return -1;
    }

    size_t num_threads = atoi(argv[2]);
    uint64_t num_iterations = atoi(argv[3]);
    threads.reserve(num_threads);

    for(size_t i = 0; i < num_threads; i++) {
        if(old) {
            threads.push_back(std::thread(benchmark_sha256_old, num_iterations));
        } else {
            threads.push_back(std::thread(benchmark_sha256_new, num_iterations));
        }
    }

    for(auto& thread : threads) {
        thread.join();
    }

    return 0;
}

When executed with the old API, the workload scales without any issue. However, executing it with the new API yields the same result as with v1.0.1:

$ time ./main old 24 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 91790300 ns
# many more similar lines
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 106826749 ns

real    0m0.109s
user    0m2.206s
sys     0m0.012s
$ time ./main new 24 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 2468927587 ns
# many more similar lines
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 3432265027 ns

real    0m3.434s
user    1m10.501s
sys     0m0.002s

This leads me to believe there is a serious multi-threading bottleneck in the way the new hash method APIs were implemented.
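
For completeness, the same chained-hash loop can also be expressed through the streaming state-based interface (ippsHashGetSize_rmf / ippsHashInit_rmf / ippsHashUpdate_rmf / ippsHashFinal_rmf) with a private per-thread state and the plain ippsHashMethod_SHA256() getter. This is only a sketch reusing the includes from the samples above, but comparing it against ippsHashMessage_rmf should show whether the contention is specific to the single-shot path or to the rmf dispatch in general:

uint64_t benchmark_sha256_streaming(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};
    int state_size = 0;

    // Allocate a per-thread hash state so no state is shared between threads.
    ippsHashGetSize_rmf(&state_size);
    IppsHashState_rmf* state = (IppsHashState_rmf*)malloc(state_size);

    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsHashInit_rmf(state, ippsHashMethod_SHA256());
        ippsHashUpdate_rmf(hash, sizeof(hash), state);
        ippsHashFinal_rmf(dest, state);
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    free(state);
    return (end.tv_sec - start.tv_sec) * 1000000000ull + (end.tv_nsec - start.tv_nsec);
}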
