Very Poor Multi-Threaded Performance of Hash Method APIs #93

@haxelion

Description

Hi,

While debugging severe performance issues in one of our Intel SGX workloads, I traced the problem to specific functions in IPP Cryptography Primitives. In short, there seems to be a severe bottleneck in the hash method family of functions when they are executed in parallel by multiple threads.

First of all, here's a code sample which reproduces the issue with ippcp v1.0.1:

#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <vector>
#include "ippcp.h"

uint64_t benchmark_sha256(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};


    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsHashMessage_rmf(hash, sizeof(hash), dest, ippsHashMethod_SHA256_TT());
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    uint64_t time = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
    printf("SHA256 result: ");
    for(size_t i = 0; i < 32; i++) {
        printf("%02x", hash[i]);
    }
    printf(" in %lu ns\n", time);
    return time;
}

void usage(const char* argv0) {
    fprintf(stderr, "Usage: %s <number of threads> <number of iterations>\n", argv0);
}

int main(int argc, char** argv) {
    std::vector<std::thread> threads;

    if(argc < 3) {
        usage(argv[0]);
        return -1;
    }

    size_t num_threads = atoi(argv[1]);
    uint64_t num_iterations = atoi(argv[2]);
    threads.reserve(num_threads);

    for(size_t i = 0; i < num_threads; i++) {
        threads.push_back(std::thread(benchmark_sha256, num_iterations));
    }

    for(auto& thread : threads) {
        thread.join();
    }

    return 0;
}

This code simply computes a set number of chained SHA-256 iterations on a set number of threads.
Executed on a machine with two Xeon Silver 4310 CPUs (24 physical cores total, HT disabled), we would expect 24 threads to take roughly the same amount of time as 1 thread (i.e. performance to scale linearly). However, what we observe is a severe per-thread slowdown as the number of threads increases:

$ time ./main 1 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 92855596 ns

real    0m0.095s
user    0m0.093s
sys     0m0.001s
$ time ./main 2 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 258089852 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 290812058 ns

real    0m0.292s
user    0m0.548s
sys     0m0.002s
$ time ./main 4 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 720734061 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 732083234 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 757913301 ns
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 758453090 ns

real    0m0.760s
user    0m2.965s
sys     0m0.001s
$ time ./main 24 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 3543314846 ns
# many more similar lines
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 3750203957 ns

real    0m3.752s
user    1m27.931s
sys     0m0.002s
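
To help narrow this down, a useful variant (just a sketch, not something measured in the numbers above) is to look up the hash method pointer once per thread instead of once per iteration. If the per-call ippsHashMethod_SHA256_TT() dispatch is the contention point, this version should scale much better; if it still degrades, the contention is inside ippsHashMessage_rmf itself. It assumes the same includes as the sample above:

uint64_t benchmark_sha256_cached_method(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};
    // Assumption under test: resolve the CPU-feature-dispatched method once,
    // instead of calling ippsHashMethod_SHA256_TT() on every iteration.
    const IppsHashMethod* method = ippsHashMethod_SHA256_TT();

    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsHashMessage_rmf(hash, sizeof(hash), dest, method);
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    return (end.tv_sec - start.tv_sec) * 1000000000ull + (end.tv_nsec - start.tv_nsec);
}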

I've traced the issue down to the change of the hashing function API that happened in the 2020 Update 1 release: https://www.intel.com/content/www/us/en/docs/ipp-crypto/developer-guide-reference/2021-9/removed-functions.html#REMOVED-FROM-2020-UPDATE1

Below is a sample which can be compiled against IPP Cryptography Primitives 2021.12.1 (the same version used by the Intel SGX SDK) and allows running the same workload with either the now-removed ippsSHA256MessageDigest API or the new ippsHashMessage_rmf API:

#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <vector>
#include "ippcp.h"

uint64_t benchmark_sha256_old(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};


    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsSHA256MessageDigest(hash, sizeof(hash), dest);
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    uint64_t time = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
    printf("SHA256 result: ");
    for(size_t i = 0; i < 32; i++) {
        printf("%02x", hash[i]);
    }
    printf(" in %lu ns\n", time);
    return time;
}

uint64_t benchmark_sha256_new(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};


    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsHashMessage_rmf(hash, sizeof(hash), dest, ippsHashMethod_SHA256_TT());
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    uint64_t time = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
    printf("SHA256 result: ");
    for(size_t i = 0; i < 32; i++) {
        printf("%02x", hash[i]);
    }
    printf(" in %lu ns\n", time);
    return time;
}

void usage(const char* argv0) {
    fprintf(stderr, "Usage: %s <old|new> <number of threads> <number of iterations>\n", argv0);
}

int main(int argc, char** argv) {
    std::vector<std::thread> threads;
    bool old = false;

    if(argc < 4) {
        usage(argv[0]);
        return -1;
    }

    if(strcmp(argv[1], "new") == 0) {
        old = false;
    } else if(strcmp(argv[1], "old") == 0) {
        old = true;
    } else {
        usage(argv[0]);
        return -1;
    }

    size_t num_threads = atoi(argv[2]);
    uint64_t num_iterations = atoi(argv[3]);
    threads.reserve(num_threads);

    for(size_t i = 0; i < num_threads; i++) {
        if(old) {
            threads.push_back(std::thread(benchmark_sha256_old, num_iterations));
        } else {
            threads.push_back(std::thread(benchmark_sha256_new, num_iterations));
        }
    }

    for(auto& thread : threads) {
        thread.join();
    }

    return 0;
}

When executed with the old API, the workload scales without any issue. However, executing it with the new API yields the same result as with v1.0.1:

$ time ./main old 24 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 91790300 ns
# many more similar lines
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 106826749 ns

real    0m0.109s
user    0m2.206s
sys     0m0.012s
$ time ./main new 24 1000000
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 2468927587 ns
# many more similar lines
SHA256 result: 2a5e8b87894fc2d1be46c40ce8f95745cc6a4821d3b1be93e4fba5205c757c40 in 3432265027 ns

real    0m3.434s
user    1m10.501s
sys     0m0.002s

This leads me to believe there is a serious multi-threading bottleneck in the way the new hash method APIs were implemented.
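
For completeness, the same chained-hash loop can also be expressed through the streaming state-based interface (ippsHashGetSize_rmf / ippsHashInit_rmf / ippsHashUpdate_rmf / ippsHashFinal_rmf) with a private per-thread state and the plain ippsHashMethod_SHA256() getter. This is only a sketch reusing the includes from the samples above, but comparing it against ippsHashMessage_rmf should show whether the contention is specific to the single-shot path or to the rmf dispatch in general:

uint64_t benchmark_sha256_streaming(uint64_t n) {
    timespec start, end;
    Ipp8u hash[32] = {0};
    Ipp8u dest[32] = {0};
    int state_size = 0;

    // Allocate a per-thread hash state so no state is shared between threads.
    ippsHashGetSize_rmf(&state_size);
    IppsHashState_rmf* state = (IppsHashState_rmf*)malloc(state_size);

    clock_gettime(CLOCK_MONOTONIC, &start);
    for(uint64_t i = 0; i < n; i++) {
        ippsHashInit_rmf(state, ippsHashMethod_SHA256());
        ippsHashUpdate_rmf(hash, sizeof(hash), state);
        ippsHashFinal_rmf(dest, state);
        memcpy(hash, dest, sizeof(hash));
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    free(state);
    return (end.tv_sec - start.tv_sec) * 1000000000ull + (end.tv_nsec - start.tv_nsec);
}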
