Skip to content

Commit

Permalink
Joserochh/ntt avoid memcpy (#72)
Browse files Browse the repository at this point in the history
Avoiding memcpy calls on NTT

* Avoiding memcpys on Fwd NTT
* Avoiding memcpy on INV NTT
* Fixing some lines length
* using only one out-of-place on first passes
* Adding out-of-place for radix-4 NTT
* Adding gpg issue
* Adding test cases for out-of-place NTT
* Removing commented code and testing GPG Signing
* Fboemer/fix 32 bit invntt (#73)
* Fix 32-bit AVX512DQ InvNTT
* Refactor NTT tests for better coverage
* Added performance tips to README (#74)
* small fix on test case (missed during merge)

Co-authored-by: Fabian Boemer <fabian.boemer@intel.com>
  • Loading branch information
joserochh and fboemer committed Nov 8, 2021
1 parent b394857 commit f2d202b
Show file tree
Hide file tree
Showing 13 changed files with 969 additions and 372 deletions.
17 changes: 17 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,20 @@ cmake --build build --target check unittest
to make sure the formatting checks and all unit tests pass.

Please sign your commits before making a pull request. See instructions [here](https://docs.github.com/en/github/authenticating-to-github/managing-commit-signature-verification/signing-commits) for how to sign commits.

### Known Issues ###

* ```Executable `cpplint` not found```

Make sure you install cpplint: ```pip install cpplint```.
If you install `cpplint` locally, make sure to add it to your PATH.

* ```/bin/sh: 1: pre-commit: not found```

Install `pre-commit`. More info at https://pre-commit.com/

* ```
error: gpg failed to sign the data
fatal: failed to write commit object
```
Try adding ```export GPG_TTY=$(tty)``` to `~/.bashrc`
167 changes: 145 additions & 22 deletions benchmark/bench-ntt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace hexl {

//=================================================================

static void BM_FwdNTTNativeRadix2(benchmark::State& state) { // NOLINT
static void BM_FwdNTTNativeRadix2InPlace(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
size_t modulus = GeneratePrimes(1, 45, true, ntt_size)[0];

Expand All @@ -30,19 +30,43 @@ static void BM_FwdNTTNativeRadix2(benchmark::State& state) { // NOLINT

for (auto _ : state) {
ForwardTransformToBitReverseRadix2(
input.data(), ntt_size, modulus, ntt.GetRootOfUnityPowers().data(),
input.data(), input.data(), ntt_size, modulus,
ntt.GetRootOfUnityPowers().data(),
ntt.GetPrecon64RootOfUnityPowers().data(), 2, 1);
}
}

BENCHMARK(BM_FwdNTTNativeRadix2)
BENCHMARK(BM_FwdNTTNativeRadix2InPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});
//=================================================================

static void BM_FwdNTTNativeRadix4(benchmark::State& state) { // NOLINT
// Benchmarks the native radix-2 forward NTT kernel when the first two pointer
// arguments refer to two distinct buffers (the "Copy" variant), in contrast to
// the in-place benchmarks that pass the same buffer for both.
// state.range(0) supplies the transform size.
static void BM_FwdNTTNativeRadix2Copy(benchmark::State& state) {  // NOLINT
  const size_t degree = state.range(0);
  // Take the first prime returned for this transform size.
  const auto primes = GeneratePrimes(1, 45, true, degree);
  const size_t prime = primes[0];

  auto source = GenerateInsecureUniformRandomValues(degree, 0, prime);
  AlignedVector64<uint64_t> destination(degree, 1);
  NTT transform(degree, prime);

  for (auto _ : state) {
    // Trailing 2, 1: presumably the input/output modulus factors -- confirm
    // against the kernel's declaration.
    ForwardTransformToBitReverseRadix2(
        destination.data(), source.data(), degree, prime,
        transform.GetRootOfUnityPowers().data(),
        transform.GetPrecon64RootOfUnityPowers().data(), 2, 1);
  }
}

BENCHMARK(BM_FwdNTTNativeRadix2Copy)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});
//=================================================================

static void BM_FwdNTTNativeRadix4InPlace(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
size_t modulus = GeneratePrimes(1, 45, true, ntt_size)[0];

Expand All @@ -51,18 +75,43 @@ static void BM_FwdNTTNativeRadix4(benchmark::State& state) { // NOLINT

for (auto _ : state) {
ForwardTransformToBitReverseRadix4(
input.data(), ntt_size, modulus, ntt.GetRootOfUnityPowers().data(),
input.data(), input.data(), ntt_size, modulus,
ntt.GetRootOfUnityPowers().data(),
ntt.GetPrecon64RootOfUnityPowers().data(), 2, 1);
}
}

BENCHMARK(BM_FwdNTTNativeRadix4)
BENCHMARK(BM_FwdNTTNativeRadix4InPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});
//=================================================================

// Benchmarks the native radix-4 forward NTT kernel when the first two pointer
// arguments refer to two distinct buffers (the "Copy" variant), in contrast to
// the in-place benchmarks that pass the same buffer for both.
// state.range(0) supplies the transform size.
static void BM_FwdNTTNativeRadix4Copy(benchmark::State& state) {  // NOLINT
  const size_t degree = state.range(0);
  // Take the first prime returned for this transform size.
  const auto primes = GeneratePrimes(1, 45, true, degree);
  const size_t prime = primes[0];

  auto source = GenerateInsecureUniformRandomValues(degree, 0, prime);
  AlignedVector64<uint64_t> destination(degree, 1);
  NTT transform(degree, prime);

  for (auto _ : state) {
    // Trailing 2, 1: presumably the input/output modulus factors -- confirm
    // against the kernel's declaration.
    ForwardTransformToBitReverseRadix4(
        destination.data(), source.data(), degree, prime,
        transform.GetRootOfUnityPowers().data(),
        transform.GetPrecon64RootOfUnityPowers().data(), 2, 1);
  }
}

BENCHMARK(BM_FwdNTTNativeRadix4Copy)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

#ifdef HEXL_HAS_AVX512IFMA
// state[0] is the degree
static void BM_FwdNTT_AVX512IFMA(benchmark::State& state) { // NOLINT
Expand All @@ -80,7 +129,7 @@ static void BM_FwdNTT_AVX512IFMA(benchmark::State& state) { // NOLINT

for (auto _ : state) {
ForwardTransformToBitReverseAVX512<NTT::s_ifma_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 2, 1);
}
}
Expand Down Expand Up @@ -109,7 +158,7 @@ static void BM_FwdNTT_AVX512IFMALazy(benchmark::State& state) { // NOLINT

for (auto _ : state) {
ForwardTransformToBitReverseAVX512<NTT::s_ifma_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 4, 4);
}
}
Expand Down Expand Up @@ -144,7 +193,7 @@ static void BM_FwdNTT_AVX512DQ_32(benchmark::State& state) { // NOLINT
ntt.GetAVX512Precon32RootOfUnityPowers();
for (auto _ : state) {
ForwardTransformToBitReverseAVX512<32>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 4, output_mod_factor);
}
}
Expand Down Expand Up @@ -175,7 +224,7 @@ static void BM_FwdNTT_AVX512DQ_64(benchmark::State& state) { // NOLINT
ntt.GetAVX512Precon64RootOfUnityPowers();
for (auto _ : state) {
ForwardTransformToBitReverseAVX512<64>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 4, output_mod_factor);
}
}
Expand Down Expand Up @@ -234,6 +283,28 @@ BENCHMARK(BM_FwdNTTCopy)
->Args({4096})
->Args({16384});

//=================================================================

// Benchmarks NTT::ComputeInverse when the same buffer is passed for both
// pointer arguments, so every iteration transforms the data where it sits.
// state.range(0) supplies the transform size.
static void BM_InvNTTInPlace(benchmark::State& state) {  // NOLINT
  const size_t degree = state.range(0);
  // Take the first prime returned for this transform size.
  const auto primes = GeneratePrimes(1, 45, true, degree);
  const size_t prime = primes[0];

  auto values = GenerateInsecureUniformRandomValues(degree, 0, prime);
  NTT transform(degree, prime);

  for (auto _ : state) {
    // Trailing 2, 1: presumably the input/output modulus factors -- confirm
    // against NTT::ComputeInverse.
    transform.ComputeInverse(values.data(), values.data(), 2, 1);
  }
}

BENCHMARK(BM_InvNTTInPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

// state[0] is the degree
static void BM_InvNTTCopy(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
Expand All @@ -258,7 +329,7 @@ BENCHMARK(BM_InvNTTCopy)

// Inverse transforms

static void BM_InvNTTNativeRadix2(benchmark::State& state) { // NOLINT
static void BM_InvNTTNativeRadix2InPlace(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
size_t modulus = GeneratePrimes(1, 45, true, ntt_size)[0];

Expand All @@ -269,38 +340,90 @@ static void BM_InvNTTNativeRadix2(benchmark::State& state) { // NOLINT
const AlignedVector64<uint64_t> precon_root_of_unity =
ntt.GetPrecon64InvRootOfUnityPowers();
for (auto _ : state) {
InverseTransformFromBitReverseRadix2(input.data(), ntt_size, modulus,
root_of_unity.data(),
InverseTransformFromBitReverseRadix2(input.data(), input.data(), ntt_size,
modulus, root_of_unity.data(),
precon_root_of_unity.data(), 1, 1);
}
}

BENCHMARK(BM_InvNTTNativeRadix2)
BENCHMARK(BM_InvNTTNativeRadix2InPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

static void BM_InvNTTNativeRadix4(benchmark::State& state) { // NOLINT
// Benchmarks the native radix-2 inverse NTT kernel when the first two pointer
// arguments refer to two distinct buffers (the "Copy" variant), in contrast to
// the in-place benchmarks that pass the same buffer for both.
// state.range(0) supplies the transform size.
static void BM_InvNTTNativeRadix2Copy(benchmark::State& state) {  // NOLINT
  const size_t degree = state.range(0);
  // Take the first prime returned for this transform size.
  const auto primes = GeneratePrimes(1, 45, true, degree);
  const size_t prime = primes[0];

  auto source = GenerateInsecureUniformRandomValues(degree, 0, prime);
  AlignedVector64<uint64_t> destination(degree, 1);
  NTT transform(degree, prime);

  // Fetch the inverse root tables once, before the measured loop starts.
  const AlignedVector64<uint64_t> inv_roots =
      transform.GetInvRootOfUnityPowers();
  const AlignedVector64<uint64_t> precon_inv_roots =
      transform.GetPrecon64InvRootOfUnityPowers();

  for (auto _ : state) {
    // Trailing 1, 1: presumably the input/output modulus factors -- confirm
    // against the kernel's declaration.
    InverseTransformFromBitReverseRadix2(destination.data(), source.data(),
                                         degree, prime, inv_roots.data(),
                                         precon_inv_roots.data(), 1, 1);
  }
}

BENCHMARK(BM_InvNTTNativeRadix2Copy)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

// Benchmarks the native radix-4 inverse NTT kernel when the same buffer is
// passed for both of the first two pointer arguments (the "InPlace" variant).
// state.range(0) supplies the transform size.
static void BM_InvNTTNativeRadix4InPlace(benchmark::State& state) {  // NOLINT
  const size_t degree = state.range(0);
  // Take the first prime returned for this transform size.
  const auto primes = GeneratePrimes(1, 45, true, degree);
  const size_t prime = primes[0];

  auto values = GenerateInsecureUniformRandomValues(degree, 0, prime);
  NTT transform(degree, prime);

  // Fetch the inverse root tables once, before the measured loop starts.
  const AlignedVector64<uint64_t> inv_roots =
      transform.GetInvRootOfUnityPowers();
  const AlignedVector64<uint64_t> precon_inv_roots =
      transform.GetPrecon64InvRootOfUnityPowers();

  for (auto _ : state) {
    // Trailing 1, 1: presumably the input/output modulus factors -- confirm
    // against the kernel's declaration.
    InverseTransformFromBitReverseRadix4(values.data(), values.data(), degree,
                                         prime, inv_roots.data(),
                                         precon_inv_roots.data(), 1, 1);
  }
}

BENCHMARK(BM_InvNTTNativeRadix4InPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

static void BM_InvNTTNativeRadix4Copy(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
size_t modulus = GeneratePrimes(1, 45, true, ntt_size)[0];

auto input = GenerateInsecureUniformRandomValues(ntt_size, 0, modulus);
AlignedVector64<uint64_t> output(ntt_size, 1);
NTT ntt(ntt_size, modulus);

const AlignedVector64<uint64_t> root_of_unity = ntt.GetInvRootOfUnityPowers();
const AlignedVector64<uint64_t> precon_root_of_unity =
ntt.GetPrecon64InvRootOfUnityPowers();
for (auto _ : state) {
InverseTransformFromBitReverseRadix4(input.data(), ntt_size, modulus,
root_of_unity.data(),
InverseTransformFromBitReverseRadix4(output.data(), input.data(), ntt_size,
modulus, root_of_unity.data(),
precon_root_of_unity.data(), 1, 1);
}
}

BENCHMARK(BM_InvNTTNativeRadix4)
BENCHMARK(BM_InvNTTNativeRadix4Copy)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
Expand All @@ -322,7 +445,7 @@ static void BM_InvNTT_AVX512IFMA(benchmark::State& state) { // NOLINT
ntt.GetPrecon52InvRootOfUnityPowers();
for (auto _ : state) {
InverseTransformFromBitReverseAVX512<NTT::s_ifma_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 1, 1);
}
}
Expand All @@ -348,7 +471,7 @@ static void BM_InvNTT_AVX512IFMALazy(benchmark::State& state) { // NOLINT
ntt.GetPrecon52InvRootOfUnityPowers();
for (auto _ : state) {
InverseTransformFromBitReverseAVX512<NTT::s_ifma_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 2, 2);
}
}
Expand Down Expand Up @@ -379,7 +502,7 @@ static void BM_InvNTT_AVX512DQ_32(benchmark::State& state) { // NOLINT

for (auto _ : state) {
InverseTransformFromBitReverseAVX512<32>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), output_mod_factor, output_mod_factor);
}
}
Expand Down Expand Up @@ -407,7 +530,7 @@ static void BM_InvNTT_AVX512DQ_64(benchmark::State& state) { // NOLINT

for (auto _ : state) {
InverseTransformFromBitReverseAVX512<NTT::s_default_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), output_mod_factor, output_mod_factor);
}
}
Expand Down
Loading

0 comments on commit f2d202b

Please sign in to comment.