Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Joserochh/ntt avoid memcpy #72

Merged
merged 12 commits into from
Oct 11, 2021
17 changes: 17 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,20 @@ cmake --build build --target check unittest
to make sure the formatting checks and all unit tests pass.

Please sign your commits before making a pull request. See instructions [here](https://docs.github.com/en/github/authenticating-to-github/managing-commit-signature-verification/signing-commits) for how to sign commits.

### Known Issues ###

* ```Executable `cpplint` not found```

Install cpplint with ```pip install cpplint```.
If you install `cpplint` locally, make sure the directory containing the `cpplint` executable is on your `PATH`.

* ```/bin/sh: 1: pre-commit: not found```

Install `pre-commit` and make sure it is on your `PATH`. See https://pre-commit.com/ for more information.

* ```
error: gpg failed to sign the data
fatal: failed to write commit object
```
Try adding ```export GPG_TTY=$(tty)``` to `~/.bashrc`, then open a new shell (or run `source ~/.bashrc`) so the setting takes effect.
167 changes: 145 additions & 22 deletions benchmark/bench-ntt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace hexl {

//=================================================================

static void BM_FwdNTTNativeRadix2(benchmark::State& state) { // NOLINT
static void BM_FwdNTTNativeRadix2InPlace(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
size_t modulus = GeneratePrimes(1, 45, true, ntt_size)[0];

Expand All @@ -30,19 +30,43 @@ static void BM_FwdNTTNativeRadix2(benchmark::State& state) { // NOLINT

for (auto _ : state) {
ForwardTransformToBitReverseRadix2(
input.data(), ntt_size, modulus, ntt.GetRootOfUnityPowers().data(),
input.data(), input.data(), ntt_size, modulus,
ntt.GetRootOfUnityPowers().data(),
ntt.GetPrecon64RootOfUnityPowers().data(), 2, 1);
}
}

BENCHMARK(BM_FwdNTTNativeRadix2)
BENCHMARK(BM_FwdNTTNativeRadix2InPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});
//=================================================================

static void BM_FwdNTTNativeRadix4(benchmark::State& state) { // NOLINT
// Benchmarks ForwardTransformToBitReverseRadix2 when the destination buffer is
// distinct from the source buffer (out-of-place call).
// state.range(0) supplies the transform size.
static void BM_FwdNTTNativeRadix2Copy(benchmark::State& state) {  // NOLINT
  const size_t n = state.range(0);
  const size_t q = GeneratePrimes(1, 45, true, n)[0];

  // Setup happens once, before the measured loop.
  auto src = GenerateInsecureUniformRandomValues(n, 0, q);
  AlignedVector64<uint64_t> dst(n, 1);  // separate destination buffer
  NTT ntt(n, q);

  // Only the transform call itself sits inside the measured loop.
  for (auto _ : state) {
    ForwardTransformToBitReverseRadix2(
        dst.data(), src.data(), n, q,
        ntt.GetRootOfUnityPowers().data(),
        ntt.GetPrecon64RootOfUnityPowers().data(), 2, 1);
  }
}

// Registers BM_FwdNTTNativeRadix2Copy, reported in microseconds, with the
// argument values 1024, 4096 and 16384 (read by the benchmark as the
// transform size).
BENCHMARK(BM_FwdNTTNativeRadix2Copy)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});
//=================================================================

static void BM_FwdNTTNativeRadix4InPlace(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
size_t modulus = GeneratePrimes(1, 45, true, ntt_size)[0];

Expand All @@ -51,18 +75,43 @@ static void BM_FwdNTTNativeRadix4(benchmark::State& state) { // NOLINT

for (auto _ : state) {
ForwardTransformToBitReverseRadix4(
input.data(), ntt_size, modulus, ntt.GetRootOfUnityPowers().data(),
input.data(), input.data(), ntt_size, modulus,
ntt.GetRootOfUnityPowers().data(),
ntt.GetPrecon64RootOfUnityPowers().data(), 2, 1);
}
}

BENCHMARK(BM_FwdNTTNativeRadix4)
BENCHMARK(BM_FwdNTTNativeRadix4InPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});
//=================================================================

// Benchmarks ForwardTransformToBitReverseRadix4 when the destination buffer is
// distinct from the source buffer (out-of-place call).
// state.range(0) supplies the transform size.
static void BM_FwdNTTNativeRadix4Copy(benchmark::State& state) {  // NOLINT
  const size_t n = state.range(0);
  const size_t q = GeneratePrimes(1, 45, true, n)[0];

  // Setup happens once, before the measured loop.
  auto src = GenerateInsecureUniformRandomValues(n, 0, q);
  AlignedVector64<uint64_t> dst(n, 1);  // separate destination buffer
  NTT ntt(n, q);

  // Only the transform call itself sits inside the measured loop.
  for (auto _ : state) {
    ForwardTransformToBitReverseRadix4(
        dst.data(), src.data(), n, q,
        ntt.GetRootOfUnityPowers().data(),
        ntt.GetPrecon64RootOfUnityPowers().data(), 2, 1);
  }
}

// Registers BM_FwdNTTNativeRadix4Copy, reported in microseconds, with the
// argument values 1024, 4096 and 16384 (read by the benchmark as the
// transform size).
BENCHMARK(BM_FwdNTTNativeRadix4Copy)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

#ifdef HEXL_HAS_AVX512IFMA
// state[0] is the degree
static void BM_FwdNTT_AVX512IFMA(benchmark::State& state) { // NOLINT
Expand All @@ -80,7 +129,7 @@ static void BM_FwdNTT_AVX512IFMA(benchmark::State& state) { // NOLINT

for (auto _ : state) {
ForwardTransformToBitReverseAVX512<NTT::s_ifma_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 2, 1);
}
}
Expand Down Expand Up @@ -109,7 +158,7 @@ static void BM_FwdNTT_AVX512IFMALazy(benchmark::State& state) { // NOLINT

for (auto _ : state) {
ForwardTransformToBitReverseAVX512<NTT::s_ifma_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 4, 4);
}
}
Expand Down Expand Up @@ -144,7 +193,7 @@ static void BM_FwdNTT_AVX512DQ_32(benchmark::State& state) { // NOLINT
ntt.GetAVX512Precon32RootOfUnityPowers();
for (auto _ : state) {
ForwardTransformToBitReverseAVX512<32>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 4, output_mod_factor);
}
}
Expand Down Expand Up @@ -175,7 +224,7 @@ static void BM_FwdNTT_AVX512DQ_64(benchmark::State& state) { // NOLINT
ntt.GetAVX512Precon64RootOfUnityPowers();
for (auto _ : state) {
ForwardTransformToBitReverseAVX512<64>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 4, output_mod_factor);
}
}
Expand Down Expand Up @@ -234,6 +283,28 @@ BENCHMARK(BM_FwdNTTCopy)
->Args({4096})
->Args({16384});

//=================================================================

// Benchmarks NTT::ComputeInverse when the same buffer is passed as both the
// destination and the source (in-place call).
// state.range(0) supplies the transform size.
static void BM_InvNTTInPlace(benchmark::State& state) {  // NOLINT
  const size_t n = state.range(0);
  const size_t q = GeneratePrimes(1, 45, true, n)[0];

  // Setup happens once, before the measured loop.
  auto data = GenerateInsecureUniformRandomValues(n, 0, q);
  NTT ntt(n, q);

  // The buffer is transformed again on every iteration, so each pass operates
  // on the previous pass's output.
  for (auto _ : state) {
    ntt.ComputeInverse(data.data(), data.data(), 2, 1);
  }
}

// Registers BM_InvNTTInPlace, reported in microseconds, with the argument
// values 1024, 4096 and 16384 (read by the benchmark as the transform size).
BENCHMARK(BM_InvNTTInPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

// state[0] is the degree
static void BM_InvNTTCopy(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
Expand All @@ -258,7 +329,7 @@ BENCHMARK(BM_InvNTTCopy)

// Inverse transforms

static void BM_InvNTTNativeRadix2(benchmark::State& state) { // NOLINT
static void BM_InvNTTNativeRadix2InPlace(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
size_t modulus = GeneratePrimes(1, 45, true, ntt_size)[0];

Expand All @@ -269,38 +340,90 @@ static void BM_InvNTTNativeRadix2(benchmark::State& state) { // NOLINT
const AlignedVector64<uint64_t> precon_root_of_unity =
ntt.GetPrecon64InvRootOfUnityPowers();
for (auto _ : state) {
InverseTransformFromBitReverseRadix2(input.data(), ntt_size, modulus,
root_of_unity.data(),
InverseTransformFromBitReverseRadix2(input.data(), input.data(), ntt_size,
modulus, root_of_unity.data(),
precon_root_of_unity.data(), 1, 1);
}
}

BENCHMARK(BM_InvNTTNativeRadix2)
BENCHMARK(BM_InvNTTNativeRadix2InPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

static void BM_InvNTTNativeRadix4(benchmark::State& state) { // NOLINT
// Benchmarks InverseTransformFromBitReverseRadix2 when the destination buffer
// is distinct from the source buffer (out-of-place call).
// state.range(0) supplies the transform size.
static void BM_InvNTTNativeRadix2Copy(benchmark::State& state) {  // NOLINT
  const size_t n = state.range(0);
  const size_t q = GeneratePrimes(1, 45, true, n)[0];

  // Setup happens once, before the measured loop.
  auto src = GenerateInsecureUniformRandomValues(n, 0, q);
  AlignedVector64<uint64_t> dst(n, 1);  // separate destination buffer
  NTT ntt(n, q);

  // Local snapshots of the inverse tables, taken before timing starts.
  // NOTE(review): if these getters return references, binding by const& would
  // avoid the copies -- confirm against the NTT header.
  const AlignedVector64<uint64_t> inv_roots(ntt.GetInvRootOfUnityPowers());
  const AlignedVector64<uint64_t> precon_inv_roots(
      ntt.GetPrecon64InvRootOfUnityPowers());

  // Only the transform call itself sits inside the measured loop.
  for (auto _ : state) {
    InverseTransformFromBitReverseRadix2(dst.data(), src.data(), n, q,
                                         inv_roots.data(),
                                         precon_inv_roots.data(), 1, 1);
  }
}

// Registers BM_InvNTTNativeRadix2Copy, reported in microseconds, with the
// argument values 1024, 4096 and 16384 (read by the benchmark as the
// transform size).
BENCHMARK(BM_InvNTTNativeRadix2Copy)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

// Benchmarks InverseTransformFromBitReverseRadix4 when the same buffer is
// passed as both the destination and the source (in-place call).
// state.range(0) supplies the transform size.
static void BM_InvNTTNativeRadix4InPlace(benchmark::State& state) {  // NOLINT
  const size_t n = state.range(0);
  const size_t q = GeneratePrimes(1, 45, true, n)[0];

  // Setup happens once, before the measured loop.
  auto data = GenerateInsecureUniformRandomValues(n, 0, q);
  NTT ntt(n, q);

  // Local snapshots of the inverse tables, taken before timing starts.
  // NOTE(review): if these getters return references, binding by const& would
  // avoid the copies -- confirm against the NTT header.
  const AlignedVector64<uint64_t> inv_roots(ntt.GetInvRootOfUnityPowers());
  const AlignedVector64<uint64_t> precon_inv_roots(
      ntt.GetPrecon64InvRootOfUnityPowers());

  // The buffer is transformed again on every iteration, so each pass operates
  // on the previous pass's output.
  for (auto _ : state) {
    InverseTransformFromBitReverseRadix4(data.data(), data.data(), n, q,
                                         inv_roots.data(),
                                         precon_inv_roots.data(), 1, 1);
  }
}

// Registers BM_InvNTTNativeRadix4InPlace, reported in microseconds, with the
// argument values 1024, 4096 and 16384 (read by the benchmark as the
// transform size).
BENCHMARK(BM_InvNTTNativeRadix4InPlace)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
->Args({16384});

//=================================================================

static void BM_InvNTTNativeRadix4Copy(benchmark::State& state) { // NOLINT
size_t ntt_size = state.range(0);
size_t modulus = GeneratePrimes(1, 45, true, ntt_size)[0];

auto input = GenerateInsecureUniformRandomValues(ntt_size, 0, modulus);
AlignedVector64<uint64_t> output(ntt_size, 1);
NTT ntt(ntt_size, modulus);

const AlignedVector64<uint64_t> root_of_unity = ntt.GetInvRootOfUnityPowers();
const AlignedVector64<uint64_t> precon_root_of_unity =
ntt.GetPrecon64InvRootOfUnityPowers();
for (auto _ : state) {
InverseTransformFromBitReverseRadix4(input.data(), ntt_size, modulus,
root_of_unity.data(),
InverseTransformFromBitReverseRadix4(output.data(), input.data(), ntt_size,
modulus, root_of_unity.data(),
precon_root_of_unity.data(), 1, 1);
}
}

BENCHMARK(BM_InvNTTNativeRadix4)
BENCHMARK(BM_InvNTTNativeRadix4Copy)
->Unit(benchmark::kMicrosecond)
->Args({1024})
->Args({4096})
Expand All @@ -322,7 +445,7 @@ static void BM_InvNTT_AVX512IFMA(benchmark::State& state) { // NOLINT
ntt.GetPrecon52InvRootOfUnityPowers();
for (auto _ : state) {
InverseTransformFromBitReverseAVX512<NTT::s_ifma_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 1, 1);
}
}
Expand All @@ -348,7 +471,7 @@ static void BM_InvNTT_AVX512IFMALazy(benchmark::State& state) { // NOLINT
ntt.GetPrecon52InvRootOfUnityPowers();
for (auto _ : state) {
InverseTransformFromBitReverseAVX512<NTT::s_ifma_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), 2, 2);
}
}
Expand Down Expand Up @@ -379,7 +502,7 @@ static void BM_InvNTT_AVX512DQ_32(benchmark::State& state) { // NOLINT

for (auto _ : state) {
InverseTransformFromBitReverseAVX512<32>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), output_mod_factor, output_mod_factor);
}
}
Expand Down Expand Up @@ -407,7 +530,7 @@ static void BM_InvNTT_AVX512DQ_64(benchmark::State& state) { // NOLINT

for (auto _ : state) {
InverseTransformFromBitReverseAVX512<NTT::s_default_shift_bits>(
input.data(), ntt_size, modulus, root_of_unity.data(),
input.data(), input.data(), ntt_size, modulus, root_of_unity.data(),
precon_root_of_unity.data(), output_mod_factor, output_mod_factor);
}
}
Expand Down
Loading