feat(crypto): use GPU MSM when building with `--config cuda`

kroma-network · May 30, 2024 · c21fb4c
1 parent 621aaff
commit c21fb4c
Show file tree

Hide file tree

Showing 4 changed files with 167 additions and 26 deletions.
diff --git a/tachyon/crypto/commitments/kzg/BUILD.bazel b/tachyon/crypto/commitments/kzg/BUILD.bazel
@@ -27,7 +27,10 @@ tachyon_cc_library(
         "//tachyon/base/buffer:copyable",
         "//tachyon/base/containers:container_util",
         "//tachyon/crypto/commitments:batch_commitment_state",
+        "//tachyon/device/gpu:scoped_mem_pool",
+        "//tachyon/device/gpu:scoped_stream",
         "//tachyon/math/elliptic_curves/msm:variable_base_msm",
+        "//tachyon/math/elliptic_curves/msm:variable_base_msm_gpu",
         "//tachyon/math/polynomials/univariate:univariate_evaluation_domain",
     ],
 )

diff --git a/tachyon/crypto/commitments/kzg/kzg.h b/tachyon/crypto/commitments/kzg/kzg.h
@@ -10,6 +10,7 @@
 #include <stddef.h>
 
 #include <algorithm>
+#include <limits>
 #include <memory>
 #include <utility>
 #include <vector>
@@ -21,6 +22,12 @@
 #include "tachyon/math/elliptic_curves/point_conversions.h"
 #include "tachyon/math/polynomials/univariate/univariate_evaluation_domain.h"
 
+#if TACHYON_CUDA
+#include "tachyon/device/gpu/scoped_mem_pool.h"
+#include "tachyon/device/gpu/scoped_stream.h"
+#include "tachyon/math/elliptic_curves/msm/variable_base_msm_gpu.h"
+#endif
+
 namespace tachyon {
 namespace crypto {
 
@@ -42,6 +49,9 @@ class KZG {
         g1_powers_of_tau_lagrange_(std::move(g1_powers_of_tau_lagrange)) {
     CHECK_EQ(g1_powers_of_tau_.size(), g1_powers_of_tau_lagrange_.size());
     CHECK_LE(g1_powers_of_tau_.size(), kMaxDegree + 1);
+#if TACHYON_CUDA
+    SetupForGpu();
+#endif
   }
 
   const std::vector<G1Point>& g1_powers_of_tau() const {
@@ -52,21 +62,73 @@ class KZG {
     return g1_powers_of_tau_lagrange_;
   }
 
-  void ResizeBatchCommitments(size_t size) { batch_commitments_.resize(size); }
+#if TACHYON_CUDA
+  void SetupForGpu() {
+    CHECK(!msm_gpu_);
+
+    gpuMemPoolProps props = {gpuMemAllocationTypePinned,
+                             gpuMemHandleTypeNone,
+                             {gpuMemLocationTypeDevice, 0}};
+    mem_pool_ = device::gpu::CreateMemPool(&props);
+
+    uint64_t mem_pool_threshold = std::numeric_limits<uint64_t>::max();
+    gpuError_t error = gpuMemPoolSetAttribute(
+        mem_pool_.get(), gpuMemPoolAttrReleaseThreshold, &mem_pool_threshold);
+    CHECK_EQ(error, gpuSuccess);
+    stream_ = device::gpu::CreateStream();
+
+    msm_gpu_.reset(
+        new math::VariableBaseMSMGpu<G1Point>(mem_pool_.get(), stream_.get()));
+  }
+#endif
+
+  void ResizeBatchCommitments(size_t size) {
+#if TACHYON_CUDA
+    if (msm_gpu_) {
+      gpu_batch_commitments_.resize(size);
+      return;
+    }
+#endif
+    cpu_batch_commitments_.resize(size);
+  }
 
   std::vector<Commitment> GetBatchCommitments(BatchCommitmentState& state) {
     std::vector<Commitment> batch_commitments;
-    if constexpr (std::is_same_v<Commitment, Bucket>) {
-      batch_commitments = std::move(batch_commitments_);
-    } else if constexpr (std::is_same_v<Commitment, math::AffinePoint<Curve>>) {
-      batch_commitments.resize(batch_commitments_.size());
-      CHECK(Bucket::BatchNormalize(batch_commitments_, &batch_commitments));
-      batch_commitments_.clear();
+#if TACHYON_CUDA
+    if (msm_gpu_) {
+      if constexpr (std::is_same_v<Commitment, math::ProjectivePoint<Curve>>) {
+        batch_commitments = std::move(gpu_batch_commitments_);
+        // NOLINTNEXTLINE(readability/braces)
+      } else if constexpr (std::is_same_v<Commitment,
+                                          math::AffinePoint<Curve>>) {
+        batch_commitments.resize(gpu_batch_commitments_.size());
+        CHECK(math::ProjectivePoint<Curve>::BatchNormalize(
+            gpu_batch_commitments_, &batch_commitments));
+        gpu_batch_commitments_.clear();
+      } else {
+        batch_commitments.resize(gpu_batch_commitments_.size());
+        CHECK(math::ConvertPoints(gpu_batch_commitments_, &batch_commitments));
+        gpu_batch_commitments_.clear();
+      }
     } else {
-      batch_commitments.resize(batch_commitments_.size());
-      CHECK(math::ConvertPoints(batch_commitments_, &batch_commitments));
-      batch_commitments_.clear();
+#endif
+      if constexpr (std::is_same_v<Commitment, Bucket>) {
+        batch_commitments = std::move(cpu_batch_commitments_);
+        // NOLINTNEXTLINE(readability/braces)
+      } else if constexpr (std::is_same_v<Commitment,
+                                          math::AffinePoint<Curve>>) {
+        batch_commitments.resize(cpu_batch_commitments_.size());
+        CHECK(
+            Bucket::BatchNormalize(cpu_batch_commitments_, &batch_commitments));
+        cpu_batch_commitments_.clear();
+      } else {
+        batch_commitments.resize(cpu_batch_commitments_.size());
+        CHECK(math::ConvertPoints(cpu_batch_commitments_, &batch_commitments));
+        cpu_batch_commitments_.clear();
+      }
+#if TACHYON_CUDA
     }
+#endif
     state.Reset();
     return batch_commitments;
   }
@@ -133,8 +195,22 @@ class KZG {
 
  private:
   template <typename BaseContainer, typename ScalarContainer>
-  static bool DoMSM(const BaseContainer& bases, const ScalarContainer& scalars,
-                    Commitment* out) {
+  bool DoMSM(const BaseContainer& bases, const ScalarContainer& scalars,
+             Commitment* out) const {
+#if TACHYON_CUDA
+    if (msm_gpu_) {
+      absl::Span<const G1Point> bases_span = absl::Span<const G1Point>(
+          bases.data(), std::min(bases.size(), scalars.size()));
+      if constexpr (std::is_same_v<Commitment, math::ProjectivePoint<Curve>>) {
+        return msm_gpu_->Run(bases_span, scalars, out);
+      } else {
+        math::ProjectivePoint<Curve> result;
+        if (!msm_gpu_->Run(bases_span, scalars, &result)) return false;
+        *out = math::ConvertPoint<Commitment>(result);
+        return true;
+      }
+    }
+#endif
     math::VariableBaseMSM<G1Point> msm;
     absl::Span<const G1Point> bases_span = absl::Span<const G1Point>(
         bases.data(), std::min(bases.size(), scalars.size()));
@@ -151,15 +227,28 @@ class KZG {
   template <typename BaseContainer, typename ScalarContainer>
   bool DoMSM(const BaseContainer& bases, const ScalarContainer& scalars,
              BatchCommitmentState& state, size_t index) {
+#if TACHYON_CUDA
+    if (msm_gpu_) {
+      absl::Span<const G1Point> bases_span = absl::Span<const G1Point>(
+          bases.data(), std::min(bases.size(), scalars.size()));
+      return msm_gpu_->Run(bases_span, scalars, &gpu_batch_commitments_[index]);
+    }
+#endif
     math::VariableBaseMSM<G1Point> msm;
     absl::Span<const G1Point> bases_span = absl::Span<const G1Point>(
         bases.data(), std::min(bases.size(), scalars.size()));
-    return msm.Run(bases_span, scalars, &batch_commitments_[index]);
+    return msm.Run(bases_span, scalars, &cpu_batch_commitments_[index]);
   }
 
   std::vector<G1Point> g1_powers_of_tau_;
   std::vector<G1Point> g1_powers_of_tau_lagrange_;
-  std::vector<Bucket> batch_commitments_;
+  std::vector<Bucket> cpu_batch_commitments_;
+#if TACHYON_CUDA
+  device::gpu::ScopedMemPool mem_pool_;
+  device::gpu::ScopedStream stream_;
+  std::unique_ptr<math::VariableBaseMSMGpu<G1Point>> msm_gpu_;
+  std::vector<math::ProjectivePoint<Curve>> gpu_batch_commitments_;
+#endif
 };
 
 }  // namespace crypto

diff --git a/tachyon/crypto/commitments/kzg/kzg_unittest.cc b/tachyon/crypto/commitments/kzg/kzg_unittest.cc
@@ -42,16 +42,32 @@ TEST_F(KZGTest, CommitLagrange) {
 
   Poly poly = Poly::Random(N - 1);
 
-  math::bn254::G1AffinePoint commit;
-  ASSERT_TRUE(pcs.Commit(poly.coefficients().coefficients(), &commit));
+  math::bn254::G1AffinePoint cpu_commit;
+  ASSERT_TRUE(pcs.Commit(poly.coefficients().coefficients(), &cpu_commit));
 
   std::unique_ptr<Domain> domain = Domain::Create(N);
-  Evals poly_evals = domain->FFT(std::move(poly));
+  Evals poly_evals = domain->FFT(poly);
 
-  math::bn254::G1AffinePoint commit_lagrange;
-  ASSERT_TRUE(pcs.CommitLagrange(poly_evals.evaluations(), &commit_lagrange));
+  math::bn254::G1AffinePoint cpu_commit_lagrange;
+  ASSERT_TRUE(
+      pcs.CommitLagrange(poly_evals.evaluations(), &cpu_commit_lagrange));
 
-  EXPECT_EQ(commit, commit_lagrange);
+  EXPECT_EQ(cpu_commit, cpu_commit_lagrange);
+
+#if TACHYON_CUDA
+  pcs.SetupForGpu();
+
+  math::bn254::G1AffinePoint gpu_commit;
+  ASSERT_TRUE(pcs.Commit(poly.coefficients().coefficients(), &gpu_commit));
+
+  EXPECT_EQ(gpu_commit, cpu_commit);
+
+  math::bn254::G1AffinePoint gpu_commit_lagrange;
+  ASSERT_TRUE(
+      pcs.CommitLagrange(poly_evals.evaluations(), &gpu_commit_lagrange));
+
+  EXPECT_EQ(gpu_commit_lagrange, cpu_commit_lagrange);
+#endif
 }
 
 TEST_F(KZGTest, BatchCommitLagrange) {
@@ -67,27 +83,57 @@ TEST_F(KZGTest, BatchCommitLagrange) {
   for (size_t i = 0; i < num_polys; ++i) {
     ASSERT_TRUE(pcs.Commit(polys[i].coefficients().coefficients(), state, i));
   }
-  std::vector<math::bn254::G1AffinePoint> batch_commitments =
+  std::vector<math::bn254::G1AffinePoint> cpu_batch_commitments =
       pcs.GetBatchCommitments(state);
   EXPECT_EQ(state.batch_mode, false);
   EXPECT_EQ(state.batch_count, size_t{0});
 
   std::unique_ptr<Domain> domain = Domain::Create(N);
-  std::vector<Evals> poly_evals = base::Map(
-      polys, [&domain](Poly& poly) { return domain->FFT(std::move(poly)); });
+  std::vector<Evals> poly_evals =
+      base::Map(polys, [&domain](Poly& poly) { return domain->FFT(poly); });
+
+  state.batch_mode = true;
+  state.batch_count = num_polys;
+  pcs.ResizeBatchCommitments(num_polys);
+  for (size_t i = 0; i < num_polys; ++i) {
+    ASSERT_TRUE(pcs.CommitLagrange(poly_evals[i].evaluations(), state, i));
+  }
+  std::vector<math::bn254::G1AffinePoint> cpu_batch_commitments_lagrange =
+      pcs.GetBatchCommitments(state);
+  EXPECT_EQ(state.batch_mode, false);
+  EXPECT_EQ(state.batch_count, size_t{0});
+
+  EXPECT_EQ(cpu_batch_commitments, cpu_batch_commitments_lagrange);
+
+#if TACHYON_CUDA
+  pcs.SetupForGpu();
+
+  state.batch_mode = true;
+  state.batch_count = num_polys;
+  pcs.ResizeBatchCommitments(num_polys);
+  for (size_t i = 0; i < num_polys; ++i) {
+    ASSERT_TRUE(pcs.Commit(polys[i].coefficients().coefficients(), state, i));
+  }
+  std::vector<math::bn254::G1AffinePoint> gpu_batch_commitments =
+      pcs.GetBatchCommitments(state);
+  EXPECT_EQ(state.batch_mode, false);
+  EXPECT_EQ(state.batch_count, size_t{0});
+
+  EXPECT_EQ(gpu_batch_commitments, cpu_batch_commitments);
 
   state.batch_mode = true;
   state.batch_count = num_polys;
   pcs.ResizeBatchCommitments(num_polys);
   for (size_t i = 0; i < num_polys; ++i) {
     ASSERT_TRUE(pcs.CommitLagrange(poly_evals[i].evaluations(), state, i));
   }
-  std::vector<math::bn254::G1AffinePoint> batch_commitments_lagrange =
+  std::vector<math::bn254::G1AffinePoint> gpu_batch_commitments_lagrange =
       pcs.GetBatchCommitments(state);
   EXPECT_EQ(state.batch_mode, false);
   EXPECT_EQ(state.batch_count, size_t{0});
 
-  EXPECT_EQ(batch_commitments, batch_commitments_lagrange);
+  EXPECT_EQ(gpu_batch_commitments_lagrange, cpu_batch_commitments_lagrange);
+#endif
 }
 
 TEST_F(KZGTest, Downsize) {

diff --git a/tachyon/math/elliptic_curves/msm/BUILD.bazel b/tachyon/math/elliptic_curves/msm/BUILD.bazel
@@ -1,3 +1,4 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("//bazel:tachyon.bzl", "if_gpu_is_configured")
 load(
     "//bazel:tachyon_cc.bzl",
@@ -54,7 +55,9 @@ tachyon_cc_library(
 tachyon_cc_library(
     name = "variable_base_msm_gpu",
     hdrs = ["variable_base_msm_gpu.h"],
-    deps = ["//tachyon/math/elliptic_curves/msm/algorithms/icicle:icicle_msm"],
+    deps = ["//tachyon/math/elliptic_curves/msm/algorithms/icicle:icicle_msm"] + if_cuda([
+        "@local_config_cuda//cuda:cudart_static",
+    ]),
 )
 
 tachyon_cc_unittest(