diff --git a/CMakeLists.txt b/CMakeLists.txt index bd1b2894..861f4d65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,7 +88,7 @@ if(NOT Kokkos_FOUND) FetchContent_Declare( kokkos GIT_REPOSITORY https://github.com/kokkos/kokkos - GIT_TAG 3.7.00 + GIT_TAG 4.2.00 GIT_SHALLOW TRUE ) FetchContent_MakeAvailable(kokkos) diff --git a/MParT/TrainMap.h b/MParT/TrainMap.h index 3e97f0a7..3125f6d3 100644 --- a/MParT/TrainMap.h +++ b/MParT/TrainMap.h @@ -3,8 +3,9 @@ #include #include -#include "ConditionalMapBase.h" -#include "MapObjective.h" +#include +#include "MParT/ConditionalMapBase.h" +#include "MParT/MapObjective.h" namespace mpart { diff --git a/MParT/Utilities/LinearAlgebra.h b/MParT/Utilities/LinearAlgebra.h index b0c3ff2f..ae758c40 100644 --- a/MParT/Utilities/LinearAlgebra.h +++ b/MParT/Utilities/LinearAlgebra.h @@ -195,55 +195,6 @@ StridedMatrix operator*(TransposeObject A, return C; } -enum class ReduceDimMap { sum, norm }; - -template -KOKKOS_INLINE_FUNCTION double ReduceDimMapKernel(double) {return 0.;}; - -template<> -KOKKOS_INLINE_FUNCTION double ReduceDimMapKernel(double x) {return x;}; - -template<> -KOKKOS_INLINE_FUNCTION double ReduceDimMapKernel(double x) {return x*x;}; - - -// This struct allows you to reduce the columns of a mxn matrix -// Performs alpha*A*[1,...,1] -template > -struct ReduceDim { - using value_type = double[]; - using size_type = typename StridedMatrix::size_type; - - // Keep track of the dimension we are not reducing - size_type value_count; - - StridedMatrix A_; - double alpha_; - - ReduceDim(StridedMatrix A, double alpha): value_count(A.extent(1-Dim)), A_(A), alpha_(alpha) {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type reduce_idx, value_type sum) const { - for(size_type full_idx=0; full_idx(aij)*alpha_; - } - } - - KOKKOS_INLINE_FUNCTION void join (volatile value_type dst, const volatile value_type src) const { - for (size_type i = 0; i < value_count; ++i) { - dst[i] += src[i]; - } - } - - KOKKOS_INLINE_FUNCTION void init (value_type sum) const { - for (size_type i = 0; i < value_count; ++i) { - sum[i] = 0.0; - } - } -}; - /** Mimics the interface of the Eigen::PartialPivLU class, but using Kokkos::Views and CUBLAS/CUSOLVER linear algebra. Note that the layout of the matrices used in this class is important. Cublas expects column major (layout left). diff --git a/bindings/julia/src/CommonJuliaUtilities.cpp b/bindings/julia/src/CommonJuliaUtilities.cpp index 5a04a2d6..a3d00991 100644 --- a/bindings/julia/src/CommonJuliaUtilities.cpp +++ b/bindings/julia/src/CommonJuliaUtilities.cpp @@ -25,6 +25,5 @@ void mpart::binding::CommonUtilitiesWrapper(jlcxx::Module &mod) mod.method("Initialize", [](){mpart::binding::Initialize(std::vector {});}); mod.method("Initialize", [](std::vector v){mpart::binding::Initialize(makeInitArguments(v));}); mod.method("Concurrency", &Kokkos::DefaultExecutionSpace::concurrency); - mod.add_type("HostSpace"); mod.add_type("LayoutStride"); } diff --git a/src/Distributions/GaussianSamplerDensity.cpp b/src/Distributions/GaussianSamplerDensity.cpp index a4a725e2..9d991842 100644 --- a/src/Distributions/GaussianSamplerDensity.cpp +++ b/src/Distributions/GaussianSamplerDensity.cpp @@ -44,10 +44,11 @@ void GaussianSamplerDensity::LogDensityImpl(StridedMatrix rr(diff, -0.5); - Kokkos::parallel_reduce(M, rr, &output(0)); - Kokkos::parallel_for( "log terms", N, KOKKOS_LAMBDA (const int& j) { - output(j) -= 0.5*( M*logtau_ + logDetCov_ ); + Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int& j){ + output(j) = -0.5*( M*logtau_ + logDetCov_ ); + for(int d=0; d> ObjectiveFactory::CreateGaussianKLObj template double KLObjective::ObjectivePlusCoeffGradImpl(StridedMatrix data, StridedVector grad, std::shared_ptr> map) const { unsigned int N_samps = data.extent(1); + unsigned int grad_dim = grad.extent(0); PullbackDensity pullback {map, density_}; StridedVector densityX = pullback.LogDensity(data); StridedMatrix densityGradX = pullback.LogDensityCoeffGrad(data); double sumDensity = 0.; + Kokkos::parallel_reduce ("Sum Negative Log Likelihood", N_samps, KOKKOS_LAMBDA (const int i, double &sum) { sum -= densityX(i); }, sumDensity); - ReduceDim rc(densityGradX, -1.0/((double) N_samps)); - Kokkos::parallel_reduce(N_samps, rc, &grad(0)); + + + if (grad.data()!=nullptr){ + double scale = -1.0/((double) N_samps); + + Kokkos::TeamPolicy> policy(grad_dim, Kokkos::AUTO()); + + Kokkos::parallel_for(policy, + KOKKOS_LAMBDA(auto& tag, auto&& teamMember){ + int row = teamMember.league_rank(); + double thisRowSum = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, N_samps), + KOKKOS_LAMBDA(int col, double& innerUpdate){ + innerUpdate += scale*densityGradX(row,col); + }, + thisRowSum); + + grad(row) = thisRowSum; + } + ); + } return sumDensity/N_samps; } @@ -79,10 +100,21 @@ double KLObjective::ObjectiveImpl(StridedMatrix void KLObjective::CoeffGradImpl(StridedMatrix data, StridedVector grad, std::shared_ptr> map) const { unsigned int N_samps = data.extent(1); + unsigned int grad_dim = grad.extent(0); PullbackDensity pullback {map, density_}; StridedMatrix densityGradX = pullback.LogDensityCoeffGrad(data); - ReduceDim rc(densityGradX, -1.0/((double) N_samps)); - Kokkos::parallel_reduce(N_samps, rc, &grad(0)); + + double scale = -1.0/((double) N_samps); + Kokkos::parallel_for("grad", Kokkos::TeamPolicy>(grad_dim, Kokkos::AUTO()), + KOKKOS_LAMBDA(auto t, typename Kokkos::TeamPolicy>::member_type teamMember){ + int row = teamMember.league_rank(); + double thisRowSum = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, N_samps), KOKKOS_LAMBDA(int col, double& innerUpdate){ + innerUpdate += scale*densityGradX(row,col); + }, thisRowSum); + grad(row) = thisRowSum; + } + ); } // Explicit template instantiation diff --git a/tests/Test_LinearAlgebra.cpp b/tests/Test_LinearAlgebra.cpp index 604cf419..d51ded4f 100644 --- a/tests/Test_LinearAlgebra.cpp +++ b/tests/Test_LinearAlgebra.cpp @@ -139,47 +139,6 @@ TEST_CASE( "Testing Linear Mat-Mat product", "[LinearAlgebra_MatMat]" ) { } } -TEST_CASE( "Testing ReduceDim", "ReduceDim" ) { - int N = 23; - StridedMatrix A = Kokkos::View("A", 3, N); - StridedMatrix AT = Kokkos::View("AT", N, 3); - double ref_avg = ((double) (N+1)) / 2.0; - for(int i = 0; i < N; i++) { - A(0,i) = i+1; - A(1,i) = N-i; - A(2,i) = ref_avg; - - AT(i,0) = i+1; - AT(i,1) = N-i; - AT(i,2) = ref_avg; - } - ReduceDim rc (A, 1.0); - Kokkos::View avg_col("avg_col", 3); - Kokkos::parallel_reduce(N, rc, avg_col.data()); - CHECK(avg_col(0) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14)); - CHECK(avg_col(1) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14)); - CHECK(avg_col(2) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14)); - - ReduceDim rr (AT,1.0); - Kokkos::View avg_row("avg_row", 3); - Kokkos::parallel_reduce(N, rr, avg_row.data()); - CHECK(avg_row(0) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14)); - CHECK(avg_row(1) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14)); - CHECK(avg_row(2) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14)); - - rc = ReduceDim(A, -1.0/((double) N)); - Kokkos::parallel_reduce(N, rc, avg_col.data()); - CHECK(avg_col(0) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14)); - CHECK(avg_col(1) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14)); - CHECK(avg_col(2) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14)); - - rr = ReduceDim(AT, -1.0/((double) N)); - Kokkos::parallel_reduce(N, rr, avg_row.data()); - CHECK(avg_row(0) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14)); - CHECK(avg_row(1) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14)); - CHECK(avg_row(2) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14)); -} - TEST_CASE( "Testing LU Factorization", "LinearAlgebra_LU" ) { Kokkos::View A("A", 3, 3); Kokkos::View B("B", 3, 2); diff --git a/tests/Test_TrainMapAdaptive.cpp b/tests/Test_TrainMapAdaptive.cpp index 4385a2d6..b3c54acd 100644 --- a/tests/Test_TrainMapAdaptive.cpp +++ b/tests/Test_TrainMapAdaptive.cpp @@ -16,18 +16,30 @@ void NormalizeSamples(StridedMatrix mat) { unsigned int dim = mat.extent(0); unsigned int N_samples = mat.extent(1); // Take sum of each row, divide by 1/N_samples - ReduceDim rd_mean(mat, 1./(static_cast(N_samples))); Kokkos::View meanVar("MeanStd", dim); - Kokkos::parallel_reduce(N_samples, rd_mean, &meanVar(0)); + for(int i=0; i(N_samples); + } + // Subtract mean from each point using ExecSpace = typename MemoryToExecution::Space; auto policy = Kokkos::MDRangePolicy,ExecSpace>({0,0},{dim,N_samples}); Kokkos::parallel_for("Center data", policy, KOKKOS_LAMBDA(const unsigned int i, const unsigned int j){ mat(i,j) -= meanVar(i); }); + // Take || . ||_2^2 of each row, divide by 1/(N_samples-1) - ReduceDim rd_var(mat, 1./(static_cast(N_samples-1))); - Kokkos::parallel_reduce(N_samples, rd_var, &meanVar(0)); + for(int i=0; i(N_samples-1); + } + // Divide each point by appropriate standard deviation Kokkos::parallel_for("Scale data", policy, KOKKOS_LAMBDA(const unsigned int i, const unsigned int j){ mat(i,j) /= std::sqrt(meanVar(i));