diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd1b2894..861f4d65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,7 +88,7 @@ if(NOT Kokkos_FOUND)
         FetchContent_Declare(
             kokkos
             GIT_REPOSITORY https://github.com/kokkos/kokkos
-            GIT_TAG 3.7.00
+            GIT_TAG 4.2.00
             GIT_SHALLOW TRUE
         )
         FetchContent_MakeAvailable(kokkos)
diff --git a/MParT/TrainMap.h b/MParT/TrainMap.h
index 3e97f0a7..3125f6d3 100644
--- a/MParT/TrainMap.h
+++ b/MParT/TrainMap.h
@@ -3,8 +3,9 @@
 
 #include <functional>
 #include <nlopt.hpp>
-#include "ConditionalMapBase.h"
-#include "MapObjective.h"
+#include <iostream>
+#include "MParT/ConditionalMapBase.h"
+#include "MParT/MapObjective.h"
 
 namespace mpart {
 
diff --git a/MParT/Utilities/LinearAlgebra.h b/MParT/Utilities/LinearAlgebra.h
index b0c3ff2f..ae758c40 100644
--- a/MParT/Utilities/LinearAlgebra.h
+++ b/MParT/Utilities/LinearAlgebra.h
@@ -195,55 +195,6 @@ StridedMatrix<double, MemorySpace> operator*(TransposeObject<MemorySpace> A,
     return C;
 }
 
-enum class ReduceDimMap { sum, norm };
-
-template<ReduceDimMap Map>
-KOKKOS_INLINE_FUNCTION double ReduceDimMapKernel(double) {return 0.;};
-
-template<>
-KOKKOS_INLINE_FUNCTION double ReduceDimMapKernel<ReduceDimMap::sum>(double x) {return x;};
-
-template<>
-KOKKOS_INLINE_FUNCTION double ReduceDimMapKernel<ReduceDimMap::norm>(double x) {return x*x;};
-
-
-// This struct allows you to reduce the columns of a mxn matrix
-// Performs alpha*A*[1,...,1]
-template<ReduceDimMap Map, typename MemorySpace, unsigned int Dim = 1, typename = std::enable_if_t<Dim <= 1,int> >
-struct ReduceDim {
-    using value_type = double[];
-    using size_type = typename StridedMatrix<double, MemorySpace>::size_type;
-
-    // Keep track of the dimension we are not reducing
-    size_type value_count;
-
-    StridedMatrix<const double, MemorySpace> A_;
-    double alpha_;
-
-    ReduceDim(StridedMatrix<double, MemorySpace> A, double alpha): value_count(A.extent(1-Dim)), A_(A), alpha_(alpha) {}
-
-    KOKKOS_INLINE_FUNCTION void operator()(const size_type reduce_idx, value_type sum) const {
-        for(size_type full_idx=0; full_idx<value_count; ++full_idx) {
-            double aij = 0.;
-            if(Dim == 1) aij = A_(full_idx,reduce_idx);
-            else aij = A_(reduce_idx,full_idx);
-            sum[full_idx] += ReduceDimMapKernel<Map>(aij)*alpha_;
-        }
-    }
-
-    KOKKOS_INLINE_FUNCTION void join (volatile value_type dst, const volatile value_type src) const {
-        for (size_type i = 0; i < value_count; ++i) {
-            dst[i] += src[i];
-        }
-    }
-
-    KOKKOS_INLINE_FUNCTION void init (value_type sum) const {
-        for (size_type i = 0; i < value_count; ++i) {
-            sum[i] = 0.0;
-        }
-    }
-};
-
 /** Mimics the interface of the Eigen::PartialPivLU class, but using Kokkos::Views and CUBLAS/CUSOLVER linear algebra.
 
 Note that the layout of the matrices used in this class is important.  Cublas expects column major (layout left).
diff --git a/bindings/julia/src/CommonJuliaUtilities.cpp b/bindings/julia/src/CommonJuliaUtilities.cpp
index 5a04a2d6..a3d00991 100644
--- a/bindings/julia/src/CommonJuliaUtilities.cpp
+++ b/bindings/julia/src/CommonJuliaUtilities.cpp
@@ -25,6 +25,5 @@ void mpart::binding::CommonUtilitiesWrapper(jlcxx::Module &mod)
     mod.method("Initialize", [](){mpart::binding::Initialize(std::vector<std::string> {});});
     mod.method("Initialize", [](std::vector<std::string> v){mpart::binding::Initialize(makeInitArguments(v));});
     mod.method("Concurrency", &Kokkos::DefaultExecutionSpace::concurrency);
-    mod.add_type<Kokkos::HostSpace>("HostSpace");
     mod.add_type<Kokkos::LayoutStride>("LayoutStride");
 }
diff --git a/src/Distributions/GaussianSamplerDensity.cpp b/src/Distributions/GaussianSamplerDensity.cpp
index a4a725e2..9d991842 100644
--- a/src/Distributions/GaussianSamplerDensity.cpp
+++ b/src/Distributions/GaussianSamplerDensity.cpp
@@ -44,10 +44,11 @@ void GaussianSamplerDensity<MemorySpace>::LogDensityImpl(StridedMatrix<const dou
         covChol_.solveInPlaceL(diff);
     }
 
-    ReduceDim<ReduceDimMap::norm,MemorySpace,0> rr(diff, -0.5);
-    Kokkos::parallel_reduce(M, rr, &output(0));
-    Kokkos::parallel_for( "log terms", N, KOKKOS_LAMBDA (const int& j) {
-        output(j) -= 0.5*( M*logtau_ + logDetCov_ );
+    Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int& j){
+        output(j) = -0.5*( M*logtau_ + logDetCov_ );
+        for(int d=0; d<M; ++d){
+            output(j) += -0.5*diff(d,j)*diff(d,j);
+        }
     });
 }
 
diff --git a/src/MapObjective.cpp b/src/MapObjective.cpp
index d11123d9..d46f1737 100644
--- a/src/MapObjective.cpp
+++ b/src/MapObjective.cpp
@@ -52,15 +52,36 @@ std::shared_ptr<MapObjective<MemorySpace>> ObjectiveFactory::CreateGaussianKLObj
 template<typename MemorySpace>
 double KLObjective<MemorySpace>::ObjectivePlusCoeffGradImpl(StridedMatrix<const double, MemorySpace> data, StridedVector<double, MemorySpace> grad, std::shared_ptr<ConditionalMapBase<MemorySpace>> map) const {
     unsigned int N_samps = data.extent(1);
+    unsigned int grad_dim = grad.extent(0);
     PullbackDensity<MemorySpace> pullback {map, density_};
     StridedVector<double, MemorySpace> densityX = pullback.LogDensity(data);
     StridedMatrix<double, MemorySpace> densityGradX = pullback.LogDensityCoeffGrad(data);
     double sumDensity = 0.;
+    
     Kokkos::parallel_reduce ("Sum Negative Log Likelihood", N_samps, KOKKOS_LAMBDA (const int i, double &sum) {
         sum -= densityX(i);
     }, sumDensity);
-    ReduceDim<ReduceDimMap::sum,MemorySpace> rc(densityGradX, -1.0/((double) N_samps));
-    Kokkos::parallel_reduce(N_samps, rc, &grad(0));
+
+
+    if (grad.data()!=nullptr){
+        double scale = -1.0/((double) N_samps);
+
+        Kokkos::TeamPolicy<MemoryToExecution<MemorySpace>> policy(grad_dim, Kokkos::AUTO());
+
+        Kokkos::parallel_for(policy,
+            KOKKOS_LAMBDA(auto& tag, auto&& teamMember){
+                int row = teamMember.league_rank();
+                double thisRowSum = 0.0;
+                Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, N_samps), 
+                    KOKKOS_LAMBDA(int col, double& innerUpdate){
+                        innerUpdate += scale*densityGradX(row,col);
+                    }, 
+                thisRowSum);
+
+                grad(row) = thisRowSum;
+            }
+        );
+    }
     return sumDensity/N_samps;
 }
 
@@ -79,10 +100,21 @@ double KLObjective<MemorySpace>::ObjectiveImpl(StridedMatrix<const double, Memor
 template<typename MemorySpace>
 void KLObjective<MemorySpace>::CoeffGradImpl(StridedMatrix<const double, MemorySpace> data, StridedVector<double, MemorySpace> grad, std::shared_ptr<ConditionalMapBase<MemorySpace>> map) const {
     unsigned int N_samps = data.extent(1);
+    unsigned int grad_dim = grad.extent(0);
     PullbackDensity<MemorySpace> pullback {map, density_};
     StridedMatrix<double, MemorySpace> densityGradX = pullback.LogDensityCoeffGrad(data);
-    ReduceDim<ReduceDimMap::sum,MemorySpace> rc(densityGradX, -1.0/((double) N_samps));
-    Kokkos::parallel_reduce(N_samps, rc, &grad(0));
+
+    double scale = -1.0/((double) N_samps);
+    Kokkos::parallel_for("grad", Kokkos::TeamPolicy<MemoryToExecution<MemorySpace>>(grad_dim, Kokkos::AUTO()),
+        KOKKOS_LAMBDA(auto t, typename Kokkos::TeamPolicy<MemoryToExecution<MemorySpace>>::member_type teamMember){
+            int row = teamMember.league_rank();
+            double thisRowSum = 0.0;
+            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, N_samps), KOKKOS_LAMBDA(int col, double& innerUpdate){
+              innerUpdate += scale*densityGradX(row,col);
+            }, thisRowSum);
+            grad(row) = thisRowSum;
+        }
+    );
 }
 
 // Explicit template instantiation
diff --git a/tests/Test_LinearAlgebra.cpp b/tests/Test_LinearAlgebra.cpp
index 604cf419..d51ded4f 100644
--- a/tests/Test_LinearAlgebra.cpp
+++ b/tests/Test_LinearAlgebra.cpp
@@ -139,47 +139,6 @@ TEST_CASE( "Testing Linear Mat-Mat product", "[LinearAlgebra_MatMat]" ) {
     }
 }
 
-TEST_CASE( "Testing ReduceDim", "ReduceDim" ) {
-    int N = 23;
-    StridedMatrix<double, Kokkos::HostSpace> A = Kokkos::View<double**, Kokkos::HostSpace>("A", 3, N);
-    StridedMatrix<double, Kokkos::HostSpace> AT = Kokkos::View<double**, Kokkos::HostSpace>("AT", N, 3);
-    double ref_avg = ((double) (N+1)) / 2.0;
-    for(int i = 0; i < N; i++) {
-        A(0,i) = i+1;
-        A(1,i) = N-i;
-        A(2,i) = ref_avg;
-
-        AT(i,0) = i+1;
-        AT(i,1) = N-i;
-        AT(i,2) = ref_avg;
-    }
-    ReduceDim<ReduceDimMap::sum,Kokkos::HostSpace,1> rc (A, 1.0);
-    Kokkos::View<double*, Kokkos::HostSpace> avg_col("avg_col", 3);
-    Kokkos::parallel_reduce(N, rc, avg_col.data());
-    CHECK(avg_col(0) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14));
-    CHECK(avg_col(1) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14));
-    CHECK(avg_col(2) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14));
-    
-    ReduceDim<ReduceDimMap::sum,Kokkos::HostSpace,0> rr (AT,1.0);
-    Kokkos::View<double*, Kokkos::HostSpace> avg_row("avg_row", 3);
-    Kokkos::parallel_reduce(N, rr, avg_row.data());
-    CHECK(avg_row(0) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14));
-    CHECK(avg_row(1) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14));
-    CHECK(avg_row(2) == Approx(N*ref_avg).epsilon(1e-14).margin(1e-14));
-
-    rc = ReduceDim<ReduceDimMap::sum,Kokkos::HostSpace>(A, -1.0/((double) N));
-    Kokkos::parallel_reduce(N, rc, avg_col.data());
-    CHECK(avg_col(0) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14));
-    CHECK(avg_col(1) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14));
-    CHECK(avg_col(2) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14));
-    
-    rr = ReduceDim<ReduceDimMap::sum,Kokkos::HostSpace,0>(AT, -1.0/((double) N));
-    Kokkos::parallel_reduce(N, rr, avg_row.data());
-    CHECK(avg_row(0) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14));
-    CHECK(avg_row(1) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14));
-    CHECK(avg_row(2) == Approx(-ref_avg).epsilon(1e-14).margin(1e-14));
-}
-
 TEST_CASE( "Testing LU Factorization", "LinearAlgebra_LU" ) {
     Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::HostSpace> A("A", 3, 3);
     Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::HostSpace> B("B", 3, 2);
diff --git a/tests/Test_TrainMapAdaptive.cpp b/tests/Test_TrainMapAdaptive.cpp
index 4385a2d6..b3c54acd 100644
--- a/tests/Test_TrainMapAdaptive.cpp
+++ b/tests/Test_TrainMapAdaptive.cpp
@@ -16,18 +16,30 @@ void NormalizeSamples(StridedMatrix<double, Kokkos::HostSpace> mat) {
     unsigned int dim = mat.extent(0);
     unsigned int N_samples = mat.extent(1);
     // Take sum of each row, divide by 1/N_samples
-    ReduceDim<ReduceDimMap::sum,MemorySpace,1> rd_mean(mat, 1./(static_cast<double>(N_samples)));
     Kokkos::View<double*, MemorySpace> meanVar("MeanStd", dim);
-    Kokkos::parallel_reduce(N_samples, rd_mean, &meanVar(0));
+    for(int i=0; i<dim; ++i){
+        for(int j=0; j<N_samples; ++j){
+            meanVar(i) += mat(i,j);
+        }
+        meanVar(i) /= static_cast<double>(N_samples);
+    }
+
     // Subtract mean from each point
     using ExecSpace = typename MemoryToExecution<MemorySpace>::Space;
     auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<2>,ExecSpace>({0,0},{dim,N_samples});
     Kokkos::parallel_for("Center data", policy, KOKKOS_LAMBDA(const unsigned int i, const unsigned int j){
         mat(i,j) -= meanVar(i);
     });
+
     // Take || . ||_2^2 of each row, divide by 1/(N_samples-1)
-    ReduceDim<ReduceDimMap::norm,MemorySpace,1> rd_var(mat, 1./(static_cast<double>(N_samples-1)));
-    Kokkos::parallel_reduce(N_samples, rd_var, &meanVar(0));
+    for(int i=0; i<dim; ++i){
+        meanVar(i) = 0.0;
+        for(int j=0; j<N_samples; ++j){
+            meanVar(i) += mat(i,j)*mat(i,j);
+        }
+        meanVar(i) /= static_cast<double>(N_samples-1);
+    }
+
     // Divide each point by appropriate standard deviation
     Kokkos::parallel_for("Scale data", policy, KOKKOS_LAMBDA(const unsigned int i, const unsigned int j){
         mat(i,j) /= std::sqrt(meanVar(i));