 // Licensed under the MIT License.
 
 #include "core/providers/cuda/cu_inc/common.cuh"
+#include "core/providers/cuda/cuda_allocator.h"
 #include "core/providers/cuda/cuda_common.h"
 #include "core/providers/cuda/atomic/common.cuh"
+#include "core/providers/cuda/reduction/reduction_utils.cuh"
 #include "orttraining/training_ops/cuda/math/isfinite.cuh"
+#include "orttraining/training_ops/cuda/optimizer/common.h"
 #include "orttraining/training_ops/cuda/optimizer/common.cuh"
 #include "orttraining/training_ops/cuda/optimizer/lamb.h"
 
@@ -50,8 +53,8 @@ __device__ __forceinline__ void _LambComputeDirectionRule(
   const T3 m2_new_tmp_corrected = m2_new_tmp / beta_correction;
 
   // Save regularized update direction to output.
-  const T2 d_tmp = lambda * w +
-      T1(m1_new_tmp_corrected / (_Sqrt(m2_new_tmp_corrected) + epsilon));
+  const T2 d_tmp = lambda * w +
+      T1(m1_new_tmp_corrected / (_Sqrt(m2_new_tmp_corrected) + epsilon));
 
   // Things are updated only if the direction is finite.
   if (_IsFiniteScalar(d_tmp)) {
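The direction computed in the hunk above is the bias-corrected Adam step plus a decoupled weight-decay term. As a reference point, here is a minimal scalar sketch of that rule; the plain `float` types and the function name are illustrative, not the kernel's templates, and the exponential-average moment update it assumes is the standard Adam/LAMB convention rather than something shown in this hunk.

```cuda
#include <cmath>

// Standalone scalar sketch (not the kernel): the regularized LAMB direction
// d = lambda * w + m1_hat / (sqrt(m2_hat) + epsilon), with bias-corrected
// moments m1_hat = m1_new / alpha_correction and m2_hat = m2_new / beta_correction.
float LambDirectionSketch(float w, float m1_new, float m2_new,
                          float lambda, float epsilon,
                          float alpha_correction, float beta_correction) {
  const float m1_hat = m1_new / alpha_correction;  // bias-corrected first moment
  const float m2_hat = m2_new / beta_correction;   // bias-corrected second moment
  return lambda * w + m1_hat / (std::sqrt(m2_hat) + epsilon);
}
```

Per the `_IsFiniteScalar` guard in the hunk's trailing context, the weight and moment outputs are only written when this value is finite.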
@@ -145,22 +148,22 @@ void LambComputeDirection(
 }
 
 #define SPECIALIZED_LAMB_COMPUTE_DIRECTION(T1, T2, T3, T_GRAD_NORM) \
-  template void LambComputeDirection( \
-      const T1* weights, \
-      const T2* grads, \
-      const T3* moment_1, \
-      const T3* moment_2, \
-      const T1* loss_scale, \
-      const T_GRAD_NORM* grad_norm, \
-      T3 alpha, \
-      T3 beta, \
-      T1 lambda, \
-      T3 epsilon, \
-      T3 alpha_correction, \
-      T3 beta_correction, \
-      T2* weights_out, \
-      T3* moment_1_out, \
-      T3* moment_2_out, \
+  template void LambComputeDirection( \
+      const T1* weights, \
+      const T2* grads, \
+      const T3* moment_1, \
+      const T3* moment_2, \
+      const T1* loss_scale, \
+      const T_GRAD_NORM* grad_norm, \
+      T3 alpha, \
+      T3 beta, \
+      T1 lambda, \
+      T3 epsilon, \
+      T3 alpha_correction, \
+      T3 beta_correction, \
+      T2* weights_out, \
+      T3* moment_1_out, \
+      T3* moment_2_out, \
       size_t count);
 
 SPECIALIZED_LAMB_COMPUTE_DIRECTION(float, float, float, float)
@@ -182,9 +185,8 @@ __device__ __forceinline__ void _LambUpdateRule(
     T2* w_new,
     T3* g_new,
     T_MIXED_PRECISION_FP* w_mixed_precision_new) {
-  // Confidence coefficeint of this update.
-  const T2 ratio = (w_norm != T2(0.0f) && r_norm != T2(0.0f)) ?
-      T2(eta) * _Max(T2(ratio_min), _Min(T2(ratio_max), _Sqrt(w_norm / r_norm))) : T2(eta);
+  // Confidence coefficeint of this update.
+  const T2 ratio = (w_norm != T2(0.0f) && r_norm != T2(0.0f)) ? T2(eta) * _Max(T2(ratio_min), _Min(T2(ratio_max), _Sqrt(w_norm / r_norm))) : T2(eta);
 
   // Compute delta using the saved update direction.
   const T2 delta = -ratio * T2(d);
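The one-line ternary above computes LAMB's clamped trust ratio. A host-side sketch, assuming scalar `float` inputs and substituting `std::` functions for the device helpers `_Max`/`_Min`/`_Sqrt`:

```cuda
#include <algorithm>
#include <cmath>

// Host-side sketch of the confidence (trust) ratio: eta scaled by
// sqrt(w_norm / r_norm), clamped to [ratio_min, ratio_max]; if either
// norm is zero, the ratio falls back to plain eta.
float LambTrustRatioSketch(float eta, float w_norm, float r_norm,
                           float ratio_min, float ratio_max) {
  if (w_norm == 0.0f || r_norm == 0.0f) return eta;
  return eta * std::max(ratio_min, std::min(ratio_max, std::sqrt(w_norm / r_norm)));
}
```

The step applied to the weight is then `delta = -ratio * d`, as in the hunk's trailing context.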
@@ -313,7 +315,7 @@ __global__ void LambMultiTensorComputeDirectionImpl(
   T3* m2_new = reinterpret_cast<T3*>(chunk_group.tensor_ptrs[5][group_index]) + chunk_start;
   const T1 scale = _ComputeGradScale<T1, T_GRAD_NORM, T1>(loss_scale, g_norm);
 
-#pragma unroll
+#pragma unroll
   for (int i = threadIdx.x; i < chunk_size && i + chunk_start < tensor_size; i += blockDim.x) {
     _LambComputeDirectionRule(
         scale,
@@ -359,16 +361,16 @@ void LambMultiTensorComputeDirectionFunctor<T1, T2, T3, T_GRAD_NORM>::operator()
       beta_correction);
 }
 
-#define INSTANTIATE_LAMB_STAGE1_MULTI_TENSOR_FUNCTOR(T1, T2, T3, T_GRAD_NORM) \
+#define INSTANTIATE_LAMB_STAGE1_MULTI_TENSOR_FUNCTOR(T1, T2, T3, T_GRAD_NORM) \
   template void LambMultiTensorComputeDirectionFunctor<T1, T2, T3, T_GRAD_NORM>::operator()( \
-      ChunkGroup<6> chunk_group, \
-      const T1* loss_scale, \
-      const T_GRAD_NORM* g_norm, \
-      const T1 lambda, \
-      const T3 alpha, \
-      const T3 beta, \
-      const T3 epsilon, \
-      const T3 alpha_correction, \
+      ChunkGroup<6> chunk_group, \
+      const T1* loss_scale, \
+      const T_GRAD_NORM* g_norm, \
+      const T1 lambda, \
+      const T3 alpha, \
+      const T3 beta, \
+      const T3 epsilon, \
+      const T3 alpha_correction, \
       const T3 beta_correction);
 
 INSTANTIATE_LAMB_STAGE1_MULTI_TENSOR_FUNCTOR(float, float, float, float)
@@ -440,9 +442,15 @@ INSTANTIATE_LAMB_MULTI_TENSOR_UPDATE_FUNCTOR(double, double, double, half)
 INSTANTIATE_LAMB_MULTI_TENSOR_UPDATE_FUNCTOR(half, float, half, half)
 INSTANTIATE_LAMB_MULTI_TENSOR_UPDATE_FUNCTOR(float, float, half, half)
 
+// w_buffer[i], d_buffer[i] is used to store the squared sum of all elements processed by the i-th block.
+// sync_range_and_lock is used for a well ordered reduction over blocks spanning the same tensor
 template <typename TIn1, typename TIn2, typename TOut1, typename TOut2, typename TBuf>
 __launch_bounds__(ChunkGroup<4>::thread_count_per_block)
-__global__ void LambMultiTensorReductionImpl(ChunkGroup<4> chunk_group) {
+__global__ void LambMultiTensorReductionImpl(
+    ChunkGroup<4> chunk_group,
+    TOut1* w_buffer,
+    TOut2* d_buffer,
+    LambMultiTensorSyncRangeAndLock* sync_range_and_lock) {
   const int group_index = chunk_group.block_index_to_tensor_group_index[blockIdx.x];
   const int tensor_size = chunk_group.tensor_sizes[group_index];
   const int chunk_size = chunk_group.chunk_size;
@@ -469,7 +477,7 @@ __global__ void LambMultiTensorReductionImpl(ChunkGroup<4> chunk_group) {
     }
   }
 
-  // Thread count in a block must be a multiple of GPU_WARP_SIZE.
+  // Thread count in a block must be a multiple of GPU_WARP_SIZE.
 #pragma unroll
   for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
     w_sum += WARP_SHFL_DOWN(w_sum, stride);
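The loop in this hunk's context reduces per-thread partial sums across a warp. A standalone sketch of that pattern, assuming `WARP_SHFL_DOWN` wraps `__shfl_down_sync` with a full mask (that mapping is an assumption here, not taken from the diff):

```cuda
// Warp-level sum sketch: each of the 32 lanes contributes v; after the loop
// lane 0 holds the sum of all lanes. The full mask assumes every lane is active.
__device__ float WarpSumSketch(float v) {
  for (int stride = 32 / 2; stride > 0; stride /= 2) {
    v += __shfl_down_sync(0xffffffff, v, stride);
  }
  return v;
}
```

Each shuffle halves the number of live partial sums, so a 32-lane warp is fully reduced after five steps.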
@@ -502,14 +510,77 @@ __global__ void LambMultiTensorReductionImpl(ChunkGroup<4> chunk_group) {
     __syncthreads();
   }
 
+  // ascertain the range of blocks with the associated tensor
+  // note: if non-ordered reduction is OK, then atomicAdd over blocks could suffice
+  const int leading_block_in_tensor = sync_range_and_lock[group_index].leading_block;
+  const int num_blocks_in_tensor = sync_range_and_lock[group_index].number_blocks;
+
+  if (num_blocks_in_tensor == 1) {
+    if (threadIdx.x == 0) {
+      *w_norm = TOut1(w_shared_memory_[0]);
+      *d_norm = TOut2(d_shared_memory_[0]);
+    }
+    return;
+  }
+
   if (threadIdx.x == 0) {
-    atomic_add(w_norm, TOut1(w_shared_memory_[0]));
-    atomic_add(d_norm, TOut2(d_shared_memory_[0]));
+    w_buffer[blockIdx.x] = w_shared_memory_[0];
+    d_buffer[blockIdx.x] = d_shared_memory_[0];
   }
+
+  __threadfence();
+  __syncthreads();
+
+  // use lock to determine if this is last block for given tensor
+  __shared__ bool is_last_block_done;
+
+  if (threadIdx.x == 0) {
+    int* p_lock = &sync_range_and_lock[group_index].completed_blocks;
+    int counter = atomicAdd(p_lock, 1);
+    is_last_block_done = (counter == num_blocks_in_tensor - 1);
+  }
+  __syncthreads();
+
+  // only last block to finish for associated tensor enters below
+  if (is_last_block_done) {
+    const int pow2_bound = least_pow2_bound(num_blocks_in_tensor);
+    int blockid = leading_block_in_tensor + threadIdx.x;
+    for (int stride = pow2_bound / 2; stride > 0; stride /= 2) {
+      if (threadIdx.x < stride && threadIdx.x + stride < num_blocks_in_tensor) {
+        w_buffer[blockid] += w_buffer[blockid + stride];
+        d_buffer[blockid] += d_buffer[blockid + stride];
+      }
+      __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+      *w_norm = TOut1(w_buffer[leading_block_in_tensor]);
+      *d_norm = TOut2(d_buffer[leading_block_in_tensor]);
+    }
+  }
+}
+
+CudaKernel::CudaAsyncBuffer<LambMultiTensorSyncRangeAndLock> compute_tensor_range_and_lock(ChunkGroup<4> chunk_group, const CudaKernel& kernel) {
+  const int num_blocks = chunk_group.chunk_count;
+
+  // sync_range_and_lock is a struct consisting of (start_block, num_blocks, lock) for each tensor
+  // Note: Adding such info to chunk group causes overflow (unless max tensors is reduced)
+  const int max_tensors = ChunkGroup<4>::max_tensor_group_count;
+  LambMultiTensorSyncRangeAndLock initial = {0, 0, 0};
+  CudaKernel::CudaAsyncBuffer<LambMultiTensorSyncRangeAndLock> sync_range_and_lock(&kernel, initial, max_tensors);
+  for (int block_index = num_blocks - 1; block_index >= 0; block_index--) {
+    int tensor_index = chunk_group.block_index_to_tensor_group_index[block_index];
+    auto& tensor_block_span = sync_range_and_lock.CpuPtr()[tensor_index];
+    tensor_block_span.leading_block = block_index;
+    tensor_block_span.number_blocks++;
+  }
+  sync_range_and_lock.CopyToGpu();
+
+  return sync_range_and_lock;
 }
 
 template <typename TIn1, typename TIn2, typename TOut1, typename TOut2, typename TBuf>
-void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()(ChunkGroup<4> chunk_group) {
+void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()(ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size) {
   // thread count per block.
   constexpr int thread_count = ChunkGroup<4>::thread_count_per_block;
   // shared memory's size per block.
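The hunk above is the core of the change: instead of `atomic_add` into `w_norm`/`d_norm` (whose accumulation order varies from run to run), each block writes its partial squared sums to `w_buffer`/`d_buffer`, and the last block to finish for a tensor, detected via the `completed_blocks` counter, folds those partials in a fixed order. Below is a self-contained CUDA sketch of this "last block done" pattern reduced to a single tensor and a single output; the names and the simplification are illustrative, and the per-tensor `sync_range_and_lock` bookkeeping is omitted.

```cuda
#include <cuda_runtime.h>

// Sketch of the "last block done" pattern for one tensor: every block writes
// its partial sum of squares to partials[blockIdx.x]; the block that increments
// the counter last then folds the partials in a fixed order, so the result does
// not depend on block scheduling. Expects: counter zero-initialized, partials
// sized to gridDim.x, and blockDim.x a power of two <= 256.
__global__ void SumSquaresLastBlockDone(const float* x, int n,
                                        float* partials, int* counter,
                                        float* out) {
  __shared__ float shm[256];
  __shared__ bool is_last_block_done;

  // Grid-stride accumulation of x[i]^2 into a per-thread partial.
  float v = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
    v += x[i] * x[i];
  }
  shm[threadIdx.x] = v;
  __syncthreads();

  // In-block tree reduction into shm[0].
  for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
    if (threadIdx.x < stride) shm[threadIdx.x] += shm[threadIdx.x + stride];
    __syncthreads();
  }

  if (threadIdx.x == 0) partials[blockIdx.x] = shm[0];
  __threadfence();  // make the partial visible before the counter is bumped
  __syncthreads();

  if (threadIdx.x == 0) {
    is_last_block_done = (atomicAdd(counter, 1) == gridDim.x - 1);
  }
  __syncthreads();

  if (is_last_block_done && threadIdx.x == 0) {
    float total = 0.0f;
    for (int b = 0; b < gridDim.x; ++b) total += partials[b];  // fixed order
    *out = total;
  }
}
```

A caller would zero `counter` and size `partials` to one element per block. The kernel in the diff additionally tracks a leading block and block count per tensor (set up by `compute_tensor_range_and_lock`) and folds the partials with a tree reduction bounded by `least_pow2_bound`, so many tensors can share one launch.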
@@ -519,11 +590,22 @@ void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()
   ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0);
   ORT_ENFORCE((thread_count & (thread_count - 1)) == 0);
 
-  LambMultiTensorReductionImpl<TIn1, TIn2, TOut1, TOut2, TBuf><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(chunk_group);
+  const int num_blocks = chunk_group.chunk_count;
+  const size_t w_buffer_size = num_blocks * sizeof(TOut1);
+  const size_t d_buffer_size = num_blocks * sizeof(TOut2);
+
+  ORT_ENFORCE(w_buffer_size + d_buffer_size <= reduction_buffer_size);
+
+  TOut1* w_buffer = reinterpret_cast<TOut1*>(reduction_buffer);
+  TOut2* d_buffer = reinterpret_cast<TOut2*>(w_buffer + num_blocks);
+
+  auto sync_range_and_lock = compute_tensor_range_and_lock(chunk_group, kernel);
+  LambMultiTensorReductionImpl<TIn1, TIn2, TOut1, TOut2, TBuf><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(
+      chunk_group, w_buffer, d_buffer, sync_range_and_lock.GpuPtr());
 }
 
 #define INSTANTIATE_LAMB_MULTI_TENSOR_REDUCTION_FUNCTOR(TIn1, TIn2, TOut1, TOut2, TBuf) \
-  template void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()(ChunkGroup<4> chunk_group);
+  template void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()(ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size);
 
 INSTANTIATE_LAMB_MULTI_TENSOR_REDUCTION_FUNCTOR(float, float, float, float, float)
 INSTANTIATE_LAMB_MULTI_TENSOR_REDUCTION_FUNCTOR(double, double, double, double, double)
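With the new `operator()` signature, the caller supplies a scratch buffer holding the per-block partials: `w_buffer` (one `TOut1` per block) followed immediately by `d_buffer` (one `TOut2` per block), which the `ORT_ENFORCE` above checks against `reduction_buffer_size`. A hypothetical sizing helper for that arithmetic, not part of this change:

```cuda
#include <cstddef>

// Hypothetical sizing helper: bytes of scratch space the reduction launch
// needs for its per-block partial sums (w_buffer then d_buffer back to back).
template <typename TOut1, typename TOut2>
size_t LambReductionScratchBytes(int num_blocks) {
  return static_cast<size_t>(num_blocks) * sizeof(TOut1) +
         static_cast<size_t>(num_blocks) * sizeof(TOut2);
}
```

For example, a launch with 1024 chunks and `float` outputs needs 1024 * 4 + 1024 * 4 = 8192 bytes.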