Skip to content

[OneDNN] Optimize unsorted segment sum on 1 dim #74

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions tensorflow/core/kernels/segment_reduction_ops_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -358,8 +358,12 @@ struct UnsortedSegmentFunctor<CPUDevice, T, Index, InitialValueF, ReductionF> {
const int64_t N = segment_ids.dimension(0);
const int64_t num_segments = output.dimension(0);
const int64_t inner_dim = data.dimension(1);
const T* data_ptr = data.data();
T* out_ptr = output.data();
ReductionF reduction;

bool data_is_1D = data.dimensions()[1] == 1;

// `num_real_segment` counts the rows actually reduced from input,
// the rows with negative segment index will be excluded.
// It will be used for cost model.
Expand Down Expand Up @@ -410,15 +414,29 @@ struct UnsortedSegmentFunctor<CPUDevice, T, Index, InitialValueF, ReductionF> {
}
}
};

auto reductionWorker1D = [&](int64_t begin, int64_t end) -> void {
for (int64_t i = 0; i < N; i++) {
Index j = internal::SubtleMustCopy(segment_ids(i));
// If `j` is in work scope of this worker, do the reduction.
if (j >= begin && j < end) {
reduction(data_ptr[i], out_ptr[j]);
}
}
};
// Reduction functors include Sum, Max, Min, etc. Simply assume each
// operation costs 5 cycles.
const int64_t kAverTaskSize = num_real_segment / num_segments;
const int64_t compute_cycles = 5 * inner_dim * kAverTaskSize;
const int64_t input_bytes = sizeof(T) * inner_dim * kAverTaskSize;
const int64_t output_bytes = sizeof(T) * inner_dim * kAverTaskSize;
const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
cpu_device.parallelFor(num_segments, cost, reductionWorker);
if(data_is_1D) {
cpu_device.parallelFor(num_segments, cost, reductionWorker1D);
}
else {
cpu_device.parallelFor(num_segments, cost, reductionWorker);
}

}
};

Expand All @@ -435,27 +453,39 @@ struct SumOp {
// Accumulates one input row into the matching output row, element-wise
// (dense / inner_dim > 1 path).
void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
output += data;
}
// Scalar overload used by the 1-D reduction worker (inner_dim == 1):
// folds a single element into the running sum.
void operator()(const T &data, T &output) {
  output = output + data;
}
};

template <typename T>
struct MaxOp {
  // Block form: keep the per-element maximum of the incoming row and the
  // current output row.
  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
    output = data.cwiseMax(output);
  }
  // Scalar form for the 1-D fast path. Written as the exact expansion of
  // std::max(data, output): `data` wins whenever it does not compare less.
  void operator()(const T &data, T &output) {
    output = (data < output) ? output : data;
  }
};

template <typename T>
struct MinOp {
  // Block form: keep the per-element minimum of the incoming row and the
  // current output row.
  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
    output = data.cwiseMin(output);
  }
  // Scalar form for the 1-D fast path. Written as the exact expansion of
  // std::min(data, output): `data` wins unless `output` compares less.
  void operator()(const T &data, T &output) {
    output = (output < data) ? output : data;
  }
};

template <typename T>
struct ProdOp {
  // Block form: multiply the output row, element-wise, by the incoming row.
  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
    output *= data;
  }
  // Scalar form for the 1-D fast path: fold one factor into the running
  // product.
  void operator()(const T &data, T &output) {
    output = output * data;
  }
};
} // namespace functor

Expand Down