microsoft · pranavsharma · Oct 24, 2020 · Oct 22, 2020 · Oct 23, 2020 · Oct 23, 2020
diff --git a/onnxruntime/core/providers/cpu/ml/scaler.cc b/onnxruntime/core/providers/cpu/ml/scaler.cc
@@ -60,6 +60,8 @@ ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
     KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<int32_t>()).MayInplace(0, 0),
     ScalerOp<int32_t>);
 
+static constexpr int kParallelizationThreshold = 10 * 1000;  // inputs with fewer elements than this are processed by a plain serial loop in Compute
+
 template <typename T>
 ScalerOp<T>::ScalerOp(const OpKernelInfo& info) : OpKernel(info),
                                                   scale_(info.GetAttrsOrDefault<float>("scale")),
@@ -85,7 +87,7 @@ common::Status ScalerOp<T>::Compute(OpKernelContext* context) const {
   int64_t stride = x_dims.size() == 1 ? x_dims[0] : x_dims[1];
   auto* ttp = context->GetOperatorThreadPool();
   auto conditional_batch_call = [ttp, x_size](std::function<void(ptrdiff_t)> f) {
-    if (x_size < 10 * 1000) {  // TODO: tune this, arbitrary threshold
+    if (x_size < kParallelizationThreshold) {  // TODO: tune this, arbitrary threshold
       for (size_t i = 0; i < x_size; ++i) {
         f(i);
       }

diff --git a/onnxruntime/core/providers/cpu/tensor/gather_elements.cc b/onnxruntime/core/providers/cpu/tensor/gather_elements.cc
@@ -25,6 +25,8 @@ ONNX_CPU_OPERATOR_KERNEL(
                                                         DataTypeImpl::GetTensorType<int64_t>()}),
     GatherElements);
 
+static constexpr int kParallelizationThreshold = 10 * 1000;  // when inner_dim_size is below this, core_impl iterates with a plain serial loop
+
 // Some helpers needed for GatherElements op -
 
 // The following method computes the offset in the flattened array
@@ -159,7 +161,7 @@ static void core_impl(const Tensor* input_tensor, const Tensor* indices_tensor,
   int64_t output_counter = 0;
 
   auto conditional_batch_call = [ttp, inner_dim_size](std::function<void(ptrdiff_t)> f) {
-    if (inner_dim_size < 10 * 1000) {  // TODO: tune this, arbitrary threshold
+    if (inner_dim_size < kParallelizationThreshold) {  // TODO: tune this, arbitrary threshold
       for (int64_t i = 0; i < inner_dim_size; ++i) {
         f(i);
       }

diff --git a/onnxruntime/test/providers/cpu/ml/scaler_test.cc b/onnxruntime/test/providers/cpu/ml/scaler_test.cc
@@ -3,19 +3,29 @@
 
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
+
 using namespace std;
 namespace onnxruntime {
 namespace test {
 
 template <typename T>
-void TestScalar() {
+void TestScalar(bool use_big_input = false) {
   OpTester test("Scaler", 1, onnxruntime::kMLDomain);
   vector<float> scale{3.f, -4.f, 3.0f};
   vector<float> offset{4.8f, -0.5f, 77.0f};
   test.AddAttribute("scale", scale);
   test.AddAttribute("offset", offset);
-  vector<T> input{1, -2, 3, 4, 5, -6};
-  vector<int64_t> dims{2, 3};
+  vector<T> input;
+  vector<int64_t> dims;
+
+  if (!use_big_input) {
+    input = vector<T>{1, -2, 3, 4, 5, -6};
+    dims = {2, 3};
+  } else {
+    input.resize(15 * 1000);  // must be >= kParallelizationThreshold in scaler.cc
+    std::iota(std::begin(input), std::end(input), static_cast<T>(1));
+    dims = {5000, 3};
+  }
 
   // prepare expected output
   vector<float> expected_output;
@@ -33,6 +43,7 @@ TEST(MLOpTest, ScalerOp) {
   TestScalar<double>();
   TestScalar<int64_t>();
   TestScalar<int32_t>();
+  TestScalar<float>(true);  // use big input
 }
 
 TEST(MLOpTest, ScalerOpScaleOffsetSize1) {
@@ -55,5 +66,27 @@ TEST(MLOpTest, ScalerOpScaleOffsetSize1) {
   test.Run();
 }
 
+// tests invocation via TryBatchParallelFor: single-element scale/offset applied to a 15K-element input
+TEST(MLOpTest, ScalerOpScaleOffsetSize1BigInput) {
+  OpTester test("Scaler", 1, onnxruntime::kMLDomain);
+  vector<float> scale{3.f};  // a single scale value; the expected output below applies it to every element
+  vector<float> offset{4.8f};  // a single offset value; likewise applied to every element
+  test.AddAttribute("scale", scale);
+  test.AddAttribute("offset", offset);
+  vector<float> input(15 * 1000);  // 15000 elements; must be >= kParallelizationThreshold in scaler.cc
+  std::iota(std::begin(input), std::end(input), 1.0f);  // fill with 1, 2, 3, ...
+  vector<int64_t> dims{3, 5000};  // 3 * 5000 == input.size()
+
+  // prepare expected output: (x - offset) * scale for every element
+  vector<float> expected_output;
+  for (size_t i = 0; i < input.size(); ++i) {
+    expected_output.push_back((input[i] - offset[0]) * scale[0]);
+  }
+
+  test.AddInput<float>("X", dims, input);
+  test.AddOutput<float>("Y", dims, expected_output);
+  test.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc
@@ -315,5 +315,23 @@ TEST(GatherElementsOpTest, string) {
   RunTypedTest<std::string>();
 }
 
+TEST(GatherElementsOpTest, BigIndices) {
+  // int32_t indices - axis 0, with an indices tensor sized to reach kParallelizationThreshold
+  OpTester test1("GatherElements", 11);
+
+  test1.AddAttribute<int64_t>("axis", 0LL);
+  const int kNumIndices = 10 * 1000;  // must be >= kParallelizationThreshold in gather_elements.cc
+  std::vector<float> input(2 * kNumIndices);  // backing storage for a {2, kNumIndices} data tensor
+  std::iota(std::begin(input), std::end(input), 0.f);  // fill with 0, 1, 2, ...
+  test1.AddInput<float>("data", {2, kNumIndices}, input);
+
+  std::vector<int32_t> indices(kNumIndices, 0);  // every index is 0, so each output element is expected to come from row 0 of data
+  std::vector<float> output(kNumIndices);
+  std::iota(std::begin(output), std::end(output), 0.f);  // expected: 0, 1, ..., kNumIndices - 1 (the first kNumIndices values of input)
+  test1.AddInput<int32_t>("indices", {1, kNumIndices}, indices);
+  test1.AddOutput<float>("output", {1, kNumIndices}, output);
+  test1.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime