Skip to content

Commit 079f156

Browse files
titaiwangmsg authored and uschmue committed
Align AvgPool ceil_mode on last value to torch (#16752)
Fix #16203 Prior to this PR, if `ceil_mode` was on, the calculation of a value would divide by the kernel size even if the remaining pixels were fewer than the kernel size, which caused a difference in this operator between ORT and torch. However, this fix only applies to the change in #15597, which only supports AvgPool since opset 19. The older opset versions remain the same, as they use the mlas files. Also, the PR fixes the shape mismatch caused by the sliding window starting from the padding. More detail: onnx/onnx#6650 (And this PR is also validated with the tests added in onnx/onnx#6650)
1 parent 19d6dee commit 079f156

File tree

4 files changed

+49
-9
lines changed

4 files changed

+49
-9
lines changed

onnxruntime/core/providers/cpu/nn/pool_attributes.h

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -150,30 +150,30 @@ struct PoolAttributes {
150150
case AutoPadType::VALID:
151151
*pad_head = 0;
152152
*pad_tail = 0;
153-
*out_size = ComputeOutputSize(in_size, stride, kernel, 0, dilation);
153+
*out_size = ComputeOutputSize(in_size, stride, kernel, 0, 0, dilation);
154154
break;
155155
case AutoPadType::SAME_LOWER: {
156156
int64_t legacy_target_size = (in_size + stride - 1) / stride;
157157
int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_size;
158158
*pad_head = (pad_needed + 1) / 2;
159159
*pad_tail = pad_needed - *pad_head;
160-
*out_size = ComputeOutputSize(in_size, stride, kernel, pad_needed, dilation);
160+
*out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation);
161161
break;
162162
}
163163
case AutoPadType::SAME_UPPER: {
164164
int64_t legacy_target_size = (in_size + stride - 1) / stride;
165165
int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_size;
166166
*pad_head = pad_needed / 2;
167167
*pad_tail = pad_needed - *pad_head;
168-
*out_size = ComputeOutputSize(in_size, stride, kernel, pad_needed, dilation);
168+
*out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation);
169169
break;
170170
}
171171
default: {
172172
ORT_THROW("Unsupported AutoPad Type.");
173173
}
174174
}
175175
} else {
176-
*out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head + *pad_tail, dilation);
176+
*out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation);
177177
}
178178
}
179179
#if defined(_MSC_VER) && !defined(__clang__)
@@ -184,13 +184,21 @@ struct PoolAttributes {
184184
int64_t ComputeOutputSize(int64_t in_size,
185185
int64_t stride,
186186
int64_t kernel,
187-
int64_t pad_needed,
187+
int64_t pad_head,
188+
int64_t pad_tail,
188189
int64_t dilation) const {
189-
if (ceil_mode == 0) {
190-
return static_cast<int64_t>(static_cast<float>(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1);
190+
int64_t numerator = in_size + pad_head + pad_tail - dilation * (kernel - 1) - 1;
191+
int64_t out_size = numerator / stride + 1;
192+
193+
if (ceil_mode == 1) {
194+
out_size = static_cast<int64_t>(std::ceil(static_cast<float>(numerator) / stride)) + 1;
195+
// Ensure that the last pooling starts inside the image (at least 1 pixel)
196+
// Reference: https://github.com/onnx/onnx/pull/5741
197+
if ((out_size - 1) * stride >= in_size + pad_head) {
198+
--out_size;
199+
}
191200
}
192-
return static_cast<int64_t>(
193-
std::ceil(static_cast<float>(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1));
201+
return out_size;
194202
}
195203
#if defined(_MSC_VER) && !defined(__clang__)
196204
#pragma warning(pop)

onnxruntime/core/providers/cpu/nn/pool_functors.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,7 @@ struct AveragePool1DTask final {
406406
for (int64_t ph = 0; ph < pooled_height; ++ph) {
407407
int64_t hstart = ph * stride_h - pads[0];
408408
int64_t hend = hstart + kernel_shape[0] * dilation_h;
409+
hend = std::min(hend, height + pads[1]);
409410
y_d[ph] = 0;
410411
int total_elements = 0;
411412
for (int64_t h = hstart; h < hend; h += dilation_h) {
@@ -461,9 +462,11 @@ struct AveragePool2DTask final {
461462
for (int64_t ph = 0; ph < pooled_height; ++ph) {
462463
int64_t hstart = ph * stride_h - pads[0];
463464
int64_t hend = hstart + kernel_shape[0] * dilation_h;
465+
hend = std::min(hend, height + pads[1]);
464466
for (int64_t pw = 0; pw < pooled_width; ++pw) {
465467
int64_t wstart = pw * stride_w - pads[1];
466468
int64_t wend = wstart + kernel_shape[1] * dilation_w;
469+
wend = std::min(wend, width + pads[3]);
467470
const int64_t pool_index = ph * pooled_width + pw;
468471
y_d[pool_index] = 0;
469472
int total_elements = 0;
@@ -532,12 +535,15 @@ struct AveragePool3DTask {
532535
for (int64_t ph = 0; ph < pooled_height; ++ph) {
533536
int64_t hstart = ph * stride_h - pads[0];
534537
int64_t hend = hstart + kernel_shape[0] * dilation_h;
538+
hend = std::min(hend, height + pads[1]);
535539
for (int64_t pw = 0; pw < pooled_width; ++pw) {
536540
int64_t wstart = pw * stride_w - pads[1];
537541
int64_t wend = wstart + kernel_shape[1] * dilation_w;
542+
wend = std::min(wend, width + pads[3]);
538543
for (int64_t pd = 0; pd < pooled_depth; ++pd) {
539544
int64_t dstart = pd * stride_d - pads[2];
540545
int64_t dend = dstart + kernel_shape[2] * dilation_d;
546+
dend = std::min(dend, depth + pads[5]);
541547
const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
542548
y_d[pool_index] = 0;
543549
int total_elements = 0;

onnxruntime/test/onnx/TestCase.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -961,6 +961,7 @@ std::unique_ptr<std::set<BrokenTest>> GetBrokenTests(const std::string& provider
961961
{"reduce_prod_empty_set", "unknown version", {}},
962962
{"reduce_sum_empty_set", "unknown version", {}},
963963
{"reduce_sum_square_empty_set_expanded", "unknown version", {}},
964+
{"averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True", "TODO(titaiwang): enable this in the next ONNX release."},
964965
#ifdef ENABLE_TRAINING_CORE
965966
{"adagrad", "not a registered function/op", {}}, // Op not registered.
966967
{"adagrad_multiple", "not a registered function/op", {}}, // Op not registered.

onnxruntime/test/providers/cpu/nn/pool_op_test.cc

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,31 @@ TEST(PoolTest, AveragePool_19_dilation_2d) {
10301030
kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider});
10311031
}
10321032

1033+
TEST(PoolTest, AveragePool_19_ceil_count_include_pad_1d) {
1034+
// TODO: Unskip when fixed #41968513
1035+
if (DefaultDmlExecutionProvider().get() != nullptr) {
1036+
GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect.";
1037+
}
1038+
1039+
OpTester test("AveragePool", 19);
1040+
1041+
test.AddAttribute("auto_pad", "");
1042+
test.AddAttribute("strides", std::vector<int64_t>{3});
1043+
test.AddAttribute("pads", vector<int64_t>{3, 3});
1044+
test.AddAttribute("kernel_shape", vector<int64_t>{7});
1045+
test.AddAttribute("ceil_mode", (int64_t)1);
1046+
test.AddAttribute("count_include_pad", (int64_t)1);
1047+
1048+
std::vector<float> x_vals = {2.0903f, 4.6493f, 1.6320f, -3.2051f, 4.6975f, 4.7296f, 3.3653f, -1.5815f, -2.3832f, 0.9628f, -1.5899f, -2.6820f, 5.7529f, 7.7346f, -0.8910f, -2.0151f, 0.1313f, -0.5374f};
1049+
std::vector<int64_t> x_dims = {1, 2, 9};
1050+
std::vector<int64_t> expected_dims = {1, 2, 4};
1051+
std::vector<float> expected_vals = {0.73807144f, 2.5655572f, 0.8032287f, -0.09990001f, 0.34911433f, 1.0389f, 1.4536142f, -0.40353334f};
1052+
1053+
test.AddInput<float>("X", x_dims, x_vals);
1054+
test.AddOutput<float>("Y", expected_dims, expected_vals);
1055+
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider});
1056+
}
1057+
10331058
TEST(PoolTest, GlobalAveragePool) {
10341059
OpTester test("GlobalAveragePool");
10351060

0 commit comments

Comments (0)