
Commit 0d7e780

albanD authored and facebook-github-bot committed
Fix broadcasting of cdist backward (pytorch#56605)
Summary:
Pull Request resolved: pytorch#56605

Fix pytorch#55370

Test Plan: Imported from OSS

Reviewed By: ailzhang

Differential Revision: D27939202

Pulled By: albanD

fbshipit-source-id: a4ac50a7b504c24f47f5343414fb57523546a0c7
1 parent 3ddcc8d commit 0d7e780
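
For context, a minimal Python sketch of the call pattern this commit fixes (shapes taken from the new small_S test cases below; the printed gradient shapes follow from autograd's usual broadcast reduction and are not output copied from the PR):

import torch

# Batch dims (2, 1) and (1,) broadcast to (2, 1); before this fix, the backward
# of a broadcasted cdist mishandled the batch dimensions (pytorch#55370).
x1 = torch.randn(2, 1, 1, 2, requires_grad=True)   # (batch..., r1=1, m=2)
x2 = torch.randn(1, 2, 2, requires_grad=True)      # (batch..., r2=2, m=2)

d = torch.cdist(x1, x2, p=2)   # shape (2, 1, 1, 2)
d.sum().backward()             # exercises _cdist_backward with broadcasting

print(x1.grad.shape)  # torch.Size([2, 1, 1, 2])
print(x2.grad.shape)  # torch.Size([1, 2, 2])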

3 files changed: +43 -15 lines changed


aten/src/ATen/native/Distance.cpp

Lines changed: 34 additions & 6 deletions
@@ -145,7 +145,31 @@ Tensor _cdist_forward(const Tensor& x1, const Tensor& x2, const double p, c10::o
   return result;
 }
 
-Tensor _cdist_backward(const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& cdist) {
+Tensor _cdist_backward(const Tensor& grad, const Tensor& _x1, const Tensor& _x2, const double p, const Tensor& cdist) {
+  // Broadcasting might generate non-contiguous Tensors, so handle it before doing checks
+  int64_t c1 = _x1.size(-1);
+  int64_t c2 = _x2.size(-1);
+  int64_t r1 = _x1.size(-2);
+  int64_t r2 = _x2.size(-2);
+  auto dim1 = _x1.dim();
+  auto dim2 = _x2.dim();
+  IntArrayRef batch_tensor1(_x1.sizes().data(), dim1 - 2);
+  IntArrayRef batch_tensor2(_x2.sizes().data(), dim2 - 2);
+  std::vector<int64_t> expand_batch_portion = infer_size(batch_tensor1, batch_tensor2);
+  std::vector<int64_t> tensor1_expand_size(expand_batch_portion);
+  tensor1_expand_size.insert(tensor1_expand_size.end(), {r1, c1});
+  std::vector<int64_t> tensor2_expand_size(expand_batch_portion);
+  tensor2_expand_size.insert(tensor2_expand_size.end(), {r2, c2});
+
+  Tensor x1 = _x1;
+  if (tensor1_expand_size != x1.sizes()) {
+    x1 = x1.expand(tensor1_expand_size).contiguous();
+  }
+  Tensor x2 = _x2;
+  if (tensor2_expand_size != x2.sizes()) {
+    x2 = x2.expand(tensor2_expand_size).contiguous();
+  }
+
   TORCH_CHECK(x1.is_contiguous(), "_cdist_backward requires X1 to be contiguous");
   TORCH_CHECK(x2.is_contiguous(), "_cdist_backward requires X2 to be contiguous");
   TORCH_CHECK(cdist.is_contiguous(), "_cdist_backward requires dist to be contiguous");
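
For reference, a Python sketch of the batch-broadcast-and-expand step above, using torch.broadcast_shapes in place of ATen's infer_size (example shapes are made up; the C++ code additionally skips the expand when the size already matches):

import torch

_x1 = torch.randn(2, 1, 4, 3)   # batch (2, 1), r1=4, c1=3
_x2 = torch.randn(5, 6, 3)      # batch (5,),   r2=6, c2=3

batch1, batch2 = _x1.shape[:-2], _x2.shape[:-2]
expand_batch = torch.broadcast_shapes(batch1, batch2)          # (2, 5)

x1 = _x1.expand(*expand_batch, *_x1.shape[-2:]).contiguous()   # (2, 5, 4, 3)
x2 = _x2.expand(*expand_batch, *_x2.shape[-2:]).contiguous()   # (2, 5, 6, 3)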
@@ -156,13 +180,17 @@ Tensor _cdist_backward(const Tensor& grad, const Tensor& x1, const Tensor& x2, c
   TORCH_CHECK(device1 == kCPU || device1 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X1 got: ", device1);
   auto device2 = x2.device().type();
   TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X2 got: ", device2);
-  IntArrayRef batch_tensor1(x1.sizes().data(), std::max<int64_t>(x1.dim() - 2, 0));
-  const int64_t batch_product = c10::multiply_integers(batch_tensor1);
+
+  // Compute the linearized batch size
+  const int64_t batch_product = c10::multiply_integers(expand_batch_portion);
+
   Tensor grad_x1 =
-      at::empty_like(x1, x1.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT)
-      .view({batch_product, n, m});
+      at::empty({batch_product, n, m}, x1.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   cdist_backward_stub(device1, grad_x1, grad, x1, x2, p, cdist);
-  return grad_x1;
+
+  // Use x1.size() here and not the original size of _x1.size() as this gradient is not taking broadcasting into account
+  // Broadcasting will be handled automatically by the autograd engine
+  return grad_x1.view(x1.sizes());
 }
 
 Tensor _pdist_forward(const Tensor& self, const double p) {
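
The returned gradient has x1's expanded shape rather than _x1's original shape; a small Python sketch of why that is enough (the autograd engine sum-reduces broadcast dimensions back to the input's shape):

import torch

_x1 = torch.randn(1, 4, 3, requires_grad=True)
x1 = _x1.expand(2, 4, 3)            # broadcasted view, as inside the op
x1.backward(torch.ones(2, 4, 3))    # gradient given at the expanded shape
print(_x1.grad.shape)               # torch.Size([1, 4, 3]) -- reduced over dim 0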

aten/src/ATen/native/cuda/DistanceKernel.cu

Lines changed: 4 additions & 7 deletions
@@ -331,7 +331,8 @@ void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor
   const int64_t r1 = x1.size(-2);
   const int64_t r2 = x2.size(-2);
   const int64_t m = x1.size(-1);
-  int64_t batch = x1.dim() > 2 ? x1.size(0) : 1;
+  // Just like we do in the CPU code, assume that result is always batched
+  int64_t batch = result.size(0);
   const int block_x = 64;
   const int block_y = 16;
   const int grid_x = (m + block_x * 8 - 1) / (block_x * 8);
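
A sketch of the "result is always batched" contract assumed here: after the Distance.cpp change, the gradient is allocated as a 3-D tensor (linearized batch, n, m), so result.size(0) is well defined even for inputs with no batch dimensions (n and m naming x1's last two sizes follows the C++ code):

import torch

x1 = torch.randn(5, 3)                      # no batch dimensions
batch_product, n, m = 1, x1.size(-2), x1.size(-1)
result = torch.empty(batch_product, n, m)   # mirrors at::empty({batch_product, n, m}, ...)
print(result.size(0))                       # 1 -- what the kernel reads as `batch`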
@@ -352,7 +353,7 @@ void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor
   //we call grad.contiguous() before backward, so stride is guaranteed to be 1
   const int64_t gs = 1;
 
-  Tensor buffer = (x1.dim() > 2) ? at::empty({batch, r2, r1, m}, result.options()) : at::empty({r2, r1, m}, result.options());
+  Tensor buffer = at::empty({batch, r2, r1, m}, result.options());
   AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist_cuda_backward", [&] {
     if (p == 1.0) {
       cdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(buffer.data_ptr<scalar_t>(),
@@ -382,11 +383,7 @@ void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor
     }
   });
 
-  if (x1.dim() > 2) {
-    at::sum_out(result, buffer, 1);
-  } else {
-    at::sum_out(result, buffer, 0);
-  }
+  at::sum_out(result, buffer, 1);
 
 }
 
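To illustrate the buffer layout and the dim-1 reduction, a hand-rolled p=2 backward in Python, checked against autograd (this assumes buffer[b, j, i, :] holds the contribution of x2[b, j] to the gradient of x1[b, i]; it is a sketch, not the kernel itself):

import torch

batch, r1, r2, m = 2, 4, 5, 3
x1 = torch.randn(batch, r1, m, dtype=torch.double, requires_grad=True)
x2 = torch.randn(batch, r2, m, dtype=torch.double)
dist = torch.cdist(x1, x2, p=2)            # (batch, r1, r2)
grad = torch.randn_like(dist)

with torch.no_grad():
    diff = x1.unsqueeze(2) - x2.unsqueeze(1)                                     # (batch, r1, r2, m)
    buffer = (grad / dist).transpose(1, 2).unsqueeze(-1) * diff.transpose(1, 2)  # (batch, r2, r1, m)
    grad_x1 = buffer.sum(dim=1)                                                  # like at::sum_out(result, buffer, 1)

dist.backward(grad)
print(torch.allclose(grad_x1, x1.grad))    # True (up to floating-point error)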

torch/testing/_internal/common_methods_invocations.py

Lines changed: 5 additions & 2 deletions
@@ -932,6 +932,7 @@ def sample_inputs_broadcast_to(op_info, device, dtype, requires_grad, **kwargs):
                              args=(shape,)) for size, shape in test_cases)
 
 def sample_inputs_cdist(op_info, device, dtype, requires_grad, **kwargs):
+    small_S = 2
     test_cases = (
         ((S, S, 2), (S, S + 1, 2)),
         ((S, S), (S, S)),
@@ -942,8 +943,10 @@ def sample_inputs_cdist(op_info, device, dtype, requires_grad, **kwargs):
         ((1, 1), (S, 1)),
         # TODO enable that as this causes "Floating point exception (core dumped)"
         # ((0, 5), (4, 5)),
-        # TODO enable that as this causes https://github.com/pytorch/pytorch/issues/55370
-        # ((S, S, 21, 2), (S, S, 22, 2))
+        # Using S here would make this one test take 9s
+        ((small_S, small_S, small_S + 1, 2), (small_S, small_S, small_S + 2, 2)),
+        ((small_S, 1, 1, small_S), (1, small_S, small_S)),
+        ((1, 1, small_S), (small_S, 1, small_S, small_S)),
     )
 
     samples = []
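
A hypothetical spot-check of one of the new broadcasting shapes (small_S = 2), along the lines of what the OpInfo-driven autograd tests exercise; this snippet is not part of the PR:

import torch
from torch.autograd import gradcheck

small_S = 2
x1 = torch.randn(small_S, 1, 1, small_S, dtype=torch.double, requires_grad=True)
x2 = torch.randn(1, small_S, small_S, dtype=torch.double, requires_grad=True)
print(gradcheck(lambda a, b: torch.cdist(a, b, p=2), (x1, x2)))  # True once the broadcasted backward is correct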
