Skip to content

Commit 7fd0bd7

Browse files
committed
Update on "Add OpInfo test to check that floating point inputs in OpInfos have requires_grad set to True"
This test detected a number of sampling methods that were not generating the samples as expected, e.g. `index_put`, `cosine_embedding`, `stft`, but perhaps most notably the generator for `BinOps`. It also detected that `remainder` and `fmod` did not have the backward formula for the second input implemented. I added this in the previous PR. [ghstack-poisoned]
2 parents bebda4a + c13e6bf commit 7fd0bd7

File tree

5 files changed

+42
-27
lines changed

5 files changed

+42
-27
lines changed

c10/macros/Macros.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -484,12 +484,13 @@ __host__ __device__
484484
#endif // HAS_DEMANGLE
485485

486486
#ifdef __clang__
487-
#define _C10_PRAGMA__(string) _Pragma( #string )
488-
#define _C10_PRAGMA_(string) _C10_PRAGMA__( string )
487+
#define _C10_PRAGMA__(string) _Pragma(#string)
488+
#define _C10_PRAGMA_(string) _C10_PRAGMA__(string)
489489
#define C10_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push")
490490
#define C10_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop")
491-
#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) _C10_PRAGMA_(clang diagnostic ignored flag)
492-
#define C10_CLANG_HAS_WARNING(flag) __has_warning( flag )
491+
#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) \
492+
_C10_PRAGMA_(clang diagnostic ignored flag)
493+
#define C10_CLANG_HAS_WARNING(flag) __has_warning(flag)
493494
#else
494495
#define C10_CLANG_DIAGNOSTIC_PUSH()
495496
#define C10_CLANG_DIAGNOSTIC_POP()

test/cpp/jit/test_misc.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2870,7 +2870,7 @@ TEST_F(Composed, ComposedOp) {
28702870
bool fusable_on_device = torch::jit::tensorexpr::getTEMustUseLLVMOnCPU();
28712871
torch::jit::tensorexpr::getTEMustUseLLVMOnCPU() = false;
28722872
setTensorExprDynamicShapeFusionEnabled(true);
2873-
FuseTensorExprs(graph, /*min_group_size*/2, /*add_composed_op*/true);
2873+
FuseTensorExprs(graph, /*min_group_size*/ 2, /*add_composed_op*/ true);
28742874
Code code(graph, "");
28752875
InterpreterState interpreter{code};
28762876
std::vector<IValue> stack = {a, b};

test/cpp/tensorexpr/test_dynamic_shapes.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,6 @@ TEST(DynamicShapes, GraphWithPartiallySymbolicOutput) {
318318
symbolic_strides[y_inp] = input_desc;
319319
symbolic_strides[graph->outputs().at(0)] = input_desc;
320320

321-
322321
TensorExprKernel kernel(
323322
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
324323

@@ -443,7 +442,6 @@ TEST(DynamicShapes, GraphWithCatAndBroadcast) {
443442
symbolic_strides[z_inp] = input_desc;
444443
symbolic_strides[graph->outputs().at(0)] = input_desc;
445444

446-
447445
TensorExprKernel kernel(
448446
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
449447

torch/csrc/jit/passes/tensorexpr_fuser.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
22

3+
#include <ATen/core/interned_strings.h>
34
#include <ATen/core/symbol.h>
45
#include <ATen/record_function.h>
56
#include <c10/util/FunctionRef.h>
@@ -22,7 +23,6 @@
2223
#include <torch/csrc/jit/runtime/symbolic_shape_registry_util.h>
2324
#include <torch/csrc/jit/tensorexpr/kernel.h>
2425
#include <torch/csrc/utils/memory.h>
25-
#include <ATen/core/interned_strings.h>
2626

2727
// NOLINTNEXTLINE
2828
C10_DEFINE_bool(
@@ -1284,9 +1284,11 @@ Operation createTensorExprOp(const Node* node) {
12841284
stride_map[v] = striding_inputs[index];
12851285
index++;
12861286
}
1287-
std::vector<std::string> output_desc = node->ival(attr::striding_outputs_desc).to<std::vector<std::string>>();
1287+
std::vector<std::string> output_desc =
1288+
node->ival(attr::striding_outputs_desc).to<std::vector<std::string>>();
12881289
for (size_t i = 0; i < subgraph->outputs().size(); ++i) {
1289-
stride_map[subgraph->outputs().at(i)] = {strideInputFromString(output_desc.at(i))};
1290+
stride_map[subgraph->outputs().at(i)] = {
1291+
strideInputFromString(output_desc.at(i))};
12901292
}
12911293

12921294
std::shared_ptr<tensorexpr::TensorExprKernel> kernel =

torch/csrc/jit/tensorexpr/kernel.cpp

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1147,7 +1147,8 @@ Tensor TensorExprKernel::convertOutputToCorrectStrides(
11471147
ExprHandle axis = axes[i];
11481148
absolute_position = absolute_position + (stride * axis);
11491149
}
1150-
std::vector<ExprHandle> new_axes(sorted_stride_indices_descending.size());
1150+
std::vector<ExprHandle> new_axes(
1151+
sorted_stride_indices_descending.size());
11511152
for (size_t stride_index : sorted_stride_indices_descending) {
11521153
auto size = sizes[stride_index];
11531154
auto stride = strides[stride_index];
@@ -1156,25 +1157,31 @@ Tensor TensorExprKernel::convertOutputToCorrectStrides(
11561157
// if the size is one, we don't advance the absolute position
11571158
// which would give 0
11581159
auto non_one_position = absolute_position % ExprHandle(stride);
1159-
absolute_position = CompareSelect::make(size, one, absolute_position, non_one_position, kEQ);
1160+
absolute_position = CompareSelect::make(
1161+
size, one, absolute_position, non_one_position, kEQ);
11601162
new_axes[stride_index] = index;
11611163
}
11621164
return BufHandle(buf).load(new_axes);
11631165
});
11641166
}
11651167

1166-
Tensor TensorExprKernel::convertSymbolicOutputToCorrectStrides(torch::jit::Value* v) {
1168+
Tensor TensorExprKernel::convertSymbolicOutputToCorrectStrides(
1169+
torch::jit::Value* v) {
11671170
const TensorTypePtr& tt = v->type()->expect<TensorType>();
11681171
TORCH_INTERNAL_ASSERT(
11691172
bufs_.count(v),
11701173
buildErrorMessage(
11711174
"Ouput tensor has no corresponding bufs in the fuser."));
11721175
BufPtr buf = bufs_.at(v);
11731176
// output is contiguous, no work to do
1174-
if (tensorOutputStrideDesc_[v->offset()] == torch::jit::StrideInput::TENSOR_CONT) {
1175-
return Tensor(buf, nullptr);;
1177+
if (tensorOutputStrideDesc_[v->offset()] ==
1178+
torch::jit::StrideInput::TENSOR_CONT) {
1179+
return Tensor(buf, nullptr);
1180+
;
11761181
}
1177-
TORCH_INTERNAL_ASSERT(tensorOutputStrideDesc_[v->offset()] == torch::jit::StrideInput::TENSOR_CONT_CHANNELS_LAST);
1182+
TORCH_INTERNAL_ASSERT(
1183+
tensorOutputStrideDesc_[v->offset()] ==
1184+
torch::jit::StrideInput::TENSOR_CONT_CHANNELS_LAST);
11781185
auto sizes = sizesFromSymbolicShape(tt->symbolic_sizes());
11791186
auto dims = c10::fmap<DimArg>(sizes);
11801187
auto strides = make_channels_last_strides(sizes);
@@ -1185,11 +1192,12 @@ Tensor TensorExprKernel::convertSymbolicOutputToCorrectStrides(torch::jit::Value
11851192
auto zero = LongImm::make(0);
11861193
std::vector<ExprPtr> default_strides = make_contiguous_strides(sizes);
11871194
// See explanation in convertOutputToCorrectStrides
1188-
return convertOutputToCorrectStrides(sizes, sorted_stride_indices, strides, buf);
1195+
return convertOutputToCorrectStrides(
1196+
sizes, sorted_stride_indices, strides, buf);
11891197
}
11901198

1191-
1192-
Tensor TensorExprKernel::convertStaticShapeOutputToCorrectStrides(torch::jit::Value* v) {
1199+
Tensor TensorExprKernel::convertStaticShapeOutputToCorrectStrides(
1200+
torch::jit::Value* v) {
11931201
const TensorTypePtr& tt = v->type()->expect<TensorType>();
11941202
TORCH_INTERNAL_ASSERT(
11951203
bufs_.count(v),
@@ -1231,9 +1239,9 @@ Tensor TensorExprKernel::convertStaticShapeOutputToCorrectStrides(torch::jit::Va
12311239
auto zero = LongImm::make(0);
12321240
std::vector<size_t> sorted_stride_indices = reverse_sort_indices(strides);
12331241

1234-
// TODO: call into `convertOutputToCorrectStrides`. Currently this causes a bug
1235-
// in IRSimplifier to occur.
1236-
// See explanation in `convertOutputToCorrectStrides`
1242+
// TODO: call into `convertOutputToCorrectStrides`. Currently this causes a
1243+
// bug in IRSimplifier to occur. See explanation in
1244+
// `convertOutputToCorrectStrides`
12371245
return Compute(
12381246
"output_1", dims, [&](const std::vector<VarHandle>& axes_input) {
12391247
std::vector<ExprHandle> axes(axes_input.begin(), axes_input.end());
@@ -1467,7 +1475,8 @@ void TensorExprKernel::compile() {
14671475
auto stride_desc = symbolic_strides_[output];
14681476
TORCH_INTERNAL_ASSERT(stride_desc.size() == 1);
14691477
tensorOutputStrideDesc_.push_back(stride_desc[0]);
1470-
Tensor properly_strided_output = convertSymbolicOutputToCorrectStrides(output);
1478+
Tensor properly_strided_output =
1479+
convertSymbolicOutputToCorrectStrides(output);
14711480
if (properly_strided_output.stmt()) {
14721481
block->append_stmt(properly_strided_output.stmt());
14731482
}
@@ -1476,7 +1485,8 @@ void TensorExprKernel::compile() {
14761485
// The "strided" tensor will be incorrect if used in NNC,
14771486
// since NNC views it as contiguous. Only convert it to the right
14781487
// strides at the end of the kernel (if already contiguous it's a no-op)
1479-
Tensor properly_strided_output = convertStaticShapeOutputToCorrectStrides(output);
1488+
Tensor properly_strided_output =
1489+
convertStaticShapeOutputToCorrectStrides(output);
14801490
if (properly_strided_output.stmt()) {
14811491
block->append_stmt(properly_strided_output.stmt());
14821492
}
@@ -1601,9 +1611,13 @@ void TensorExprKernel::updateOutputSizesAndStrides(
16011611
}
16021612

16031613
if (tensorOutputStrideDesc_[i] == torch::jit::StrideInput::TENSOR_CONT) {
1604-
tensorOutputStrides_[i] = TensorType::contiguousStridesOf(tensorOutputSizes_[i]);
1605-
} else if (tensorOutputStrideDesc_[i] == torch::jit::StrideInput::TENSOR_CONT_CHANNELS_LAST) {
1606-
tensorOutputStrides_[i] = at::get_channels_last_strides_2d(tensorOutputSizes_[i]);
1614+
tensorOutputStrides_[i] =
1615+
TensorType::contiguousStridesOf(tensorOutputSizes_[i]);
1616+
} else if (
1617+
tensorOutputStrideDesc_[i] ==
1618+
torch::jit::StrideInput::TENSOR_CONT_CHANNELS_LAST) {
1619+
tensorOutputStrides_[i] =
1620+
at::get_channels_last_strides_2d(tensorOutputSizes_[i]);
16071621
} else {
16081622
std::string output_desc = toString(tensorOutputStrideDesc_[i]);
16091623
TORCH_INTERNAL_ASSERT(

0 commit comments

Comments
 (0)