Commit a688b29

lw authored and facebook-github-bot committed

Support custom Python classes in CUDAFuture (pytorch#56516)

Summary:

Pull Request resolved: pytorch#56516

One problem with CUDAFuture's extraction of DataPtrs from IValues is that it only supported Python objects that could be converted to "regular" IValues (e.g., lists/dicts/tuples of ints/strings/tensors/...). One notable exception is custom Python classes, which are in fact a very common data type transferred over RPC. The only solution we found for those is to use the Python pickler to extract the tensors contained in them.

We can't insert a Python dependency directly into CUDAFuture, so instead I'm proposing to use the same indirection technique used to support `getSubValues` on Python objects: define some methods on the abstract class `PyObjectHolder` (which can be used by CUDAFuture) but only implement them in the concrete subclass `ConcretePyObjectHolder` (which is only built when Python support is enabled).

I am a bit worried about the performance toll of this (pickling isn't exactly known to be cheap), but I think we should start by providing a functionally complete API. We already have ideas on how to make this faster if needed, for example by having users provide a custom DataPtr extractor tailored to their class via a decorator. (Or just use TorchScript.)

ghstack-source-id: 127295014

Test Plan: Added a test later in the stack

Reviewed By: mrshenli

Differential Revision: D27887189

fbshipit-source-id: 9d27e4e62390b836e5bb4f06f401cc002f0cf95b

1 parent e4efc0c commit a688b29
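The pickler trick the summary describes can be sketched in plain Python. This is a hedged, self-contained illustration of the technique, not code from the PR: `FakeTensor`, `_Collector`, `extract_tensors`, and `RpcPayload` are hypothetical stand-ins so the sketch runs without torch (the real implementation is `_TensorExtractor` in `torch/_jit_internal.py`, which intercepts `torch.Tensor`).

```python
import io
import pickle
from typing import Any, List


class FakeTensor:
    """Hypothetical stand-in for torch.Tensor, so this sketch runs without torch."""

    def __init__(self, name: str) -> None:
        self.name = name


class _Collector(pickle.Pickler):
    """Pickler subclass whose persistent_id hook collects tensors as a side
    effect. Returning a non-None id tells pickle to emit a reference instead
    of serializing the object; the resulting byte stream is simply discarded."""

    def __init__(self, *args: Any, tensors: List[FakeTensor], **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.tensors = tensors

    def persistent_id(self, obj: Any):
        if isinstance(obj, FakeTensor):
            self.tensors.append(obj)
            return ""  # placeholder id; we never unpickle the stream
        return None  # let pickle handle everything else normally


def extract_tensors(obj: Any) -> List[FakeTensor]:
    tensors: List[FakeTensor] = []
    _Collector(io.BytesIO(), protocol=-1, tensors=tensors).dump(obj)
    return tensors


class RpcPayload:
    """A custom Python class, i.e. the case getSubValues could not handle."""

    def __init__(self) -> None:
        self.data = {"a": FakeTensor("a"), "nested": [FakeTensor("b")]}
```

Calling `extract_tensors(RpcPayload())` finds both tensors even though `RpcPayload` is an arbitrary user-defined class, which is exactly why the PR routes Python objects through the pickler rather than through `getSubValues`.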

File tree: 4 files changed, +70 −8 lines changed

aten/src/ATen/core/ivalue_inl.h
Lines changed: 1 addition & 0 deletions

```diff
@@ -708,6 +708,7 @@ struct ivalue::PyObjectHolder : c10::intrusive_ptr_target {
   virtual c10::InferredType tryToInferType() = 0;
   virtual IValue toIValue(const TypePtr& type, c10::optional<int32_t> N = c10::nullopt) = 0;
   virtual std::string toStr() = 0;
+  virtual std::vector<at::Tensor> extractTensors() = 0;

   virtual ~PyObjectHolder(){};
 };
```

aten/src/ATen/cuda/CUDAFuture.cpp
Lines changed: 19 additions & 8 deletions

```diff
@@ -26,15 +26,26 @@ namespace {

 std::vector<std::reference_wrapper<const at::DataPtr>> extractDataPtrs(
     const at::IValue& value) {
-  at::IValue::HashAliasedIValues sub_values;
-  // Prefer getSubValues() over visit() as the latter is a silent no-op for
-  // some unsupported types, whereas the former at least fails loudly.
-  value.getSubValues(sub_values);
-
   std::vector<std::reference_wrapper<const at::DataPtr>> data_ptrs;
-  for (const at::IValue& sub_value : sub_values) {
-    if (sub_value.isTensor()) {
-      data_ptrs.emplace_back(sub_value.toTensor().storage().data_ptr());
+  // getSubValues works poorly on Python objects: it only works if they can be
+  // converted to a "regular" IValue type hence, for example, it doesn't support
+  // custom subclasses. Thus, instead, we extract the tensors through pickling.
+  if (value.isPyObject()) {
+    std::vector<at::Tensor> tensors =
+        value.toPyObjectHolder()->extractTensors();
+    data_ptrs.reserve(tensors.size());
+    for (const at::Tensor& tensor : tensors) {
+      data_ptrs.emplace_back(tensor.storage().data_ptr());
+    }
+  } else {
+    at::IValue::HashAliasedIValues sub_values;
+    // Prefer getSubValues() over visit() as the latter is a silent no-op for
+    // some unsupported types, whereas the former at least fails loudly.
+    value.getSubValues(sub_values);
+    for (const at::IValue& sub_value : sub_values) {
+      if (sub_value.isTensor()) {
+        data_ptrs.emplace_back(sub_value.toTensor().storage().data_ptr());
+      }
     }
   }
   return data_ptrs;
```
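To see why the Python-object branch above is needed, here is a rough pure-Python analogue of what a `getSubValues`-style walk does: it understands the "regular" container types and fails loudly on anything else, so tensors inside custom classes are unreachable without the pickling path. All names here (`StubTensor`, `get_sub_values`, `Payload`) are hypothetical illustrations, not PyTorch APIs.

```python
from typing import Any, List


class StubTensor:
    """Hypothetical stand-in for a tensor."""


def get_sub_values(value: Any, out: List[StubTensor]) -> None:
    """Rough analogue of IValue::getSubValues: recurses into the "regular"
    container types and fails loudly (rather than silently) on anything else."""
    if isinstance(value, StubTensor):
        out.append(value)
    elif isinstance(value, (list, tuple, set)):
        for item in value:
            get_sub_values(item, out)
    elif isinstance(value, dict):
        for key, val in value.items():
            get_sub_values(key, out)
            get_sub_values(val, out)
    elif isinstance(value, (bool, int, float, str, bytes, type(None))):
        pass  # leaf values that cannot contain tensors
    else:
        raise TypeError(f"unsupported type: {type(value).__name__}")


class Payload:
    """A custom class: its tensor is invisible to the container walk."""

    def __init__(self) -> None:
        self.t = StubTensor()
```

Walking `[StubTensor(), {"k": StubTensor()}]` finds both tensors, but walking `Payload()` raises, mirroring why `extractDataPtrs` now dispatches Python objects to the pickler-based `extractTensors` instead.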

torch/_jit_internal.py
Lines changed: 28 additions & 0 deletions

```diff
@@ -15,6 +15,8 @@
 import torch
 import sys
 import builtins
+import io
+import pickle
 # This is needed. `torch._jit_internal` is imported before `torch.distributed.__init__`.
 # Explicitly ask to import `torch.distributed.__init__` first.
 # Otherwise, "AttributeError: module 'torch' has no attribute 'distributed'" is raised.
@@ -1119,3 +1121,29 @@ def _isinstance(obj, target_type) -> bool:

     # handle non-containers
     return isinstance(obj, target_type)
+
+
+class _TensorExtractor(pickle.Pickler):
+    def __init__(self, *args, tensors: List[torch.Tensor], **kwargs):
+        super().__init__(*args, **kwargs)
+        self.tensors = tensors
+
+    def persistent_id(self, obj):
+        if isinstance(obj, torch.Tensor):
+            self.tensors.append(obj)
+            return ""
+        else:
+            return None
+
+
+def _extract_tensors(obj):
+    r"""
+    This function is exclusively called from C++.
+    See ``torch/csrc/jit/python/python_ivalue.h``.
+
+    It extracts the tensors contained in the given object, through pickling.
+    """
+    tensors: List[torch.Tensor] = []
+    extractor = _TensorExtractor(io.BytesIO(), protocol=-1, tensors=tensors)
+    extractor.dump(obj)
+    return tensors
```

torch/csrc/jit/python/python_ivalue.h
Lines changed: 22 additions & 0 deletions

```diff
@@ -42,6 +42,28 @@ struct C10_EXPORT ConcretePyObjectHolder final : PyObjectHolder {
     return py::str(py_obj_);
   }

+  std::vector<at::Tensor> extractTensors() override {
+    // We could implement this entirely in C++ via pybind11 but it turns out to
+    // be substantially slower. Namely, the total time taken by markCompleted on
+    // a CUDAFuture is 21.5us with this implementation, but goes up to 58.7us
+    // when using C++. The reason is unclear.
+    try {
+      pybind11::gil_scoped_acquire ag;
+      return py::module::import("torch._jit_internal")
+          .attr("_extract_tensors")(py_obj_)
+          .cast<std::vector<at::Tensor>>();
+    } catch (py::error_already_set& e) {
+      auto err = std::runtime_error(
+          c10::str("Cannot extract tensors from value: ", e.what()));
+      {
+        pybind11::gil_scoped_acquire ag;
+        e.restore();
+        PyErr_Clear();
+      }
+      throw err;
+    }
+  }
+
   // Note [Destructing py::object]
   // ~~~~~~~~~~~~~~~~~~~~~~~~~~
   //
```
