Skip to content

Commit 6ae6900

Browse files
committed
fix comments and add doc for context analysis
1 parent fbaee58 commit 6ae6900

File tree

2 files changed

+35
-7
lines changed

2 files changed

+35
-7
lines changed

python/tvm/relay/transform/memory_alloc.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
from ..op.memory import flatten_tuple_type, from_tuple_type, to_tuple_type
3333
from ...import cpu
3434
from ..op.memory import alloc_storage
35-
from ..analysis import context_analysis as _context_analysis
35+
from ..analysis import context_analysis
3636
from ..._ffi.runtime_ctypes import TVMContext
3737

3838
def alloc_tensor(storage, shape, dtype='float32', assert_shape=None):
@@ -85,7 +85,7 @@ def is_reshape_only(func):
8585
class ManifestAllocPass(ExprMutator):
8686
"""A pass for explicitly manifesting all memory allocations in Relay."""
8787

88-
def __init__(self, target_host, context_analysis):
88+
def __init__(self, target_host, context_analysis_map):
8989
self.invoke_tvm = op.vm.invoke_tvm_op
9090
self.shape_func = op.vm.shape_func
9191
self.shape_of = op.vm.shape_of
@@ -94,13 +94,13 @@ def __init__(self, target_host, context_analysis):
9494
self.target_host = target_host
9595
self.default_context = cpu(0)
9696
self.compute_dtype = "int64"
97-
self.context_analysis = context_analysis
97+
self.context_analysis_map = context_analysis_map
9898
super().__init__()
9999

100100
def get_context(self, exp):
101101
"""Get the context of a given expression"""
102-
assert exp in self.context_analysis, exp.astext(False)
103-
val = self.context_analysis[exp]
102+
assert exp in self.context_analysis_map, exp.astext(False)
103+
val = self.context_analysis_map[exp]
104104
# val[0], val[1] are device_type and device_id, respectively.
105105
# We don't need to unpack after porting this pass to C++.
106106
assert len(val) == 2
@@ -339,6 +339,7 @@ def _annotator(exp):
339339
@module_pass(opt_level=0)
340340
class ManifestAlloc:
341341
"""The explicit pass wrapper around ManifestAlloc."""
342+
# TODO(zhiics, jroesch) Port this pass to C++.
342343
def __init__(self, target_host, targets):
343344
self.target_host = target_host
344345
self.targets = targets
@@ -356,13 +357,13 @@ def transform_module(self, mod, _):
356357
fallback_ctx = nd.context(pass_ctx.config["relay.fallback_device_type"])
357358
else:
358359
fallback_ctx = cpu(0)
359-
ca = _context_analysis(mod, TVMContext(fallback_ctx.device_type, 0))
360+
ca = context_analysis(mod, TVMContext(fallback_ctx.device_type, 0))
360361
else:
361362
if isinstance(self.targets, dict):
362363
dev = list(self.targets.keys())[0]
363364
else:
364365
dev, _ = self.targets.items()[0]
365-
ca = _context_analysis(mod, nd.context(dev.value))
366+
ca = context_analysis(mod, nd.context(dev.value))
366367

367368
# The following code can be used for debugging the module after
368369
# annotation.

src/relay/analysis/context_analysis.cc

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,33 @@
2020
/*!
2121
* \file src/relay/analysis/context_analysis.cc
2222
* \brief A pass for analyzing device attribute of each IR node.
23+
*
24+
* We use union-find data structures to analyze the context information of each
25+
* sub-expression in a Relay program in this pass. Only the device copy node in
26+
* Relay directly contains bidirectional device information. We use it to
27+
* bidirectionally propagate the device info of its inputs and outputs.
28+
*
29+
* However, to support dynamism (e.g., dynamic inputs), Relay introduces several
30+
* concepts to compute the shape of tensors and operators at runtime, i.e.
31+
* shape_of, shape_func, and reshape_tensor. These nodes are also referred to as
32+
* VM dialects as we have native VM instructions for them. These dialects are
33+
* intrinsically CPU friendly, therefore, they are only designed to be
34+
* executed on CPU. We, hence, unify their inputs and outputs to CPU as well.
35+
* Note the input of shape_of is a tensor and we only need the tensor shape.
36+
* Therefore, the input could be sitting on GPU as well since no real data is
37+
* needed. The context of the input would be propagated from its other
38+
* consumers or fallback to the default device.
39+
*
40+
* Another type of dialect is used for memory allocation, namely, alloc_storage
41+
* and alloc_tensor. alloc_storage contains a context field to indicate where
42+
* the chunk of memory is allocated. Therefore, we unify the context of
43+
* alloc_storage with the context field. Other inputs, such as size and
44+
* alignment, are left on CPU.
45+
*
46+
* Based on the above rules, we keep unifying the connected expressions and
47+
* propagating their device information. An error will be raised whenever there
48+
* is a unification conflict. All IR nodes that are not propagated with device
49+
* context will fall back to the specified device.
2350
*/
2451

2552
#include <tvm/relay/analysis.h>

0 commit comments

Comments
 (0)