Enable Intel® VTune™ Profiler's Instrumentation and Tracing Technology APIs (ITT) to PyTorch (pytorch#63289)

A more detailed description of the benefits can be found in pytorch#41001. This is Intel's counterpart to NVIDIA's NVTX (https://pytorch.org/docs/stable/autograd.html#torch.autograd.profiler.emit_nvtx).

ITT is an API for labeling trace data during application execution across different Intel tools.
To integrate Intel(R) VTune Profiler into Kineto, ITT needs to be integrated into PyTorch first. It works with both the standalone VTune Profiler (https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html) and, in the future, the Kineto-integrated VTune functionality.
It works on both Intel CPU and Intel XPU devices.

Pitch
Add VTune Profiler's ITT API function calls to annotate PyTorch ops, as well as developer-customized code scopes, on CPU, analogous to NVTX for NVIDIA GPUs.

This PR rebases the code changes from pytorch#61335 onto the latest master branch.

Usage example:
```python
with torch.autograd.profiler.emit_itt():
    for i in range(10):
        torch.itt.range_push('step_{}'.format(i))
        model(input)
        torch.itt.range_pop()
```
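
For reference, a self-contained variant of the example above (the `model` and `x` below are illustrative stand-ins; `record_shapes` is the optional argument this PR adds to `emit_itt`):

```python
# Hedged sketch: assumes a build configured with USE_ITT=ON.
# record_shapes=True appends tensor-argument sizes to each op's ITT range.
import torch

model = torch.nn.Linear(128, 64)
x = torch.randn(32, 128)

with torch.autograd.profiler.emit_itt(record_shapes=True):
    for i in range(10):
        torch.itt.range_push('step_{}'.format(i))
        model(x)
        torch.itt.range_pop()
```

Run the script under VTune (for example `vtune -collect hotspots -- python script.py`; exact CLI flags depend on the VTune version) to see the labeled ranges in the GUI.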

cc @ilia-cher @robieta @chaekit @gdankel @bitfort @ngimel @orionr @nbcsm @guotuofeng @guyang3532 @gaoteng-git
Pull Request resolved: pytorch#63289
Approved by: https://github.com/malfet
jingxu10 authored and pytorchmergebot committed Jul 13, 2022
1 parent 937ca69 commit 3c70447
Showing 39 changed files with 534 additions and 50 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -139,6 +139,9 @@
[submodule "third_party/pocketfft"]
path = third_party/pocketfft
url = https://github.com/mreineck/pocketfft
[submodule "third_party/ittapi"]
path = third_party/ittapi
url = https://github.com/intel/ittapi.git
[submodule "third_party/flatbuffers"]
path = third_party/flatbuffers
url = https://github.com/google/flatbuffers.git
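Note (standard Git usage, not part of this diff): after checking out this commit, the new submodule can be fetched with `git submodule update --init third_party/ittapi`.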
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -295,6 +295,10 @@ if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER})
endif()
option(USE_ZMQ "Use ZMQ" OFF)
option(USE_ZSTD "Use ZSTD" OFF)
# Ensure that an ITT build is the default for x86 CPUs
cmake_dependent_option(
    USE_ITT "Use Intel(R) VTune Profiler ITT functionality" ON
    "CPU_INTEL" OFF)
# Ensure that an MKLDNN build is the default for x86 CPUs
# but optional for AArch64 (dependent on -DUSE_MKLDNN).
cmake_dependent_option(
1 change: 1 addition & 0 deletions build_variables.bzl
@@ -132,6 +132,7 @@ libtorch_profiler_sources = [
"torch/csrc/profiler/kineto_shim.cpp",
"torch/csrc/profiler/nvtx_observer.cpp",
"torch/csrc/profiler/kineto_client_interface.cpp",
"torch/csrc/profiler/itt_observer.cpp",
"torch/csrc/monitor/counters.cpp",
"torch/csrc/monitor/events.cpp",
]
7 changes: 7 additions & 0 deletions caffe2/CMakeLists.txt
@@ -609,6 +609,13 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
)
endif()

if(${USE_ITT})
  list(APPEND TORCH_SRCS
    ${TORCH_SRC_DIR}/csrc/itt_wrapper.cpp
    ${TORCH_SRC_DIR}/csrc/profiler/itt.cpp
  )
endif()

if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
list(APPEND TORCH_SRCS
${TORCH_SRC_DIR}/csrc/api/src/jit.cpp
2 changes: 2 additions & 0 deletions caffe2/core/macros.h.in
@@ -42,6 +42,7 @@ static_assert(
#cmakedefine CAFFE2_USE_MKL
#cmakedefine CAFFE2_USE_MKLDNN
#cmakedefine CAFFE2_USE_NVTX
#cmakedefine CAFFE2_USE_ITT
#cmakedefine CAFFE2_USE_TRT

#ifndef EIGEN_MPL2_ONLY
@@ -82,5 +83,6 @@ static_assert(
{"USE_MKL", "${CAFFE2_USE_MKL}"}, \
{"USE_MKLDNN", "${CAFFE2_USE_MKLDNN}"}, \
{"USE_NVTX", "${CAFFE2_USE_NVTX}"}, \
{"USE_ITT", "${CAFFE2_USE_ITT}"}, \
{"USE_TRT", "${CAFFE2_USE_TRT}"}, \
}
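
Because this table feeds PyTorch's compiled-in build summary, the new flag can be checked at runtime; a hedged sketch (the exact text of the summary string may vary):

```python
# Prints the compiled-in configuration; an ITT-enabled build should list USE_ITT.
import torch

print(torch.__config__.show())
```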
13 changes: 13 additions & 0 deletions cmake/Dependencies.cmake
@@ -962,6 +962,19 @@ if(USE_FFMPEG)
endif()
endif()

if(USE_ITT)
  find_package(ITT)
  if(ITT_FOUND)
    include_directories(SYSTEM ${ITT_INCLUDE_DIR})
    list(APPEND Caffe2_DEPENDENCY_LIBS ${ITT_LIBRARIES})
    list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${ITT_LIBRARIES})
  else()
    message(WARNING "Not compiling with ITT. Suppress this warning with -DUSE_ITT=OFF")
    set(USE_ITT OFF CACHE BOOL "" FORCE)
    caffe2_update_option(USE_ITT OFF)
  endif()
endif()

# ---[ Caffe2 depends on FP16 library for half-precision conversions
if(NOT TARGET fp16 AND NOT USE_SYSTEM_FP16)
set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
21 changes: 21 additions & 0 deletions cmake/Modules/FindITT.cmake
@@ -0,0 +1,21 @@
# - Try to find ITT
#
# The following are set after configuration is done:
# ITT_FOUND : set to true if ITT is found.
# ITT_INCLUDE_DIR : path to ITT include dir.
# ITT_LIBRARIES : list of libraries for ITT

IF (NOT ITT_FOUND)
  SET(ITT_FOUND OFF)

  SET(ITT_INCLUDE_DIR)
  SET(ITT_LIBRARIES)

  SET(ITT_ROOT "${PROJECT_SOURCE_DIR}/third_party/ittapi")
  FIND_PATH(ITT_INCLUDE_DIR ittnotify.h PATHS ${ITT_ROOT} PATH_SUFFIXES include)
  IF (ITT_INCLUDE_DIR)
    ADD_SUBDIRECTORY(${ITT_ROOT})
    SET(ITT_LIBRARIES ittnotify)
    SET(ITT_FOUND ON)
  ENDIF (ITT_INCLUDE_DIR)
ENDIF(NOT ITT_FOUND)
1 change: 1 addition & 0 deletions cmake/Summary.cmake
@@ -150,6 +150,7 @@ function(caffe2_print_configuration_summary)
if(${USE_UCC})
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
endif()
message(STATUS " USE_ITT : ${USE_ITT}")
message(STATUS " USE_NCCL : ${USE_NCCL}")
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
7 changes: 5 additions & 2 deletions docs/source/autograd.rst
@@ -223,10 +223,12 @@ Profiler
^^^^^^^^

Autograd includes a profiler that lets you inspect the cost of different
-operators inside your model - both on the CPU and GPU. There are two modes
+operators inside your model - both on the CPU and GPU. There are three modes
 implemented at the moment - CPU-only using :class:`~torch.autograd.profiler.profile`.
-and nvprof based (registers both CPU and GPU activity) using
+nvprof based (registers both CPU and GPU activity) using
 :class:`~torch.autograd.profiler.emit_nvtx`.
+and vtune profiler based using
+:class:`~torch.autograd.profiler.emit_itt`.

.. autoclass:: torch.autograd.profiler.profile

@@ -240,6 +242,7 @@ and nvprof based (registers both CPU and GPU activity) using
profiler.profile.total_average

.. autoclass:: torch.autograd.profiler.emit_nvtx
.. autoclass:: torch.autograd.profiler.emit_itt


.. autosummary::
4 changes: 3 additions & 1 deletion docs/source/bottleneck.rst
@@ -47,7 +47,9 @@ where [args] are any number of arguments to `script.py`, or run
evaluating. If the profiler outputs don't help, you could try looking at
the result of :func:`torch.autograd.profiler.emit_nvtx()` with ``nvprof``.
However, please take into account that the NVTX overhead is very high and
-often gives a heavily skewed timeline.
+often gives a heavily skewed timeline. Similarly, Intel VTune Profiler helps
+to analyze performance on Intel platforms further with
+:func:`torch.autograd.profiler.emit_itt()`.

.. warning::
If you are profiling CUDA code, the first profiler that ``bottleneck`` runs
1 change: 1 addition & 0 deletions scripts/build_android.sh
@@ -135,6 +135,7 @@ else
fi
# Disable unused dependencies
CMAKE_ARGS+=("-DUSE_CUDA=OFF")
CMAKE_ARGS+=("-DUSE_ITT=OFF")
CMAKE_ARGS+=("-DUSE_GFLAGS=OFF")
CMAKE_ARGS+=("-DUSE_OPENCV=OFF")
CMAKE_ARGS+=("-DUSE_LMDB=OFF")
1 change: 1 addition & 0 deletions scripts/build_ios.sh
@@ -104,6 +104,7 @@ CMAKE_ARGS+=("-DBUILD_PYTHON=OFF")

# Disable unused dependencies
CMAKE_ARGS+=("-DUSE_CUDA=OFF")
CMAKE_ARGS+=("-DUSE_ITT=OFF")
CMAKE_ARGS+=("-DUSE_GFLAGS=OFF")
CMAKE_ARGS+=("-DUSE_OPENCV=OFF")
CMAKE_ARGS+=("-DUSE_LMDB=OFF")
1 change: 1 addition & 0 deletions scripts/build_mobile.sh
@@ -38,6 +38,7 @@ fi
# Disable unused dependencies
CMAKE_ARGS+=("-DUSE_ROCM=OFF")
CMAKE_ARGS+=("-DUSE_CUDA=OFF")
CMAKE_ARGS+=("-DUSE_ITT=OFF")
CMAKE_ARGS+=("-DUSE_GFLAGS=OFF")
CMAKE_ARGS+=("-DUSE_OPENCV=OFF")
CMAKE_ARGS+=("-DUSE_LMDB=OFF")
1 change: 1 addition & 0 deletions scripts/build_tizen.sh
@@ -112,6 +112,7 @@ cd $BUILD_ROOT
cmake "$CAFFE2_ROOT" \
-DCMAKE_VERBOSE_MAKEFILE=1 \
-DUSE_CUDA=OFF \
-DUSE_ITT=OFF \
-DUSE_OPENCV=OFF \
-DUSE_LMDB=OFF \
-DCAFFE2_CPU_FLAGS="-mfpu=neon -mfloat-abi=soft" \
7 changes: 7 additions & 0 deletions setup.py
@@ -52,6 +52,8 @@
#
# USE_STATIC_MKL
# Prefer to link with MKL statically - Unix only
# USE_ITT=0
# disable use of Intel(R) VTune Profiler's ITT functionality
#
# USE_NNPACK=0
# disables NNPACK build
@@ -541,6 +543,11 @@ def run(self):
if cmake_cache_vars['USE_LIGHTWEIGHT_DISPATCH']:
    report('-- Using lightweight dispatch')

if cmake_cache_vars['USE_ITT']:
    report('-- Using ITT')
else:
    report('-- Not using ITT')

# Do not use clang to compile extensions if `-fstack-clash-protection` is defined
# in system CFLAGS
c_flags = str(os.getenv('CFLAGS', ''))
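Per the documentation block added above, the flag is driven by an environment variable at build time; for example, running `USE_ITT=0 python setup.py develop` (illustrative invocation) should produce the `-- Not using ITT` report added here.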
1 change: 1 addition & 0 deletions third_party/ittapi
Submodule ittapi added at 5b8a7d
7 changes: 7 additions & 0 deletions torch/CMakeLists.txt
@@ -117,6 +117,13 @@ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-Wno-writable-strings)
endif()

if(USE_ITT)
  list(APPEND TORCH_PYTHON_SRCS
    ${TORCH_SRC_DIR}/csrc/itt.cpp
  )
  list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_ITT)
endif()

if(USE_CUDA)
include(${TORCH_ROOT}/cmake/public/cuda.cmake)
append_filelist("libtorch_python_cuda_core_sources" TORCH_PYTHON_SRCS)
1 change: 1 addition & 0 deletions torch/_C/_autograd.pyi
@@ -10,6 +10,7 @@ class ProfilerState(Enum):
    CPU = ...
    CUDA = ...
    NVTX = ...
    ITT = ...
    KINETO = ...
    KINETO_GPU_FALLBACK = ...

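The new enum member is visible from Python through these private bindings; a minimal sketch (private API, so subject to change):

```python
# ProfilerState.ITT is the state that emit_itt passes to _enable_profiler
# (see the torch/autograd/profiler.py hunk below).
from torch._C._autograd import ProfilerState

print(ProfilerState.ITT)
```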
4 changes: 4 additions & 0 deletions torch/_C/_itt.pyi
@@ -0,0 +1,4 @@
# Defined in torch/csrc/itt.cpp
def rangePush(message: str) -> None: ...
def rangePop() -> None: ...
def mark(message: str) -> None: ...
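A minimal sketch against these raw bindings; they are registered only in builds configured with USE_ITT (see the torch/csrc/Module.cpp hunk below), and the public wrappers used in the commit message are `torch.itt.range_push`/`torch.itt.range_pop`:

```python
import torch

torch._C._itt.rangePush("data_loading")
# ... the work to attribute to this range ...
torch._C._itt.rangePop()
torch._C._itt.mark("checkpoint_reached")
```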
64 changes: 64 additions & 0 deletions torch/autograd/profiler.py
@@ -479,6 +479,70 @@ def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:
return profiled_future


class emit_itt(object):
    """Context manager that makes every autograd operation emit an ITT range.

    It is useful when running the program under Intel(R) VTune Profiler::

        vtune <--vtune_flags> <regular command here>

    The Instrumentation and Tracing Technology (ITT) API enables your application to generate and
    control the collection of trace data during its execution across different Intel tools.
    This context manager annotates the Intel(R) VTune Profiler trace. With its help,
    you will be able to see labeled ranges in the Intel(R) VTune Profiler GUI.

    .. warning::
        This context manager should not be called recursively, i.e. at most one
        instance should be enabled at any given time.

    Args:
        enabled (bool, optional, default=True): Setting ``enabled=False`` makes this context manager a no-op.
            Default: ``True``.
        record_shapes (bool, optional, default=False): If ``record_shapes=True``, the ITT range wrapping
            each autograd op will append information about the sizes of Tensor arguments received
            by that op, in the following format:
            ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
            Non-tensor arguments will be represented by ``[]``.
            Arguments will be listed in the order they are received by the backend op.
            Please note that this order may not match the order in which those arguments were passed
            on the Python side. Also note that shape recording may increase the overhead of ITT range creation.

    Example:
        >>> with torch.autograd.profiler.emit_itt():
        ...     model(x)
    """
    def __init__(self, enabled=True, record_shapes=False):
        self.enabled = enabled
        self.entered = False
        self.record_shapes = record_shapes

    def __enter__(self):
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("ITT annotation context manager is not reentrant")
        self.entered = True
        _enable_profiler(
            ProfilerConfig(
                ProfilerState.ITT,
                self.record_shapes,
                False,
                False,
                False,
                False,
                _ExperimentalConfig()),
            set()
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not self.enabled:
            return
        _disable_profiler()
        return False


class emit_nvtx(object):
    """Context manager that makes every autograd operation emit an NVTX range.
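A hedged usage sketch of the context manager added above; `enabled=False` makes it a no-op, so the annotation can stay in the code permanently:

```python
import torch
from torch.autograd.profiler import emit_itt

collect_with_vtune = False  # flip to True when running under VTune

with emit_itt(enabled=collect_with_vtune, record_shapes=True):
    y = torch.mm(torch.randn(64, 64), torch.randn(64, 64))
```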
11 changes: 11 additions & 0 deletions torch/csrc/Module.cpp
@@ -910,6 +910,14 @@ void initModule(PyObject* module);
} // namespace torch
#endif

#ifdef USE_ITT
namespace torch {
namespace profiler {
void initIttBindings(PyObject* module);
} // namespace profiler
} // namespace torch
#endif

static std::vector<PyMethodDef> methods;

// In Python we can't use the trick of C10_LOG_API_USAGE_ONCE
@@ -1008,6 +1016,9 @@ PyObject* initModule() {
torch::autograd::init_legacy_variable(module);
torch::python::init_bindings(module);
torch::lazy::initLazyBindings(module);
#ifdef USE_ITT
torch::profiler::initIttBindings(module);
#endif
#ifdef USE_CUDA
torch::cuda::initModule(module);
#endif
1 change: 1 addition & 0 deletions torch/csrc/autograd/init.cpp
@@ -85,6 +85,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
.value("CPU", ProfilerState::CPU)
.value("CUDA", ProfilerState::CUDA)
.value("NVTX", ProfilerState::NVTX)
.value("ITT", ProfilerState::ITT)
.value("KINETO", ProfilerState::KINETO)
.value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK);

13 changes: 11 additions & 2 deletions torch/csrc/autograd/profiler_kineto.cpp
@@ -11,6 +11,7 @@
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/collection.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/itt_observer.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/nvtx_observer.h>

@@ -626,7 +627,8 @@ void reportBackendEventToActiveKinetoProfiler(
void prepareProfiler(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities) {
-  if (config.state == ProfilerState::NVTX) {
+  if (config.state == ProfilerState::NVTX ||
+      config.state == ProfilerState::ITT) {
return;
}
TORCH_CHECK(
@@ -645,6 +647,9 @@ void enableProfilerWithEventPostProcess(
TORCH_CHECK(
config.state != ProfilerState::NVTX,
"NVTX does not support post processing callback.");
TORCH_CHECK(
config.state != ProfilerState::ITT,
"ITT does not support post processing callback.");
TORCH_INTERNAL_ASSERT(
GlobalStateManager::get() == nullptr,
"On-demand profiling does not support post processing callback");
@@ -662,6 +667,9 @@ void enableProfiler(
if (config.state == ProfilerState::NVTX) {
torch::profiler::impl::pushNVTXCallbacks(config, scopes);
return;
} else if (config.state == ProfilerState::ITT) {
torch::profiler::impl::pushITTCallbacks(config, scopes);
return;
}

TORCH_CHECK(
@@ -705,7 +713,8 @@ std::unique_ptr<ProfilerResult> disableProfiler() {
       (config.state == ProfilerState::KINETO ||
        config.state == ProfilerState::KINETO_GPU_FALLBACK ||
        config.state == ProfilerState::KINETO_ONDEMAND ||
-       config.state == ProfilerState::NVTX),
+       config.state == ProfilerState::NVTX ||
+       config.state == ProfilerState::ITT),
"Can't disable Kineto profiler when it's not running");

if (state_ptr->hasCallbackHandle()) {
4 changes: 2 additions & 2 deletions torch/csrc/autograd/profiler_kineto.h
@@ -279,8 +279,8 @@ struct TORCH_API KinetoEvent {
int64_t debug_handle_{-1};
std::string backend_;

-  torch::profiler::impl::CUDAEventStub cuda_event_start_ = nullptr;
-  torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr;
+  torch::profiler::impl::ProfilerEventStub cuda_event_start_ = nullptr;
+  torch::profiler::impl::ProfilerEventStub cuda_event_end_ = nullptr;
bool is_python_function_;
};

(Diffs for the remaining changed files are omitted.)