Enable Intel® VTune™ Profiler's Instrumentation and Tracing Technology APIs (ITT) to PyTorch (pytorch#63289)

A more detailed description of the benefits can be found in pytorch#41001. This is Intel's counterpart to NVIDIA's NVTX (https://pytorch.org/docs/stable/autograd.html#torch.autograd.profiler.emit_nvtx).

ITT is an API for labeling trace data during application execution across different Intel tools.
To integrate Intel(R) VTune Profiler into Kineto, ITT needs to be integrated into PyTorch first. It works with both the standalone VTune Profiler (https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html) and, in the future, the Kineto-integrated VTune functionality.
It works on both Intel CPU and Intel XPU devices.

Pitch
Add VTune Profiler's ITT API function calls to annotate PyTorch ops, as well as developer-customized code scopes, on CPU, analogous to NVTX for NVIDIA GPUs.

This PR rebases the code changes from pytorch#61335 onto the latest master branch.

Usage example:
```python
with torch.autograd.profiler.emit_itt():
    for i in range(10):
        torch.itt.range_push('step_{}'.format(i))
        model(input)
        torch.itt.range_pop()
```
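
For reference, a self-contained variant of the example above (the `model` and `x` below are illustrative stand-ins; `record_shapes` is the optional argument this PR adds to `emit_itt`):

```python
# Hedged sketch: assumes a build configured with USE_ITT=ON.
# record_shapes=True appends tensor-argument sizes to each op's ITT range.
import torch

model = torch.nn.Linear(128, 64)
x = torch.randn(32, 128)

with torch.autograd.profiler.emit_itt(record_shapes=True):
    for i in range(10):
        torch.itt.range_push('step_{}'.format(i))
        model(x)
        torch.itt.range_pop()
```

Run the script under VTune (for example `vtune -collect hotspots -- python script.py`; exact CLI flags depend on the VTune version) to see the labeled ranges in the GUI.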

cc @ilia-cher @robieta @chaekit @gdankel @bitfort @ngimel @orionr @nbcsm @guotuofeng @guyang3532 @gaoteng-git
Pull Request resolved: pytorch#63289
Approved by: https://github.com/malfet
jingxu10 authored and pytorchmergebot committed Jul 13, 2022
1 parent 937ca69 commit 3c70447
Showing 39 changed files with 534 additions and 50 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -139,6 +139,9 @@
[submodule "third_party/pocketfft"]
path = third_party/pocketfft
url = https://github.com/mreineck/pocketfft
[submodule "third_party/ittapi"]
path = third_party/ittapi
url = https://github.com/intel/ittapi.git
[submodule "third_party/flatbuffers"]
path = third_party/flatbuffers
url = https://github.com/google/flatbuffers.git
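Note (standard Git usage, not part of this diff): after checking out this commit, the new submodule can be fetched with `git submodule update --init third_party/ittapi`.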
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -295,6 +295,10 @@ if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER})
endif()
option(USE_ZMQ "Use ZMQ" OFF)
option(USE_ZSTD "Use ZSTD" OFF)
# Ensure that an ITT build is the default for x86 CPUs
cmake_dependent_option(
    USE_ITT "Use Intel(R) VTune Profiler ITT functionality" ON
    "CPU_INTEL" OFF)
# Ensure that an MKLDNN build is the default for x86 CPUs
# but optional for AArch64 (dependent on -DUSE_MKLDNN).
cmake_dependent_option(
1 change: 1 addition & 0 deletions build_variables.bzl
@@ -132,6 +132,7 @@ libtorch_profiler_sources = [
"torch/csrc/profiler/kineto_shim.cpp",
"torch/csrc/profiler/nvtx_observer.cpp",
"torch/csrc/profiler/kineto_client_interface.cpp",
"torch/csrc/profiler/itt_observer.cpp",
"torch/csrc/monitor/counters.cpp",
"torch/csrc/monitor/events.cpp",
]
7 changes: 7 additions & 0 deletions caffe2/CMakeLists.txt
@@ -609,6 +609,13 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
)
endif()

if(${USE_ITT})
  list(APPEND TORCH_SRCS
    ${TORCH_SRC_DIR}/csrc/itt_wrapper.cpp
    ${TORCH_SRC_DIR}/csrc/profiler/itt.cpp
  )
endif()

if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
list(APPEND TORCH_SRCS
${TORCH_SRC_DIR}/csrc/api/src/jit.cpp
2 changes: 2 additions & 0 deletions caffe2/core/macros.h.in
@@ -42,6 +42,7 @@ static_assert(
#cmakedefine CAFFE2_USE_MKL
#cmakedefine CAFFE2_USE_MKLDNN
#cmakedefine CAFFE2_USE_NVTX
#cmakedefine CAFFE2_USE_ITT
#cmakedefine CAFFE2_USE_TRT

#ifndef EIGEN_MPL2_ONLY
@@ -82,5 +83,6 @@ static_assert(
{"USE_MKL", "${CAFFE2_USE_MKL}"}, \
{"USE_MKLDNN", "${CAFFE2_USE_MKLDNN}"}, \
{"USE_NVTX", "${CAFFE2_USE_NVTX}"}, \
{"USE_ITT", "${CAFFE2_USE_ITT}"}, \
{"USE_TRT", "${CAFFE2_USE_TRT}"}, \
}
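
Because this table feeds PyTorch's compiled-in build summary, the new flag can be checked at runtime; a hedged sketch (the exact text of the summary string may vary):

```python
# Prints the compiled-in configuration; an ITT-enabled build should list USE_ITT.
import torch

print(torch.__config__.show())
```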
13 changes: 13 additions & 0 deletions cmake/Dependencies.cmake
@@ -962,6 +962,19 @@ if(USE_FFMPEG)
endif()
endif()

if(USE_ITT)
  find_package(ITT)
  if(ITT_FOUND)
    include_directories(SYSTEM ${ITT_INCLUDE_DIR})
    list(APPEND Caffe2_DEPENDENCY_LIBS ${ITT_LIBRARIES})
    list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${ITT_LIBRARIES})
  else()
    message(WARNING "Not compiling with ITT. Suppress this warning with -DUSE_ITT=OFF")
    set(USE_ITT OFF CACHE BOOL "" FORCE)
    caffe2_update_option(USE_ITT OFF)
  endif()
endif()

# ---[ Caffe2 depends on FP16 library for half-precision conversions
if(NOT TARGET fp16 AND NOT USE_SYSTEM_FP16)
set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
21 changes: 21 additions & 0 deletions cmake/Modules/FindITT.cmake
@@ -0,0 +1,21 @@
# - Try to find ITT
#
# The following are set after configuration is done:
# ITT_FOUND : set to true if ITT is found.
# ITT_INCLUDE_DIR : path to ITT include dir.
# ITT_LIBRARIES : list of libraries for ITT

IF (NOT ITT_FOUND)
  SET(ITT_FOUND OFF)

  SET(ITT_INCLUDE_DIR)
  SET(ITT_LIBRARIES)

  SET(ITT_ROOT "${PROJECT_SOURCE_DIR}/third_party/ittapi")
  FIND_PATH(ITT_INCLUDE_DIR ittnotify.h PATHS ${ITT_ROOT} PATH_SUFFIXES include)
  IF (ITT_INCLUDE_DIR)
    ADD_SUBDIRECTORY(${ITT_ROOT})
    SET(ITT_LIBRARIES ittnotify)
    SET(ITT_FOUND ON)
  ENDIF (ITT_INCLUDE_DIR)
ENDIF(NOT ITT_FOUND)
1 change: 1 addition & 0 deletions cmake/Summary.cmake
@@ -150,6 +150,7 @@ function(caffe2_print_configuration_summary)
if(${USE_UCC})
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
endif()
message(STATUS " USE_ITT : ${USE_ITT}")
message(STATUS " USE_NCCL : ${USE_NCCL}")
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
7 changes: 5 additions & 2 deletions docs/source/autograd.rst
@@ -223,10 +223,12 @@ Profiler
^^^^^^^^

Autograd includes a profiler that lets you inspect the cost of different
-operators inside your model - both on the CPU and GPU. There are two modes
+operators inside your model - both on the CPU and GPU. There are three modes
 implemented at the moment - CPU-only using :class:`~torch.autograd.profiler.profile`.
-and nvprof based (registers both CPU and GPU activity) using
+nvprof based (registers both CPU and GPU activity) using
 :class:`~torch.autograd.profiler.emit_nvtx`.
+and vtune profiler based using
+:class:`~torch.autograd.profiler.emit_itt`.

.. autoclass:: torch.autograd.profiler.profile

@@ -240,6 +242,7 @@ and nvprof based (registers both CPU and GPU activity) using
profiler.profile.total_average

.. autoclass:: torch.autograd.profiler.emit_nvtx
.. autoclass:: torch.autograd.profiler.emit_itt


.. autosummary::
4 changes: 3 additions & 1 deletion docs/source/bottleneck.rst
@@ -47,7 +47,9 @@ where [args] are any number of arguments to `script.py`, or run
evaluating. If the profiler outputs don't help, you could try looking at
the result of :func:`torch.autograd.profiler.emit_nvtx()` with ``nvprof``.
However, please take into account that the NVTX overhead is very high and
-often gives a heavily skewed timeline.
+often gives a heavily skewed timeline. Similarly, Intel VTune Profiler helps
+to analyze performance on Intel platforms further with
+:func:`torch.autograd.profiler.emit_itt()`.

.. warning::
If you are profiling CUDA code, the first profiler that ``bottleneck`` runs
1 change: 1 addition & 0 deletions scripts/build_android.sh
@@ -135,6 +135,7 @@ else
fi
# Disable unused dependencies
CMAKE_ARGS+=("-DUSE_CUDA=OFF")
CMAKE_ARGS+=("-DUSE_ITT=OFF")
CMAKE_ARGS+=("-DUSE_GFLAGS=OFF")
CMAKE_ARGS+=("-DUSE_OPENCV=OFF")
CMAKE_ARGS+=("-DUSE_LMDB=OFF")
1 change: 1 addition & 0 deletions scripts/build_ios.sh
@@ -104,6 +104,7 @@ CMAKE_ARGS+=("-DBUILD_PYTHON=OFF")

# Disable unused dependencies
CMAKE_ARGS+=("-DUSE_CUDA=OFF")
CMAKE_ARGS+=("-DUSE_ITT=OFF")
CMAKE_ARGS+=("-DUSE_GFLAGS=OFF")
CMAKE_ARGS+=("-DUSE_OPENCV=OFF")
CMAKE_ARGS+=("-DUSE_LMDB=OFF")
1 change: 1 addition & 0 deletions scripts/build_mobile.sh
@@ -38,6 +38,7 @@ fi
# Disable unused dependencies
CMAKE_ARGS+=("-DUSE_ROCM=OFF")
CMAKE_ARGS+=("-DUSE_CUDA=OFF")
CMAKE_ARGS+=("-DUSE_ITT=OFF")
CMAKE_ARGS+=("-DUSE_GFLAGS=OFF")
CMAKE_ARGS+=("-DUSE_OPENCV=OFF")
CMAKE_ARGS+=("-DUSE_LMDB=OFF")
1 change: 1 addition & 0 deletions scripts/build_tizen.sh
@@ -112,6 +112,7 @@ cd $BUILD_ROOT
cmake "$CAFFE2_ROOT" \
-DCMAKE_VERBOSE_MAKEFILE=1 \
-DUSE_CUDA=OFF \
-DUSE_ITT=OFF \
-DUSE_OPENCV=OFF \
-DUSE_LMDB=OFF \
-DCAFFE2_CPU_FLAGS="-mfpu=neon -mfloat-abi=soft" \
7 changes: 7 additions & 0 deletions setup.py
@@ -52,6 +52,8 @@
#
# USE_STATIC_MKL
# Prefer to link with MKL statically - Unix only
# USE_ITT=0
# disable use of Intel(R) VTune Profiler's ITT functionality
#
# USE_NNPACK=0
# disables NNPACK build
@@ -541,6 +543,11 @@ def run(self):
if cmake_cache_vars['USE_LIGHTWEIGHT_DISPATCH']:
    report('-- Using lightweight dispatch')

if cmake_cache_vars['USE_ITT']:
    report('-- Using ITT')
else:
    report('-- Not using ITT')

# Do not use clang to compile extensions if `-fstack-clash-protection` is defined
# in system CFLAGS
c_flags = str(os.getenv('CFLAGS', ''))
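Per the documentation block added above, the flag is driven by an environment variable at build time; for example, running `USE_ITT=0 python setup.py develop` (illustrative invocation) should produce the `-- Not using ITT` report added here.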
1 change: 1 addition & 0 deletions third_party/ittapi
Submodule ittapi added at 5b8a7d
7 changes: 7 additions & 0 deletions torch/CMakeLists.txt
@@ -117,6 +117,13 @@ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-Wno-writable-strings)
endif()

if(USE_ITT)
  list(APPEND TORCH_PYTHON_SRCS
    ${TORCH_SRC_DIR}/csrc/itt.cpp
  )
  list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_ITT)
endif()

if(USE_CUDA)
include(${TORCH_ROOT}/cmake/public/cuda.cmake)
append_filelist("libtorch_python_cuda_core_sources" TORCH_PYTHON_SRCS)
1 change: 1 addition & 0 deletions torch/_C/_autograd.pyi
@@ -10,6 +10,7 @@ class ProfilerState(Enum):
    CPU = ...
    CUDA = ...
    NVTX = ...
    ITT = ...
    KINETO = ...
    KINETO_GPU_FALLBACK = ...

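The new enum member is visible from Python through these private bindings; a minimal sketch (private API, so subject to change):

```python
# ProfilerState.ITT is the state that emit_itt passes to _enable_profiler
# (see the torch/autograd/profiler.py hunk below).
from torch._C._autograd import ProfilerState

print(ProfilerState.ITT)
```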
4 changes: 4 additions & 0 deletions torch/_C/_itt.pyi
@@ -0,0 +1,4 @@
# Defined in torch/csrc/itt.cpp
def rangePush(message: str) -> None: ...
def rangePop() -> None: ...
def mark(message: str) -> None: ...
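A minimal sketch against these raw bindings; they are registered only in builds configured with USE_ITT (see the torch/csrc/Module.cpp hunk below), and the public wrappers used in the commit message are `torch.itt.range_push`/`torch.itt.range_pop`:

```python
import torch

torch._C._itt.rangePush("data_loading")
# ... the work to attribute to this range ...
torch._C._itt.rangePop()
torch._C._itt.mark("checkpoint_reached")
```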
64 changes: 64 additions & 0 deletions torch/autograd/profiler.py
@@ -479,6 +479,70 @@ def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:
return profiled_future


class emit_itt(object):
    """Context manager that makes every autograd operation emit an ITT range.

    It is useful when running the program under Intel(R) VTune Profiler::

        vtune <--vtune_flags> <regular command here>

    The Instrumentation and Tracing Technology (ITT) API enables your application to generate and
    control the collection of trace data during its execution across different Intel tools.
    This context manager annotates the Intel(R) VTune Profiler trace. With its help,
    you will be able to see labeled ranges in the Intel(R) VTune Profiler GUI.

    .. warning::
        This context manager should not be called recursively, i.e. at most one
        instance should be enabled at any given time.

    Args:
        enabled (bool, optional, default=True): Setting ``enabled=False`` makes this context manager a no-op.
            Default: ``True``.
        record_shapes (bool, optional, default=False): If ``record_shapes=True``, the ITT range wrapping
            each autograd op will append information about the sizes of Tensor arguments received
            by that op, in the following format:
            ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
            Non-tensor arguments will be represented by ``[]``.
            Arguments will be listed in the order they are received by the backend op.
            Please note that this order may not match the order in which those arguments were passed
            on the Python side. Also note that shape recording may increase the overhead of ITT range creation.

    Example:
        >>> with torch.autograd.profiler.emit_itt():
        ...     model(x)
    """
    def __init__(self, enabled=True, record_shapes=False):
        self.enabled = enabled
        self.entered = False
        self.record_shapes = record_shapes

    def __enter__(self):
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("ITT annotation context manager is not reentrant")
        self.entered = True
        _enable_profiler(
            ProfilerConfig(
                ProfilerState.ITT,
                self.record_shapes,
                False,
                False,
                False,
                False,
                _ExperimentalConfig()),
            set()
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not self.enabled:
            return
        _disable_profiler()
        return False


class emit_nvtx(object):
    """Context manager that makes every autograd operation emit an NVTX range.
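A hedged usage sketch of the context manager added above; `enabled=False` makes it a no-op, so the annotation can stay in the code permanently:

```python
import torch
from torch.autograd.profiler import emit_itt

collect_with_vtune = False  # flip to True when running under VTune

with emit_itt(enabled=collect_with_vtune, record_shapes=True):
    y = torch.mm(torch.randn(64, 64), torch.randn(64, 64))
```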
11 changes: 11 additions & 0 deletions torch/csrc/Module.cpp
@@ -910,6 +910,14 @@ void initModule(PyObject* module);
} // namespace torch
#endif

#ifdef USE_ITT
namespace torch {
namespace profiler {
void initIttBindings(PyObject* module);
} // namespace profiler
} // namespace torch
#endif

static std::vector<PyMethodDef> methods;

// In Python we can't use the trick of C10_LOG_API_USAGE_ONCE
@@ -1008,6 +1016,9 @@ PyObject* initModule() {
torch::autograd::init_legacy_variable(module);
torch::python::init_bindings(module);
torch::lazy::initLazyBindings(module);
#ifdef USE_ITT
torch::profiler::initIttBindings(module);
#endif
#ifdef USE_CUDA
torch::cuda::initModule(module);
#endif
1 change: 1 addition & 0 deletions torch/csrc/autograd/init.cpp
@@ -85,6 +85,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
.value("CPU", ProfilerState::CPU)
.value("CUDA", ProfilerState::CUDA)
.value("NVTX", ProfilerState::NVTX)
.value("ITT", ProfilerState::ITT)
.value("KINETO", ProfilerState::KINETO)
.value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK);

13 changes: 11 additions & 2 deletions torch/csrc/autograd/profiler_kineto.cpp
@@ -11,6 +11,7 @@
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/collection.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/itt_observer.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/nvtx_observer.h>

@@ -626,7 +627,8 @@ void reportBackendEventToActiveKinetoProfiler(
void prepareProfiler(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities) {
-  if (config.state == ProfilerState::NVTX) {
+  if (config.state == ProfilerState::NVTX ||
+      config.state == ProfilerState::ITT) {
return;
}
TORCH_CHECK(
@@ -645,6 +647,9 @@ void enableProfilerWithEventPostProcess(
TORCH_CHECK(
config.state != ProfilerState::NVTX,
"NVTX does not support post processing callback.");
TORCH_CHECK(
config.state != ProfilerState::ITT,
"ITT does not support post processing callback.");
TORCH_INTERNAL_ASSERT(
GlobalStateManager::get() == nullptr,
"On-demand profiling does not support post processing callback");
@@ -662,6 +667,9 @@ void enableProfiler(
if (config.state == ProfilerState::NVTX) {
torch::profiler::impl::pushNVTXCallbacks(config, scopes);
return;
} else if (config.state == ProfilerState::ITT) {
torch::profiler::impl::pushITTCallbacks(config, scopes);
return;
}

TORCH_CHECK(
@@ -705,7 +713,8 @@ std::unique_ptr<ProfilerResult> disableProfiler() {
       (config.state == ProfilerState::KINETO ||
        config.state == ProfilerState::KINETO_GPU_FALLBACK ||
        config.state == ProfilerState::KINETO_ONDEMAND ||
-       config.state == ProfilerState::NVTX),
+       config.state == ProfilerState::NVTX ||
+       config.state == ProfilerState::ITT),
"Can't disable Kineto profiler when it's not running");

if (state_ptr->hasCallbackHandle()) {
4 changes: 2 additions & 2 deletions torch/csrc/autograd/profiler_kineto.h
@@ -279,8 +279,8 @@ struct TORCH_API KinetoEvent {
int64_t debug_handle_{-1};
std::string backend_;

-  torch::profiler::impl::CUDAEventStub cuda_event_start_ = nullptr;
-  torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr;
+  torch::profiler::impl::ProfilerEventStub cuda_event_start_ = nullptr;
+  torch::profiler::impl::ProfilerEventStub cuda_event_end_ = nullptr;
bool is_python_function_;
};

(Diffs for the remaining changed files are omitted.)