From 63a55e184ec87d1b367c5c865e08ed228464fea7 Mon Sep 17 00:00:00 2001
From: Andrii Pavliuk <82804725+apavliuk-altran@users.noreply.github.com>
Date: Tue, 23 Jan 2024 10:18:43 +0200
Subject: [PATCH] [NVIDIA] TensorIterator Body as Multiple CUDA Graphs (#808)

* [NVIDIA] Add operator==/!= to DevicePointer
* [NVIDIA] Add CUDA::NodeParams, CUDA::TransferNode, CUDA::KernelNode
* [NVIDIA] Add kernel args getters for Insert/Slice
* [NVIDIA] Add KernelNodeTest and TransferNodeTest
* [NVIDIA] Fix review issues
* [NVIDIA] Add launchers to TI, refactor Execute()
* [NVIDIA] Add TiCudaGraphInfo
* [NVIDIA] Update TI to support CUDA graph as a body of iterations loop
* [NVIDIA] Add operator== for dim3, KernelNode and NodeParams
* [NVIDIA] Update Run() of *TopologyRunners to take non-const context reference
* [NVIDIA] Remove TiCudaGraphInfo, add set_current_graph(), add_new_graph_info(), get_current_graph_info(), select_current_graph()
* [NVIDIA] Change IsCudaGraphCompatible() interface to GetCudaGraphCompatibility() using enum
* [NVIDIA] Add ExecuteGraph() to IOperationExec/OperationBase
* [NVIDIA] Remove paramsGraph_/resultsGraph_ from CudaGraphInfo
* [NVIDIA] Add multi-graph support for TI
* [NVIDIA] Add multi-graph TI tests
* [NVIDIA] Update CudaGraphTopologyRunnerTest

---------

Co-authored-by: Pavel Durandin
---
 modules/nvidia_plugin/src/cuda/graph.cpp | 3 +
 modules/nvidia_plugin/src/cuda/graph.hpp | 2 +
 .../nvidia_plugin/src/cuda/node_params.hpp | 9 +
 modules/nvidia_plugin/src/cuda/utils.hpp | 15 +
 .../src/cuda_eager_topology_runner.cpp | 24 -
 .../src/cuda_eager_topology_runner.hpp | 17 +-
 .../nvidia_plugin/src/cuda_graph_context.cpp | 179 +++----
 .../nvidia_plugin/src/cuda_graph_context.hpp | 208 +++++++--
 .../src/cuda_graph_topology_runner.cpp | 99 ++--
 .../src/cuda_graph_topology_runner.hpp | 21 +-
 .../src/cuda_iexecution_delegator.hpp | 12 +
 .../src/cuda_inference_request_context.hpp | 8 +
 .../src/cuda_itopology_runner.hpp | 15 +-
 .../nvidia_plugin/src/cuda_operation_base.hpp | 30 +-
 modules/nvidia_plugin/src/cuda_profiler.cpp | 12 +
 modules/nvidia_plugin/src/cuda_profiler.hpp | 19 +
 .../src/cuda_simple_execution_delegator.hpp | 19 +
 .../memory_manager/cuda_device_mem_block.hpp | 2 -
 .../src/ops/activation_forward_cudnn_base.cpp | 4 +-
 .../src/ops/activation_forward_cudnn_base.hpp | 2 +-
 modules/nvidia_plugin/src/ops/avgpool.cpp | 2 +-
 modules/nvidia_plugin/src/ops/avgpool.hpp | 2 +-
 modules/nvidia_plugin/src/ops/broadcast.cpp | 2 +-
 modules/nvidia_plugin/src/ops/broadcast.hpp | 2 +-
 modules/nvidia_plugin/src/ops/clamp_cuda.cpp | 2 +-
 modules/nvidia_plugin/src/ops/clamp_cuda.hpp | 2 +-
 modules/nvidia_plugin/src/ops/clamp_cudnn.cpp | 2 +-
 modules/nvidia_plugin/src/ops/clamp_cudnn.hpp | 2 +-
 modules/nvidia_plugin/src/ops/comparison.cpp | 2 +-
 modules/nvidia_plugin/src/ops/comparison.hpp | 2 +-
 modules/nvidia_plugin/src/ops/concat.cpp | 2 +-
 modules/nvidia_plugin/src/ops/concat.hpp | 2 +-
 modules/nvidia_plugin/src/ops/convert.cpp | 2 +-
 modules/nvidia_plugin/src/ops/convert.hpp | 2 +-
 .../src/ops/convert_color_i420.hpp | 2 +-
 .../src/ops/convert_color_nv12.hpp | 2 +-
 .../src/ops/convolution_backprop_data.cpp | 4 +-
 .../src/ops/convolution_backprop_data.hpp | 2 +-
 .../src/ops/convolution_cudnn.cpp | 2 +-
 .../src/ops/convolution_cudnn.hpp | 2 +-
 .../src/ops/convolution_cudnn_be.cpp | 2 +-
 .../src/ops/convolution_cudnn_be.hpp | 2 +-
 .../src/ops/cudnn_tensor_op_base.cpp | 2 +-
 .../src/ops/cudnn_tensor_op_base.hpp | 2 +-
 .../src/ops/detection_output.cpp | 2 +-
 .../src/ops/detection_output.hpp | 2 +-
.../src/ops/elementwise_binary.hpp | 2 +- .../src/ops/elementwise_unary.hpp | 2 +- modules/nvidia_plugin/src/ops/elu.cpp | 2 +- modules/nvidia_plugin/src/ops/elu.hpp | 2 +- .../nvidia_plugin/src/ops/fake_quantize.cpp | 2 +- .../nvidia_plugin/src/ops/fake_quantize.hpp | 2 +- .../nvidia_plugin/src/ops/fully_connected.cpp | 2 +- .../nvidia_plugin/src/ops/fully_connected.hpp | 2 +- .../ops/fused_convolution_backprop_data.cpp | 4 +- .../ops/fused_convolution_backprop_data.hpp | 2 +- .../src/ops/fused_convolution_cudnn.cpp | 2 +- .../src/ops/fused_convolution_cudnn.hpp | 2 +- .../src/ops/fused_convolution_cudnn_be.cpp | 4 +- .../src/ops/fused_convolution_cudnn_be.hpp | 2 +- .../fused_convolution_cudnn_decomposed.cpp | 4 +- .../fused_convolution_cudnn_decomposed.hpp | 2 +- modules/nvidia_plugin/src/ops/gather.cpp | 2 +- modules/nvidia_plugin/src/ops/gather.hpp | 2 +- .../src/ops/group_convolution.cpp | 2 +- .../src/ops/group_convolution.hpp | 2 +- modules/nvidia_plugin/src/ops/gru_cell.cpp | 2 +- modules/nvidia_plugin/src/ops/gru_cell.hpp | 2 +- .../nvidia_plugin/src/ops/gru_sequence.cpp | 6 +- .../nvidia_plugin/src/ops/gru_sequence.hpp | 4 +- .../src/ops/interpolate_cubic.cpp | 2 +- .../src/ops/interpolate_cubic.hpp | 2 +- .../src/ops/interpolate_linear.cpp | 2 +- .../src/ops/interpolate_linear.hpp | 2 +- .../src/ops/interpolate_nearest.cpp | 2 +- .../src/ops/interpolate_nearest.hpp | 2 +- modules/nvidia_plugin/src/ops/logical_not.cpp | 2 +- modules/nvidia_plugin/src/ops/logical_not.hpp | 2 +- modules/nvidia_plugin/src/ops/lstm_cell.cpp | 2 +- modules/nvidia_plugin/src/ops/lstm_cell.hpp | 2 +- .../src/ops/lstm_sequence_base.cpp | 6 +- .../src/ops/lstm_sequence_base.hpp | 4 +- modules/nvidia_plugin/src/ops/matmul.cpp | 2 +- modules/nvidia_plugin/src/ops/matmul.hpp | 2 +- modules/nvidia_plugin/src/ops/maxpool.cpp | 2 +- modules/nvidia_plugin/src/ops/maxpool.hpp | 2 +- modules/nvidia_plugin/src/ops/mvn.cpp | 2 +- modules/nvidia_plugin/src/ops/mvn.hpp | 2 +- modules/nvidia_plugin/src/ops/nop_op.hpp | 2 +- modules/nvidia_plugin/src/ops/pad.cpp | 2 +- modules/nvidia_plugin/src/ops/pad.hpp | 2 +- modules/nvidia_plugin/src/ops/parameter.cpp | 2 +- modules/nvidia_plugin/src/ops/parameter.hpp | 2 +- modules/nvidia_plugin/src/ops/range.cpp | 2 +- modules/nvidia_plugin/src/ops/range.hpp | 2 +- modules/nvidia_plugin/src/ops/reduce.cpp | 2 +- modules/nvidia_plugin/src/ops/reduce.hpp | 2 +- modules/nvidia_plugin/src/ops/result.cpp | 2 +- modules/nvidia_plugin/src/ops/result.hpp | 2 +- modules/nvidia_plugin/src/ops/round.cpp | 2 +- modules/nvidia_plugin/src/ops/round.hpp | 2 +- .../src/ops/scatter_nd_update.cpp | 2 +- .../src/ops/scatter_nd_update.hpp | 2 +- modules/nvidia_plugin/src/ops/select.cpp | 2 +- modules/nvidia_plugin/src/ops/select.hpp | 2 +- modules/nvidia_plugin/src/ops/softmax.cpp | 2 +- modules/nvidia_plugin/src/ops/softmax.hpp | 2 +- modules/nvidia_plugin/src/ops/split.cpp | 2 +- modules/nvidia_plugin/src/ops/split.hpp | 2 +- .../nvidia_plugin/src/ops/strided_slice.cpp | 4 +- .../nvidia_plugin/src/ops/strided_slice.hpp | 2 +- modules/nvidia_plugin/src/ops/subgraph.cpp | 99 ++-- modules/nvidia_plugin/src/ops/subgraph.hpp | 47 +- modules/nvidia_plugin/src/ops/swish.cpp | 2 +- modules/nvidia_plugin/src/ops/swish.hpp | 2 +- .../nvidia_plugin/src/ops/tensor_iterator.cpp | 435 ++++++++++++++---- .../nvidia_plugin/src/ops/tensor_iterator.hpp | 183 +++++++- modules/nvidia_plugin/src/ops/topk.cpp | 2 +- modules/nvidia_plugin/src/ops/topk.hpp | 2 +- modules/nvidia_plugin/src/ops/transpose.cpp | 2 +- 
modules/nvidia_plugin/src/ops/transpose.hpp | 2 +- .../nvidia_plugin/src/ops/variadic_split.cpp | 2 +- .../nvidia_plugin/src/ops/variadic_split.hpp | 2 +- ...tible.cpp => cuda_graph_compatibility.cpp} | 12 +- .../unit/cuda_graph_topology_runner_test.cpp | 41 +- .../tests/unit/cuda_multi_graph_test.cpp | 4 +- .../tests/unit/cuda_multi_graph_ti_test.cpp | 376 +++++++++++++++ 127 files changed, 1654 insertions(+), 463 deletions(-) create mode 100644 modules/nvidia_plugin/src/cuda/utils.hpp delete mode 100644 modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp rename modules/nvidia_plugin/tests/unit/{is_cuda_graph_compatible.cpp => cuda_graph_compatibility.cpp} (95%) create mode 100644 modules/nvidia_plugin/tests/unit/cuda_multi_graph_ti_test.cpp diff --git a/modules/nvidia_plugin/src/cuda/graph.cpp b/modules/nvidia_plugin/src/cuda/graph.cpp index 3428c7130..93ee1190a 100644 --- a/modules/nvidia_plugin/src/cuda/graph.cpp +++ b/modules/nvidia_plugin/src/cuda/graph.cpp @@ -189,4 +189,7 @@ bool CUDA::TransferNode::operator==(const TransferNode& rhs) const { return size_ == rhs.size_ && src_.get() == rhs.src_.get() && dst_.get() == rhs.dst_.get() && node_ == rhs.node_; } +bool KernelNode::operator==(const KernelNode& rhs) const { + return node_ == rhs.node_ && node_params_ == rhs.node_params_; +} } // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda/graph.hpp b/modules/nvidia_plugin/src/cuda/graph.hpp index 4cfeeaa04..4360af27b 100644 --- a/modules/nvidia_plugin/src/cuda/graph.hpp +++ b/modules/nvidia_plugin/src/cuda/graph.hpp @@ -148,6 +148,8 @@ class KernelNode { throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, &node_params_.get_knp())); } + bool operator==(const KernelNode& rhs) const; + private: KernelNode(cudaGraphNode_t node, CUDA::NodeParams&& params); diff --git a/modules/nvidia_plugin/src/cuda/node_params.hpp b/modules/nvidia_plugin/src/cuda/node_params.hpp index 2edc20139..aadea48fa 100644 --- a/modules/nvidia_plugin/src/cuda/node_params.hpp +++ b/modules/nvidia_plugin/src/cuda/node_params.hpp @@ -6,6 +6,7 @@ #include +#include #include namespace CUDA { @@ -33,9 +34,17 @@ struct NodeParams { void reset_args() { ptrs_.clear(); } + friend bool operator==(const NodeParams& lhs, const NodeParams& rhs); + private: std::vector ptrs_; cudaKernelNodeParams knp_; }; +inline bool operator==(const NodeParams& lhs, const NodeParams& rhs) { + return lhs.ptrs_ == rhs.ptrs_ && rhs.knp_.func == lhs.knp_.func && rhs.knp_.gridDim == lhs.knp_.gridDim && + rhs.knp_.blockDim == lhs.knp_.blockDim && rhs.knp_.sharedMemBytes == lhs.knp_.sharedMemBytes && + rhs.knp_.extra == lhs.knp_.extra; +} + } // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda/utils.hpp b/modules/nvidia_plugin/src/cuda/utils.hpp new file mode 100644 index 000000000..1ac504076 --- /dev/null +++ b/modules/nvidia_plugin/src/cuda/utils.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace CUDA { + +inline bool operator==(dim3 rhs, dim3 lhs) { return rhs.x == lhs.x && rhs.y == lhs.y && rhs.z == lhs.z; } + +inline bool operator!=(dim3 rhs, dim3 lhs) { return !(rhs == lhs); } + +} // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp deleted file mode 100644 index 383cff255..000000000 --- a/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (C) 2018-2023 Intel 
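The new cuda/utils.hpp and node_params.hpp hunks above add component-wise equality for dim3 and CUDA::NodeParams so that captured kernel nodes can be compared. A minimal, self-contained sketch of the same dim3 comparison idiom; only the CUDA runtime header is assumed, and same_dims is a hypothetical name rather than the plugin's CUDA::operator==:

```cpp
#include <cassert>
#include <cuda_runtime.h>  // defines dim3

// Component-wise comparison of launch configurations, mirroring the
// operators added in cuda/utils.hpp (hypothetical free function).
inline bool same_dims(dim3 lhs, dim3 rhs) {
    return lhs.x == rhs.x && lhs.y == rhs.y && lhs.z == rhs.z;
}

int main() {
    const dim3 grid_a{16, 16, 1};
    const dim3 grid_b{16, 16, 1};
    const dim3 grid_c{32, 1, 1};
    assert(same_dims(grid_a, grid_b));   // identical launch configuration
    assert(!same_dims(grid_a, grid_c));  // different grid, so kernel node params differ
    return 0;
}
```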
Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "cuda_eager_topology_runner.hpp" - -namespace ov { -namespace nvidia_gpu { - -EagerTopologyRunner::EagerTopologyRunner(const CreationContext& context, const std::shared_ptr& model) - : SubGraph(context, model) {} - -void EagerTopologyRunner::Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { - Workbuffers workbuffers{}; - workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); - SubGraph::Execute(context, {}, {}, workbuffers); -} - -const SubGraph& EagerTopologyRunner::GetSubGraph() const { - return *this; -} - -} // namespace nvidia_gpu -} // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp index 67230df6b..72c60b52f 100644 --- a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp @@ -13,12 +13,23 @@ namespace nvidia_gpu { class EagerTopologyRunner final : public SubGraph, public ITopologyRunner { public: - EagerTopologyRunner(const CreationContext& context, const std::shared_ptr& model); + EagerTopologyRunner(const CreationContext& context, const std::shared_ptr& model) : SubGraph(context, model) {} ~EagerTopologyRunner() override = default; - void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; + void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override { + Workbuffers workbuffers{}; + workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); + SubGraph::Execute(context, {}, {}, workbuffers); + } + + void Run(InferenceRequestContext& context, const Workbuffers& workbuffers) const override{}; + + void Capture(InferenceRequestContext& context, const Workbuffers& workbuffers) const override{}; void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override{}; - const SubGraph& GetSubGraph() const override; + + const SubGraph& GetSubGraph() const override { return *this; } + + std::size_t GetCudaGraphsCount() const override { return 0; } }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/cuda_graph_context.cpp b/modules/nvidia_plugin/src/cuda_graph_context.cpp index e1f9e2487..82c00129c 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.cpp @@ -7,126 +7,151 @@ namespace ov { namespace nvidia_gpu { -void CudaGraphContext::reset() { - graphs_.clear(); - currentGraphIndex_ = 0; +void CudaGraphInfo::reset() { + graph_.reset(); + graphExec_.reset(); + parameterNodes_.clear(); + resultNodes_.clear(); + transferNodes_.clear(); + kernelNodes_.clear(); } -void CudaGraphContext::start_next_graph_addition() { - currentGraphIndex_ = graphs_.size(); - graphs_.emplace_back(); +void CudaGraphInfo::add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + parameterNodes_.emplace(tensorName, captureInfo.addUploadNode(dst, src, size)); +} + +void CudaGraphInfo::add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size)); +} + +void CudaGraphInfo::add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer 
dst, + CUDA::DevicePointer src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + transferNodes_.emplace_back(captureInfo.addTransferNode(dst, src, size)); +} + +bool CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); } + +void CudaGraphInfo::update_capture(const TensorMappingContext& context) { + for (auto&& [tensorName, node] : parameterNodes_) { + node.update_src(graphExec_.value(), (context.get_input_tensor(tensorName)->data())); + } + for (auto&& [tensorName, node] : resultNodes_) { + node.update_dst(graphExec_.value(), context.get_output_tensor(tensorName)->data()); + } +} + +std::size_t CudaGraphInfo::get_graphs_count() const { return is_initialized() ? 1 : 0; } + +void CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); } + +void CudaGraphPack::reset() { + graphs_.clear(); + currentGraphIndex_ = 0; } -void CudaGraphContext::add_parameter(const std::string& tensorName, +void CudaGraphPack::add_parameter(const std::string& tensorName, const CUDA::Stream& stream, CUDA::DevicePointer dst, const void* src, std::size_t size) { OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); - graphs_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size); + graphs_[currentGraphIndex_]->add_parameter(tensorName, stream, dst, src, size); } -void CudaGraphContext::add_result(const std::string& tensorName, +void CudaGraphPack::add_result(const std::string& tensorName, const CUDA::Stream& stream, void* dst, CUDA::DevicePointer src, std::size_t size) { OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); - graphs_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size); + graphs_[currentGraphIndex_]->add_result(tensorName, stream, dst, src, size); +} + +void CudaGraphPack::add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) { + graphs_[currentGraphIndex_]->add_transfer(stream, dst, src, size); } -void CudaGraphContext::add_graph(const CUDA::Graph& graph) { +void CudaGraphPack::set_current_graph(const CUDA::Graph& graph) { OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); - graphs_[currentGraphIndex_].set_graph(graph); + graphs_[currentGraphIndex_]->set_current_graph(graph); } -bool CudaGraphContext::is_initialized() const { +bool CudaGraphPack::is_initialized() const { const auto size = graphs_.size(); - return size != 0 && graphs_[size - 1].is_initialized(); + return size != 0 && graphs_[size - 1]->is_initialized(); } -void CudaGraphContext::update_capture(const TensorMappingContext& context) { +void CudaGraphPack::update_capture(const TensorMappingContext& context) { for (currentGraphIndex_ = 0; currentGraphIndex_ < graphs_.size(); ++currentGraphIndex_) { - graphs_[currentGraphIndex_].update_capture(context); - } -} - -void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const { - currentGraphIndex_ = index; - OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); - graphs_[currentGraphIndex_].launch(stream); -} - -std::size_t CudaGraphContext::get_params_count() const { - std::size_t res = 0; - for (const auto& graph : graphs_) { - res += graph.get_params_count(); + graphs_[currentGraphIndex_]->update_capture(context); } - return res; } -std::size_t CudaGraphContext::get_results_count() const { - std::size_t res = 0; - for (const 
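update_capture() above re-points the parameter and result copy nodes of an already instantiated graph at the tensors of the current request, so the graph can be replayed without re-capturing. Below is a sketch of the underlying CUDA runtime idiom, under the assumption that the plugin's upload/download wrappers reduce to 1D memcpy nodes; CapturedCopyNode and its members are illustrative, not the plugin's UploadNode/DownloadNode API:

```cpp
#include <cuda_runtime.h>

// Hypothetical stand-in for a captured host<->device copy node whose
// endpoints can be retargeted inside an instantiated graph.
struct CapturedCopyNode {
    cudaGraphNode_t node{};
    size_t size{};

    // Re-point the host source of an upload node (conceptually what
    // update_src() does for a parameter node).
    cudaError_t update_src(cudaGraphExec_t exec, void* device_dst, const void* new_host_src) const {
        return cudaGraphExecMemcpyNodeSetParams1D(exec, node, device_dst, new_host_src, size,
                                                  cudaMemcpyHostToDevice);
    }

    // Re-point the host destination of a download node (conceptually what
    // update_dst() does for a result node).
    cudaError_t update_dst(cudaGraphExec_t exec, void* new_host_dst, const void* device_src) const {
        return cudaGraphExecMemcpyNodeSetParams1D(exec, node, new_host_dst, device_src, size,
                                                  cudaMemcpyDeviceToHost);
    }
};
```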
auto& graph : graphs_) { - res += graph.get_results_count(); - } - return res; +ICudaGraphInfo& CudaGraphPack::add(std::shared_ptr ptr) { + currentGraphIndex_ = graphs_.size(); + graphs_.emplace_back(ptr); + return *graphs_.back(); } -std::size_t CudaGraphContext::get_graphs_count() const { return graphs_.size(); } +ICudaGraphInfo& CudaGraphPack::get_current_graph() { return *graphs_[currentGraphIndex_]; } -void CudaGraphContext::CudaGraphInfo::add_parameter(const std::string& tensorName, - const CUDA::Stream& stream, - CUDA::DevicePointer dst, - const void* src, - std::size_t size) { - CUDA::CaptureInfo captureInfo{stream}; - parameterNodes_.emplace(tensorName, captureInfo.addUploadNode(dst, src, size)); +void CudaGraphPack::select_current_graph(std::size_t index) { + OPENVINO_ASSERT(index < graphs_.size(), "Graph index/vector size incosistency"); + currentGraphIndex_ = index; } -void CudaGraphContext::CudaGraphInfo::add_result(const std::string& tensorName, - const CUDA::Stream& stream, - void* dst, - CUDA::DevicePointer src, - std::size_t size) { - CUDA::CaptureInfo captureInfo{stream}; - resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size)); +std::size_t CudaGraphPack::get_params_count() const { + return std::accumulate( + graphs_.begin(), graphs_.end(), static_cast(0), [](auto sum, const auto& graph) { + return sum + graph->get_params_count(); + }); } -void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) { - graph_.emplace(graph); - graphExec_.emplace(graph); +std::size_t CudaGraphPack::get_results_count() const { + return std::accumulate( + graphs_.begin(), graphs_.end(), static_cast(0), [](auto sum, const auto& graph) { + return sum + graph->get_results_count(); + }); } -bool CudaGraphContext::CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); } - -void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext& context) { - for (auto&& [tensorName, node] : parameterNodes_) { - node.update_src(graphExec_.value(), (context.get_input_tensor(tensorName)->data())); - } - for (auto&& [tensorName, node] : resultNodes_) { - node.update_dst(graphExec_.value(), context.get_output_tensor(tensorName)->data()); - } +std::size_t CudaGraphPack::get_transfers_count() const { + return std::accumulate( + graphs_.begin(), graphs_.end(), static_cast(0), [](auto sum, const auto& graph) { + return sum + graph->get_transfers_count(); + }); } -void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); } - -std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); } - -std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return resultNodes_.size(); } - -bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) { - return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ && - lhs.resultNodes_ == rhs.resultNodes_; +std::size_t CudaGraphPack::get_kernels_count() const { + return std::accumulate( + graphs_.begin(), graphs_.end(), static_cast(0), [](auto sum, const auto& graph) { + return sum + graph->get_kernels_count(); + }); } -bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) { - return !(lhs == rhs); +std::size_t CudaGraphPack::get_graphs_count() const { + return std::accumulate( + graphs_.begin(), graphs_.end(), static_cast(0), 
[](auto sum, const auto& graph) { + return sum + graph->get_graphs_count(); + }); } -bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return lhs.graphs_ == rhs.graphs_; } - -bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return !(lhs == rhs); } +void CudaGraphPack::launch(const CUDA::Stream& stream) const { graphs_[currentGraphIndex_]->launch(stream); } } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_graph_context.hpp b/modules/nvidia_plugin/src/cuda_graph_context.hpp index c0ca01e18..b058af53c 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.hpp @@ -11,89 +11,199 @@ namespace ov { namespace nvidia_gpu { -class CudaGraphContext { +class ICudaGraphInfo { public: - void reset(); + virtual ~ICudaGraphInfo() = 0; - void start_next_graph_addition(); + virtual void reset() = 0; + + virtual void add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size) = 0; + + virtual void add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size) = 0; + + virtual void add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) = 0; + + template + void add_kernel(const CUDA::Stream& stream, void* kernel, dim3 gridDim, dim3 blockDim, Args&&... args) { + CUDA::CaptureInfo captureInfo{stream}; + get_kernels().emplace_back(captureInfo.addKernelNode(kernel, gridDim, blockDim, std::forward(args)...)); + } + + template + void update_kernel(std::size_t index, Args&&... args) { + get_kernels()[index].update_params(get_graph_exec().value(), std::forward(args)...); + } + + virtual void set_current_graph(const CUDA::Graph& graph) = 0; + + virtual bool is_initialized() const = 0; + virtual bool is_nested() const = 0; + + virtual void update_capture(const TensorMappingContext& context) = 0; + + virtual ICudaGraphInfo& add(std::shared_ptr ptr) = 0; + + virtual ICudaGraphInfo& get_current_graph() = 0; + + virtual void select_current_graph(std::size_t index) = 0; + + virtual std::size_t get_params_count() const = 0; + virtual std::size_t get_results_count() const = 0; + virtual std::size_t get_transfers_count() const = 0; + virtual std::size_t get_kernels_count() const = 0; + + virtual std::size_t get_graphs_count() const = 0; + + virtual void launch(const CUDA::Stream& stream) const = 0; + + virtual std::vector& get_kernels() = 0; + virtual std::optional& get_graph_exec() = 0; +}; + +inline ICudaGraphInfo::~ICudaGraphInfo() = default; + +class CudaGraphInfo : public ICudaGraphInfo { +public: + CudaGraphInfo() = default; + CudaGraphInfo(const CudaGraphInfo&) = delete; + CudaGraphInfo& operator=(const CudaGraphInfo&) = delete; + + static std::shared_ptr create() { return std::make_shared(); } + + void reset() override; void add_parameter(const std::string& tensorName, const CUDA::Stream& stream, CUDA::DevicePointer dst, const void* src, - std::size_t size); + std::size_t size) override; void add_result(const std::string& tensorName, const CUDA::Stream& stream, void* dst, CUDA::DevicePointer src, - std::size_t size); + std::size_t size) override; + + void add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) override; + + void set_current_graph(const CUDA::Graph& graph) override { + graph_.emplace(graph); + 
graphExec_.emplace(graph); + } - void add_graph(const CUDA::Graph& graph); + bool is_initialized() const override; + bool is_nested() const override { return false; }; - bool is_initialized() const; + void update_capture(const TensorMappingContext& context) override; - void update_capture(const TensorMappingContext& context); + ICudaGraphInfo& add(std::shared_ptr ptr) override { + OPENVINO_THROW("add() called for CudaGraphInfo"); + } - void launch(std::size_t index, const CUDA::Stream& stream) const; + ICudaGraphInfo& get_current_graph() override { return *this; } - std::size_t get_params_count() const; - std::size_t get_results_count() const; - std::size_t get_graphs_count() const; + void select_current_graph(std::size_t index) override { + OPENVINO_THROW("select_current_graph() called for CudaGraphInfo"); + } - friend bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs); - friend bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs); + std::size_t get_params_count() const override { return parameterNodes_.size(); } + std::size_t get_results_count() const override { return resultNodes_.size(); } + std::size_t get_transfers_count() const override { return transferNodes_.size(); } + std::size_t get_kernels_count() const override { return kernelNodes_.size(); } + + std::size_t get_graphs_count() const override; + + void launch(const CUDA::Stream& stream) const override; + + std::vector& get_kernels() override { return kernelNodes_; }; + std::optional& get_graph_exec() override { return graphExec_; }; + + const std::map& get_parameter_nodes() const { return parameterNodes_; } + const std::map& get_result_nodes() const { return resultNodes_; } private: - class CudaGraphInfo { - public: - void add_parameter(const std::string& tensorName, - const CUDA::Stream& stream, - CUDA::DevicePointer dst, - const void* src, - std::size_t size); + std::optional graph_{}; + std::optional graphExec_{}; - void add_result(const std::string& tensorName, - const CUDA::Stream& stream, - void* dst, - CUDA::DevicePointer src, - std::size_t size); + std::map parameterNodes_; + std::map resultNodes_; - void set_graph(const CUDA::Graph& graph); + std::vector transferNodes_; + std::vector kernelNodes_; +}; - bool is_initialized() const; +class CudaGraphPack : public ICudaGraphInfo { +public: + CudaGraphPack() = default; + CudaGraphPack(const CudaGraphPack&) = delete; + CudaGraphPack& operator=(const CudaGraphPack&) = delete; - void update_capture(const TensorMappingContext& context); + static std::shared_ptr create() { return std::make_shared(); } - void launch(const CUDA::Stream& stream) const; + void reset() override; - std::size_t get_params_count() const; - std::size_t get_results_count() const; + void add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size) override; - friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); - friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + void add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size) override; - private: - std::optional graph_{}; - std::optional graphExec_{}; - std::map parameterNodes_; - std::map resultNodes_; - }; + void add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) override; - friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& 
rhs); - friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + void set_current_graph(const CUDA::Graph& graph) override; - std::vector graphs_{}; - mutable std::size_t currentGraphIndex_ = 0; -}; + bool is_initialized() const override; + bool is_nested() const override { return true; }; + + void update_capture(const TensorMappingContext& context) override; + + ICudaGraphInfo& add(std::shared_ptr ptr) override; + + ICudaGraphInfo& get_current_graph() override; -bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs); + void select_current_graph(std::size_t index) override; -bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs); + std::size_t get_params_count() const override; + std::size_t get_results_count() const override; + std::size_t get_transfers_count() const override; + std::size_t get_kernels_count() const override; -bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs); + std::size_t get_graphs_count() const override; + + void launch(const CUDA::Stream& stream) const override; + + std::vector& get_kernels() override { return graphs_[currentGraphIndex_]->get_kernels(); }; + std::optional& get_graph_exec() override { return graphs_[currentGraphIndex_]->get_graph_exec(); }; + +private: + std::vector> graphs_{}; + std::size_t currentGraphIndex_ = 0; +}; -bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs); +using CudaGraphContext = CudaGraphPack; } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp index 1e851ec41..ff21a4ea6 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp @@ -5,79 +5,115 @@ #include "cuda_graph_topology_runner.hpp" #include "cuda/event.hpp" +#include "ops/tensor_iterator.hpp" namespace ov { namespace nvidia_gpu { -CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, - const std::shared_ptr& model) - : orig_subgraph_{context, model}, - cuda_graphs_count_{0} { +CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, const SubGraph& subgraph) + : orig_subgraph_(subgraph), cuda_graphs_count_{0} { std::vector sequences; SubGraph::ExecSequence currentSequence; const auto& origSequence = orig_subgraph_.getExecSequence(); const auto totalSize = origSequence.size(); OPENVINO_ASSERT(totalSize != 0, "ExecSequence size is 0"); - bool isLastOpCompatible = origSequence[0]->IsCudaGraphCompatible(); + CudaGraphCompatibility lastOpCompatibility = origSequence[0]->GetCudaGraphCompatibility(); currentSequence.push_back(origSequence[0]); - for (size_t i = 1; i < totalSize; ++i) { + for (std::size_t i = 1; i < totalSize; ++i) { const auto& op = origSequence[i]; - if (op->IsCudaGraphCompatible() != isLastOpCompatible) { - isLastOpCompatible = !isLastOpCompatible; + auto comp = op->GetCudaGraphCompatibility(); + if (comp != lastOpCompatibility || comp == CudaGraphCompatibility::SPECIAL) { + lastOpCompatibility = comp; sequences.emplace_back(std::move(currentSequence)); currentSequence.clear(); } + if (comp == CudaGraphCompatibility::SPECIAL) { + auto sg = std::dynamic_pointer_cast(op); + sg->initializeRunner(); + cuda_graphs_count_ += sg->GetCudaGraphsCount(); + } currentSequence.push_back(op); } sequences.emplace_back(std::move(currentSequence)); + const auto& model = 
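CudaGraphInfo and CudaGraphPack above form a small composite behind the shared ICudaGraphInfo interface: a leaf holds at most one captured graph, while a pack owns children and aggregates their counts, which is what lets a TensorIterator body carry several nested graphs. A stripped-down, self-contained sketch of that shape (counting only; capture and launch details omitted):

```cpp
#include <cstddef>
#include <memory>
#include <numeric>
#include <vector>

struct IGraphInfo {
    virtual ~IGraphInfo() = default;
    virtual bool is_nested() const = 0;
    virtual std::size_t get_graphs_count() const = 0;
};

// Leaf: corresponds to CudaGraphInfo, which holds at most one captured graph.
struct GraphInfo : IGraphInfo {
    bool initialized = false;
    bool is_nested() const override { return false; }
    std::size_t get_graphs_count() const override { return initialized ? 1 : 0; }
};

// Composite: corresponds to CudaGraphPack, which owns children and sums their
// counts, mirroring the std::accumulate loops in cuda_graph_context.cpp.
struct GraphPack : IGraphInfo {
    std::vector<std::shared_ptr<IGraphInfo>> children;
    bool is_nested() const override { return true; }
    std::size_t get_graphs_count() const override {
        return std::accumulate(children.begin(), children.end(), std::size_t{0},
                               [](std::size_t sum, const auto& child) { return sum + child->get_graphs_count(); });
    }
};
```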
orig_subgraph_.getModel(); const auto& memoryManager = orig_subgraph_.memoryManager(); - for (auto&& sequence : sequences) { - subgraphs_.emplace_back(context, model, std::move(sequence), memoryManager); - if (subgraphs_[subgraphs_.size() - 1].IsCudaGraphCompatible()) { + for (const auto& sequence : sequences) { + subgraphs_.emplace_back(context, model, sequence, memoryManager); + if (subgraphs_.back().GetCudaGraphCompatibility() == CudaGraphCompatibility::FULL) { ++cuda_graphs_count_; } } } -void CudaGraphTopologyRunner::Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { +CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, + const std::shared_ptr& model) + : CudaGraphTopologyRunner(context, {context, model}) {} + +CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, + const std::shared_ptr& model, + const SubGraph::ExecSequence& sequence, + const std::shared_ptr& memoryManager) + : CudaGraphTopologyRunner(context, {context, model, sequence, memoryManager}) {} + +void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const Workbuffers& workbuffers) const { const auto& stream = context.getThreadContext().stream(); + auto& graphPack = context.getCurrentCudaGraphInfo(); std::size_t graphIndex = 0; for (auto& subgraph : subgraphs_) { - if (subgraph.IsCudaGraphCompatible()) { - context.getCudaGraphContext().launch(graphIndex, stream); + auto compatibility = subgraph.GetCudaGraphCompatibility(); + if (compatibility == CudaGraphCompatibility::FULL) { + graphPack.select_current_graph(graphIndex); + graphPack.launch(stream); + graphIndex++; + } else if (compatibility == CudaGraphCompatibility::SPECIAL) { + graphPack.select_current_graph(graphIndex); + context.setCurrentCudaGraphInfo(graphPack.get_current_graph()); + subgraph.ExecuteGraph(context, {}, {}, workbuffers); graphIndex++; } else { - Workbuffers workbuffers{}; - workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); subgraph.Execute(context, {}, {}, workbuffers); } } } -void CudaGraphTopologyRunner::Capture(InferenceRequestContext& context, - const DeviceMemBlock& memoryBlock) const { - const auto& stream = context.getThreadContext().stream(); - auto& graphContext = context.getCudaGraphContext(); +void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { + Workbuffers workbuffers{}; + workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); + context.setCurrentCudaGraphInfo(context.getCudaGraphContext()); + Run(context, workbuffers); +} - graphContext.reset(); +void CudaGraphTopologyRunner::Capture(InferenceRequestContext& context, const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); + auto& graphPack = context.getCurrentCudaGraphInfo(); + graphPack.reset(); for (const auto& subgraph : subgraphs_) { - if (subgraph.IsCudaGraphCompatible()) { - graphContext.start_next_graph_addition(); + auto compatibility = subgraph.GetCudaGraphCompatibility(); + if (compatibility == CudaGraphCompatibility::FULL) { + graphPack.add(CudaGraphInfo::create()); CUDA::GraphCapture capture{stream}; { auto scope = capture.getScope(); - Workbuffers workbuffers{}; - workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); subgraph.Capture(context, {}, {}, workbuffers); } - const auto& graph = capture.getGraph(); - graphContext.add_graph(graph); + graphPack.set_current_graph(capture.getGraph()); + } else if (compatibility == 
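Run() above walks the partitioned subgraphs and dispatches on the compatibility each one reported at construction time. Condensed, the control flow is roughly the following; the Subgraph type and its methods are stand-ins for the plugin classes, not the actual API:

```cpp
#include <cstddef>
#include <vector>

enum class CudaGraphCompatibility { NONE, FULL, SPECIAL };

struct Subgraph {
    CudaGraphCompatibility mode;
    void launch_captured_graph() const {}  // stand-in for graphPack.launch(stream)
    void execute_graph() const {}          // stand-in for subgraph.ExecuteGraph(...)
    void execute_eagerly() const {}        // stand-in for subgraph.Execute(...)
};

inline void run_all(const std::vector<Subgraph>& subgraphs) {
    std::size_t graphIndex = 0;  // selects the matching entry in the graph pack
    for (const auto& sg : subgraphs) {
        switch (sg.mode) {
            case CudaGraphCompatibility::FULL:  // captured once, replayed as a whole
                sg.launch_captured_graph();
                ++graphIndex;
                break;
            case CudaGraphCompatibility::SPECIAL:  // e.g. TensorIterator: drives its own nested graphs
                sg.execute_graph();
                ++graphIndex;
                break;
            case CudaGraphCompatibility::NONE:  // not capturable, fall back to eager execution
                sg.execute_eagerly();
                break;
        }
    }
}
```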
CudaGraphCompatibility::SPECIAL) { + auto& currentGraph = + hasNestedRunners() ? graphPack.add(CudaGraphContext::create()) : graphPack.add(CudaGraphInfo::create()); + context.setCurrentCudaGraphInfo(currentGraph); + subgraph.Capture(context, {}, {}, workbuffers); } } - OPENVINO_ASSERT(graphContext.get_graphs_count() == GetCudaGraphsCount(), - "CudaGraphTopologyRunner/CudaGraphContext graphs count mismatch"); + OPENVINO_ASSERT(cuda_graphs_count_ == graphPack.get_graphs_count()); +} + +void CudaGraphTopologyRunner::Capture(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { + Workbuffers workbuffers{}; + workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); + context.setCurrentCudaGraphInfo(context.getCudaGraphContext()); + Capture(context, workbuffers); } const SubGraph& CudaGraphTopologyRunner::GetSubGraph() const { @@ -86,6 +122,11 @@ const SubGraph& CudaGraphTopologyRunner::GetSubGraph() const { std::size_t CudaGraphTopologyRunner::GetCudaGraphsCount() const { return cuda_graphs_count_; } +bool CudaGraphTopologyRunner::hasNestedRunners() const { + return std::any_of( + subgraphs_.begin(), subgraphs_.end(), [](const SubGraph& sg) { return sg.hasTopologyRunners(); }); +} + void CudaGraphTopologyRunner::UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { if (context.getCudaGraphContext().is_initialized()) { UpdateCapture(context); diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp index 8e7cd1b85..d27af9393 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp @@ -4,7 +4,9 @@ #pragma once -#include "cuda_itopology_runner.hpp" +#include +#include +#include namespace ov { namespace nvidia_gpu { @@ -12,15 +14,28 @@ namespace nvidia_gpu { class CudaGraphTopologyRunner final : public ITopologyRunner { public: CudaGraphTopologyRunner(const CreationContext& context, const std::shared_ptr& model); + + CudaGraphTopologyRunner(const CreationContext& context, + const std::shared_ptr& model, + const SubGraph::ExecSequence& sequence, + const std::shared_ptr& memoryManager); + ~CudaGraphTopologyRunner() override = default; - void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; + void Run(InferenceRequestContext& context, const Workbuffers& workbuffers) const override; + void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; + + void Capture(InferenceRequestContext& context, const Workbuffers& workbuffers) const override; void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; + const SubGraph& GetSubGraph() const override; + std::size_t GetCudaGraphsCount() const override; - std::size_t GetCudaGraphsCount() const; + bool hasNestedRunners() const; private: + explicit CudaGraphTopologyRunner(const CreationContext& context, const SubGraph& subgraph); + void Capture(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const; void UpdateCapture(InferenceRequestContext& context) const; diff --git a/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp b/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp index 7147da7d3..c28bfb7c3 100644 --- a/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp +++ b/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp @@ -64,6 +64,18 @@ class IExecutionDelegator { const 
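Capture() above records each FULL subgraph inside a CUDA::GraphCapture scope and stores the result via set_current_graph(). That wrapper corresponds to the plain stream-capture idiom of the CUDA runtime, sketched below with error handling reduced to a single check macro; cudaGraphInstantiateWithFlags is used here as one possible instantiation call:

```cpp
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(expr)                                                              \
    do {                                                                              \
        const cudaError_t err_ = (expr);                                              \
        if (err_ != cudaSuccess) {                                                    \
            std::fprintf(stderr, "%s failed: %s\n", #expr, cudaGetErrorString(err_)); \
            std::exit(EXIT_FAILURE);                                                  \
        }                                                                             \
    } while (0)

// Capture whatever record_work() enqueues on the stream, instantiate it once,
// and return the executable graph; replaying it later is a single
// cudaGraphLaunch(exec, stream), which is what launch() on a captured
// subgraph amounts to.
template <typename RecordFn>
cudaGraphExec_t capture_and_instantiate(cudaStream_t stream, RecordFn record_work) {
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t exec = nullptr;
    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));
    record_work(stream);  // kernel launches, async copies, etc.
    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
    CUDA_CHECK(cudaGraphInstantiateWithFlags(&exec, graph, 0));
    CUDA_CHECK(cudaGraphDestroy(graph));  // the executable graph keeps its own copy
    return exec;
}
```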
Workbuffers::mutable_buffer& buffer, InferenceRequestContext& context) = 0; + /** + * Execute CUDA graph sequence from SubGraph class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to orkbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void execute_graph_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) = 0; + /** * Returns performance counters * @return Performance counters diff --git a/modules/nvidia_plugin/src/cuda_inference_request_context.hpp b/modules/nvidia_plugin/src/cuda_inference_request_context.hpp index 3e2bb26fb..142d1cc64 100644 --- a/modules/nvidia_plugin/src/cuda_inference_request_context.hpp +++ b/modules/nvidia_plugin/src/cuda_inference_request_context.hpp @@ -62,6 +62,13 @@ class InferenceRequestContext { [[nodiscard]] const CudaGraphContext& getCudaGraphContext() const { return cuda_graph_context_; } [[nodiscard]] CudaGraphContext& getCudaGraphContext() { return cuda_graph_context_; } + void setCurrentCudaGraphInfo(ICudaGraphInfo& info) { current_cuda_graph_info_ = &info; } + + ICudaGraphInfo& getCurrentCudaGraphInfo() { + OPENVINO_ASSERT(current_cuda_graph_info_, "current_cuda_graph_info_ is nullptr"); + return *current_cuda_graph_info_; + } + private: const ThreadContext& threadContext; CancellationToken& token; @@ -69,6 +76,7 @@ class InferenceRequestContext { const TensorMappingContext tensor_mapping_context_; CudaGraphContext& cuda_graph_context_; bool is_benchmark_mode_; + ICudaGraphInfo* current_cuda_graph_info_ = nullptr; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/cuda_itopology_runner.hpp b/modules/nvidia_plugin/src/cuda_itopology_runner.hpp index 04cb61bbc..c4a18f06b 100644 --- a/modules/nvidia_plugin/src/cuda_itopology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_itopology_runner.hpp @@ -4,16 +4,25 @@ #pragma once -#include +#include +#include namespace ov { namespace nvidia_gpu { +class SubGraph; + struct ITopologyRunner { - virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; + virtual ~ITopologyRunner() = default; + + virtual void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; + virtual void Run(InferenceRequestContext& context, const Workbuffers& workbuffers) const = 0; + + virtual void Capture(InferenceRequestContext& context, const Workbuffers& workbuffers) const = 0; virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; + virtual const SubGraph& GetSubGraph() const = 0; - virtual ~ITopologyRunner() = default; + virtual std::size_t GetCudaGraphsCount() const = 0; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/cuda_operation_base.hpp b/modules/nvidia_plugin/src/cuda_operation_base.hpp index 09118458f..fb2cb72be 100644 --- a/modules/nvidia_plugin/src/cuda_operation_base.hpp +++ b/modules/nvidia_plugin/src/cuda_operation_base.hpp @@ -28,6 +28,8 @@ namespace nvidia_gpu { template using DevicePointer = CUDA::DevicePointer; +enum class CudaGraphCompatibility { NONE, FULL, SPECIAL }; + class IOperationExec { public: using Inputs = gsl::span>; @@ -40,11 +42,17 @@ class IOperationExec { Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const = 0; + + virtual CudaGraphCompatibility GetCudaGraphCompatibility() const = 0; + 
virtual void Capture(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const = 0; - virtual bool IsCudaGraphCompatible() const = 0; + virtual void ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const = 0; virtual void InitSharedImmutableWorkbuffers(const Buffers&) = 0; virtual WorkbufferRequest GetWorkBufferRequest() const = 0; virtual const WorkbufferIds& GetWorkbufferIds() const = 0; @@ -79,7 +87,19 @@ class OperationBase : public IOperationExec, public IOperationMeta, public std:: IndexCollection&& inputIds, IndexCollection&& outputIds); - bool IsCudaGraphCompatible() const override { return false; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::NONE; } + + void Capture(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const override { + Execute(context, inputTensors, outputTensors, workbuffers); + } + // For operations with CudaGraphCompatibility::SPECIAL, e.g. TI; the vast majority or operations doesn't use this + void ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const override {} WorkbufferRequest GetWorkBufferRequest() const override { return {}; // Most operators do not need workbuffers @@ -107,12 +127,6 @@ class OperationBase : public IOperationExec, public IOperationMeta, public std:: workbuffer_ids_ = workbufferIds; return workbuffer_ids_.immutableIds.empty() ? WorkbufferStatus::NoInitNeeded : WorkbufferStatus::InitNeeded; } - void Capture(InferenceRequestContext& context, - Inputs inputTensors, - Outputs outputTensors, - const Workbuffers& workbuffers) const override { - Execute(context, inputTensors, outputTensors, workbuffers); - } protected: std::string node_name_; diff --git a/modules/nvidia_plugin/src/cuda_profiler.cpp b/modules/nvidia_plugin/src/cuda_profiler.cpp index be8a3a61c..4c8f96ce7 100644 --- a/modules/nvidia_plugin/src/cuda_profiler.cpp +++ b/modules/nvidia_plugin/src/cuda_profiler.cpp @@ -147,6 +147,18 @@ void Profiler::capture_sequence(const SubGraph* subGraphPtr, } } +void Profiler::execute_graph_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) { + for (const auto& op : create_exec_sequence(subGraphPtr)) { + const auto& inTensors = memoryManager.inputTensorPointers(*op, buffer); + const auto& outTensors = memoryManager.outputTensorPointers(*op, buffer); + const auto& workBuffers = memoryManager.workBuffers(*op, buffer); + op->execute_graph(context, inTensors, outTensors, workBuffers); + } +} + Profiler::ProfilerSequence Profiler::create_exec_sequence(const SubGraph* subGraphPtr) { OPENVINO_ASSERT(active_stream_); ++infer_count_; diff --git a/modules/nvidia_plugin/src/cuda_profiler.hpp b/modules/nvidia_plugin/src/cuda_profiler.hpp index cea8b53c7..b4e078e7b 100644 --- a/modules/nvidia_plugin/src/cuda_profiler.hpp +++ b/modules/nvidia_plugin/src/cuda_profiler.hpp @@ -70,6 +70,18 @@ class Profiler : public IExecutionDelegator { const Workbuffers::mutable_buffer& buffer, InferenceRequestContext& context) override; + /** + * Execute CUDA graph sequence from SubGraph class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to 
orkbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void execute_graph_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) override; + /** * Returns performance counters * @return Performance counters @@ -140,6 +152,13 @@ class Profiler::ProfileExecStep { timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); } + template + void execute_graph(TArgs&&... args) const { + timing_.setStart(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); + exec_step_.ExecuteGraph(std::forward(args)...); + timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); + } + /** * Adapter method for pointer of operation * @return Reference to ProfileExecStep diff --git a/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp b/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp index 97d174b76..1c7371476 100644 --- a/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp +++ b/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp @@ -76,6 +76,25 @@ class SimpleExecutionDelegator : public IExecutionDelegator { } }; + /** + * Call ExecuteGraph for all operations from SubGraph class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to orkbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void execute_graph_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) override { + for (auto& op : subGraphPtr->getExecSequence()) { + const auto& inputTensors = memoryManager.inputTensorPointers(*op, buffer); + const auto& outputTensors = memoryManager.outputTensorPointers(*op, buffer); + const auto& workBuffers = memoryManager.workBuffers(*op, buffer); + op->ExecuteGraph(context, inputTensors, outputTensors, workBuffers); + } + }; + /** * Dummy get_performance_counts implementation */ diff --git a/modules/nvidia_plugin/src/memory_manager/cuda_device_mem_block.hpp b/modules/nvidia_plugin/src/memory_manager/cuda_device_mem_block.hpp index f6f9986e2..f89a995d3 100644 --- a/modules/nvidia_plugin/src/memory_manager/cuda_device_mem_block.hpp +++ b/modules/nvidia_plugin/src/memory_manager/cuda_device_mem_block.hpp @@ -13,8 +13,6 @@ namespace ov { namespace nvidia_gpu { -class CudaGraphContext; - /** * @brief Allocates and owns continuous memory blob on CUDA device. 
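The setStart()/setStop() pair that execute_graph() above wraps around each step is built on CUDA events recorded into the active stream. A standalone sketch of that measurement idiom, assuming the plugin's Timing/Event helpers reduce to roughly this; ScopedGpuTimer is an illustrative name:

```cpp
#include <cuda_runtime.h>

// Records a start event on construction; finish_ms() records the stop event,
// waits for it and returns the elapsed GPU time in milliseconds.
struct ScopedGpuTimer {
    cudaEvent_t start{};
    cudaEvent_t stop{};
    cudaStream_t stream{};

    explicit ScopedGpuTimer(cudaStream_t s) : stream(s) {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, stream);  // timestamp before the measured work
    }

    float finish_ms() {
        cudaEventRecord(stop, stream);  // timestamp after the measured work
        cudaEventSynchronize(stop);     // wait so the elapsed time is valid
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        return ms;
    }

    ~ScopedGpuTimer() {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
};
```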
* Uses MemoryModel to determine a size of memory to allocate and diff --git a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp index 20a681cbf..8f752f01e 100644 --- a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp +++ b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp @@ -59,7 +59,9 @@ void ActivationForwardCuDnnOpBase::Execute(const InferenceRequestContext& contex outputTensors[0].get()); } -bool ActivationForwardCuDnnOpBase::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ActivationForwardCuDnnOpBase::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp index 05e5c5598..de065efa7 100644 --- a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp +++ b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp @@ -31,7 +31,7 @@ class ActivationForwardCuDnnOpBase : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; protected: std::unique_ptr op_desc_; diff --git a/modules/nvidia_plugin/src/ops/avgpool.cpp b/modules/nvidia_plugin/src/ops/avgpool.cpp index 6131ef5a5..858c50a06 100644 --- a/modules/nvidia_plugin/src/ops/avgpool.cpp +++ b/modules/nvidia_plugin/src/ops/avgpool.cpp @@ -30,7 +30,7 @@ void AvgPoolOp::Execute(const InferenceRequestContext& context, outputs[PoolingImpl::output_index].get()); } -bool AvgPoolOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility AvgPoolOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(AvgPoolOp, AvgPool); diff --git a/modules/nvidia_plugin/src/ops/avgpool.hpp b/modules/nvidia_plugin/src/ops/avgpool.hpp index e22e66fa0..184669f37 100644 --- a/modules/nvidia_plugin/src/ops/avgpool.hpp +++ b/modules/nvidia_plugin/src/ops/avgpool.hpp @@ -23,7 +23,7 @@ class AvgPoolOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: PoolingImpl impl_; diff --git a/modules/nvidia_plugin/src/ops/broadcast.cpp b/modules/nvidia_plugin/src/ops/broadcast.cpp index 5f97adf6c..3b6939a33 100644 --- a/modules/nvidia_plugin/src/ops/broadcast.cpp +++ b/modules/nvidia_plugin/src/ops/broadcast.cpp @@ -65,7 +65,7 @@ void BroadcastOp::Execute(const InferenceRequestContext& context, (*kernel_)(stream, inputs[0].get(), broadcast_params_->mapper(workbuffers.immutable_buffers), outputs[0].get()); } -bool BroadcastOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility BroadcastOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest BroadcastOp::GetWorkBufferRequest() const { return {immutable_buffer_sizes_, {}}; } diff --git a/modules/nvidia_plugin/src/ops/broadcast.hpp b/modules/nvidia_plugin/src/ops/broadcast.hpp index e59b1792c..cb89a06ec 100644 --- a/modules/nvidia_plugin/src/ops/broadcast.hpp +++ b/modules/nvidia_plugin/src/ops/broadcast.hpp @@ -27,7 +27,7 @@ class BroadcastOp : public OperationBase { WorkbufferRequest GetWorkBufferRequest() const override; void 
InitSharedImmutableWorkbuffers(const Buffers& buffers) override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::vector immutable_buffer_sizes_; diff --git a/modules/nvidia_plugin/src/ops/clamp_cuda.cpp b/modules/nvidia_plugin/src/ops/clamp_cuda.cpp index bb0b0a87a..9fa5d7e8c 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cuda.cpp +++ b/modules/nvidia_plugin/src/ops/clamp_cuda.cpp @@ -51,7 +51,7 @@ void ClampCudaOp::Execute(const InferenceRequestContext& context, (*kernel_)(context.getThreadContext().stream().get(), inputTensors[0].get(), outputTensors[0].get()); } -bool ClampCudaOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ClampCudaOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/clamp_cuda.hpp b/modules/nvidia_plugin/src/ops/clamp_cuda.hpp index 61ee4153e..78baa91cc 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cuda.hpp +++ b/modules/nvidia_plugin/src/ops/clamp_cuda.hpp @@ -26,7 +26,7 @@ class ClampCudaOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp b/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp index 854ce8a77..7188bce83 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp +++ b/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp @@ -97,7 +97,7 @@ void ClampCuDnnOp::Execute(const InferenceRequestContext& context, outputTensors[0].get()); } -bool ClampCuDnnOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ClampCuDnnOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void ClampCuDnnOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { switch (data_type_) { diff --git a/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp b/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp index 544ba081d..2a754af8f 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp +++ b/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp @@ -33,7 +33,7 @@ class ClampCuDnnOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/comparison.cpp b/modules/nvidia_plugin/src/ops/comparison.cpp index 3f9bf2ea0..e96f9b4f0 100644 --- a/modules/nvidia_plugin/src/ops/comparison.cpp +++ b/modules/nvidia_plugin/src/ops/comparison.cpp @@ -84,7 +84,7 @@ Comparison::Comparison(const CreationContext& context, threads_per_block}; } -bool Comparison::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility Comparison::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void Comparison::Execute(const InferenceRequestContext& context, Inputs inputs, diff --git a/modules/nvidia_plugin/src/ops/comparison.hpp b/modules/nvidia_plugin/src/ops/comparison.hpp index 64b73b0a3..c1de21a91 100644 --- a/modules/nvidia_plugin/src/ops/comparison.hpp +++ b/modules/nvidia_plugin/src/ops/comparison.hpp @@ -18,7 +18,7 @@ class Comparison : public OperationBase { 
IndexCollection&& outputIds, kernel::Comparison::Op_t operation_type); - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: void calculateOffsets(); diff --git a/modules/nvidia_plugin/src/ops/concat.cpp b/modules/nvidia_plugin/src/ops/concat.cpp index 8b1b1bd2a..f0276b0b4 100644 --- a/modules/nvidia_plugin/src/ops/concat.cpp +++ b/modules/nvidia_plugin/src/ops/concat.cpp @@ -95,7 +95,7 @@ void ConcatOp::Execute(const InferenceRequestContext& context, outputs[0].get()); } -bool ConcatOp::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility ConcatOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } OPERATION_REGISTER(ConcatOp, Concat); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/concat.hpp b/modules/nvidia_plugin/src/ops/concat.hpp index 223e9e337..566edb231 100644 --- a/modules/nvidia_plugin/src/ops/concat.hpp +++ b/modules/nvidia_plugin/src/ops/concat.hpp @@ -28,7 +28,7 @@ class ConcatOp : public OperationBase { WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers&) override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: size_t immutableWbSize() const { return concat_kernel_.value().immutableWbSize(); } diff --git a/modules/nvidia_plugin/src/ops/convert.cpp b/modules/nvidia_plugin/src/ops/convert.cpp index c27d141a5..502ac9b11 100644 --- a/modules/nvidia_plugin/src/ops/convert.cpp +++ b/modules/nvidia_plugin/src/ops/convert.cpp @@ -55,7 +55,7 @@ void ConvertOp::Execute(const InferenceRequestContext& context, (*convert_kernel_)(stream.get(), outputs[0].get(), inputs[0].get()); } -bool ConvertOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ConvertOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(ConvertOp, Convert); diff --git a/modules/nvidia_plugin/src/ops/convert.hpp b/modules/nvidia_plugin/src/ops/convert.hpp index 471a27351..c1ac63f14 100644 --- a/modules/nvidia_plugin/src/ops/convert.hpp +++ b/modules/nvidia_plugin/src/ops/convert.hpp @@ -24,7 +24,7 @@ class ConvertOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; using Type_t = ov::element::Type_t; using convert_t = void (*)( diff --git a/modules/nvidia_plugin/src/ops/convert_color_i420.hpp b/modules/nvidia_plugin/src/ops/convert_color_i420.hpp index 3f5386fc6..2cefeeff9 100644 --- a/modules/nvidia_plugin/src/ops/convert_color_i420.hpp +++ b/modules/nvidia_plugin/src/ops/convert_color_i420.hpp @@ -91,7 +91,7 @@ class I420ConvertColorBase : public OperationBase { } } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp b/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp index 5a160ad71..8a34bb405 100644 --- a/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp +++ b/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp @@ -90,7 +90,7 @@ class NV12ConvertColorBase : public OperationBase { } } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const 
override { return CudaGraphCompatibility::FULL; } private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp b/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp index 59d0aa762..f23ff2ed9 100644 --- a/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp +++ b/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp @@ -43,8 +43,8 @@ void ConvBackpropDataOp::Execute(const InferenceRequestContext& context, } template -bool ConvBackpropDataOp::IsCudaGraphCompatible() const { - return true; +CudaGraphCompatibility ConvBackpropDataOp::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(ConvolutionBackpropDataOp, ConvolutionBackpropData); diff --git a/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp b/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp index 213195d75..e761825bf 100644 --- a/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp +++ b/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp @@ -32,7 +32,7 @@ class ConvBackpropDataOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp b/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp index cee0b9131..b51bc98d9 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp @@ -43,7 +43,7 @@ void ConvolutionCuDnn::Execute(const InferenceRequestContext& context, throwIfError(status); } -bool ConvolutionCuDnn::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ConvolutionCuDnn::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest ConvolutionCuDnn::GetWorkBufferRequest() const { if (descs_.Algo().memory != 0) diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp b/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp index abe858d98..3e0f63519 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp @@ -29,7 +29,7 @@ class ConvolutionCuDnn : public OperationCuDnn { WorkbufferRequest GetWorkBufferRequest() const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: Convolution::Details::ConvolutionDescriptorsCuDnn descs_; diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp index 2d4743bd2..4c16c8ff7 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp @@ -147,7 +147,7 @@ void ConvolutionCuDnnBE::Execute(const InferenceRequestContext& context, throwIfError(::cudnnBackendExecute(context.getThreadContext().dnnHandle().get(), plan->get(), variantPack->get())); } -bool ConvolutionCuDnnBE::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility ConvolutionCuDnnBE::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } std::shared_ptr ConvolutionCuDnnBE::MakeTensorDescriptor(int64_t id, cudnnDataType_t element_type, diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp 
b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp index 1a68d8560..ac348e28f 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp @@ -33,7 +33,7 @@ class ConvolutionCuDnnBE : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp index d40e301e5..0903adae1 100644 --- a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp +++ b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp @@ -166,7 +166,7 @@ void CuDnnTensorOpBase::Execute(const InferenceRequestContext& context, outputTensors[0].get()); } -bool CuDnnTensorOpBase::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility CuDnnTensorOpBase::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } CuDnnTensorOpBase::IoParams::IoParams(const ov::Node& node, const Type& io_type, int index) : type_(convertDataType(io_type == Type::INPUT ? node.get_input_element_type(index) diff --git a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp index 0dce5eaf5..f9fc14181 100644 --- a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp +++ b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp @@ -24,7 +24,7 @@ class CuDnnTensorOpBase : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: struct IoParams { diff --git a/modules/nvidia_plugin/src/ops/detection_output.cpp b/modules/nvidia_plugin/src/ops/detection_output.cpp index 20fff51cd..3418b552a 100644 --- a/modules/nvidia_plugin/src/ops/detection_output.cpp +++ b/modules/nvidia_plugin/src/ops/detection_output.cpp @@ -107,7 +107,7 @@ void DetectionOutputOp::Execute(const InferenceRequestContext& context, } } -bool DetectionOutputOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility DetectionOutputOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void DetectionOutputOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { kernel_.value().initSharedImmutableWorkbuffers(buffers); diff --git a/modules/nvidia_plugin/src/ops/detection_output.hpp b/modules/nvidia_plugin/src/ops/detection_output.hpp index 5a29d95e2..f2e65303f 100644 --- a/modules/nvidia_plugin/src/ops/detection_output.hpp +++ b/modules/nvidia_plugin/src/ops/detection_output.hpp @@ -25,7 +25,7 @@ class DetectionOutputOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/elementwise_binary.hpp b/modules/nvidia_plugin/src/ops/elementwise_binary.hpp index 3ada107a8..e6ed7454e 100644 --- a/modules/nvidia_plugin/src/ops/elementwise_binary.hpp +++ b/modules/nvidia_plugin/src/ops/elementwise_binary.hpp @@ -59,7 +59,7 @@ class ElementwiseBinaryOp : public OperationBase { 
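The mix visible above is why a plain bool no longer suffices: most kernels report FULL, ConcatOp and ConvolutionCuDnnBE report NONE, and later in this patch subgraph-style ops report SPECIAL because they manage their own nested CUDA graphs. The sketch below shows one way a runner could split an execution sequence at NONE ops so that only the capturable runs become graphs; it is an illustration under those assumptions, not the plugin's actual topology-runner logic.

// Sketch: partition a sequence of ops into segments at graph-incompatible ops.
// A segment whose single op reports NONE runs eagerly; every other segment is a
// candidate for CUDA graph capture (SPECIAL ops bringing their own nested graphs).
#include <memory>
#include <vector>

enum class CudaGraphCompatibility { NONE, FULL, SPECIAL };  // simplified stand-in

struct Op {
    virtual ~Op() = default;
    virtual CudaGraphCompatibility GetCudaGraphCompatibility() const = 0;
};

using Segment = std::vector<std::shared_ptr<Op>>;

std::vector<Segment> splitByCompatibility(const Segment& sequence) {
    std::vector<Segment> segments(1);
    for (const auto& op : sequence) {
        if (op->GetCudaGraphCompatibility() == CudaGraphCompatibility::NONE) {
            if (!segments.back().empty()) segments.emplace_back();
            segments.back().push_back(op);  // the incompatible op forms its own eager segment
            segments.emplace_back();        // open a fresh segment for the next capturable run
        } else {
            segments.back().push_back(op);
        }
    }
    if (segments.back().empty()) segments.pop_back();
    return segments;
}
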
static_cast(outputTensors[0].get())); } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) override { in0_broadcast_params_->initWorkbuffers(buffers); diff --git a/modules/nvidia_plugin/src/ops/elementwise_unary.hpp b/modules/nvidia_plugin/src/ops/elementwise_unary.hpp index bbdac1cfb..0f99c41ea 100644 --- a/modules/nvidia_plugin/src/ops/elementwise_unary.hpp +++ b/modules/nvidia_plugin/src/ops/elementwise_unary.hpp @@ -46,7 +46,7 @@ class ElementwiseUnaryOp : public OperationBase { (*kernel_)(stream.get(), inputTensors[0].get(), outputTensors[0].get()); } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/elu.cpp b/modules/nvidia_plugin/src/ops/elu.cpp index a747c3eb0..0d87bd6cf 100644 --- a/modules/nvidia_plugin/src/ops/elu.cpp +++ b/modules/nvidia_plugin/src/ops/elu.cpp @@ -45,7 +45,7 @@ void EluOp::Execute(const InferenceRequestContext& context, (*kernel_)(stream.get(), inputTensors[0].get(), outputTensors[0].get()); } -bool EluOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility EluOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(EluOp, Elu); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/elu.hpp b/modules/nvidia_plugin/src/ops/elu.hpp index 57e9f1ea2..16ddaac4c 100644 --- a/modules/nvidia_plugin/src/ops/elu.hpp +++ b/modules/nvidia_plugin/src/ops/elu.hpp @@ -23,7 +23,7 @@ class EluOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/fake_quantize.cpp b/modules/nvidia_plugin/src/ops/fake_quantize.cpp index aa1a8bee9..2752fd083 100644 --- a/modules/nvidia_plugin/src/ops/fake_quantize.cpp +++ b/modules/nvidia_plugin/src/ops/fake_quantize.cpp @@ -45,7 +45,7 @@ FakeQuantizeOp::FakeQuantizeOp(const CreationContext &context, convertDataType(element_type), output_size, max_threads_per_block, levels}; } -bool FakeQuantizeOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FakeQuantizeOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void FakeQuantizeOp::Execute(const InferenceRequestContext &context, Inputs inputTensors, diff --git a/modules/nvidia_plugin/src/ops/fake_quantize.hpp b/modules/nvidia_plugin/src/ops/fake_quantize.hpp index a054f4520..9de32fc45 100644 --- a/modules/nvidia_plugin/src/ops/fake_quantize.hpp +++ b/modules/nvidia_plugin/src/ops/fake_quantize.hpp @@ -20,7 +20,7 @@ class FakeQuantizeOp : public OperationBase { IndexCollection&& inputIds, IndexCollection&& outputIds); - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: void Execute(const InferenceRequestContext& context, diff --git a/modules/nvidia_plugin/src/ops/fully_connected.cpp b/modules/nvidia_plugin/src/ops/fully_connected.cpp index d8fc6d9d1..60adc1fae 100644 --- a/modules/nvidia_plugin/src/ops/fully_connected.cpp +++ b/modules/nvidia_plugin/src/ops/fully_connected.cpp @@ -54,7 
+54,7 @@ void FullyConnectedOp::Execute(const InferenceRequestContext& context, matmul_op_.Execute(context, inputs.first(inputs.size() - 1), outputs, workbuffers); } -bool FullyConnectedOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FullyConnectedOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(FullyConnectedOp, FullyConnected); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/fully_connected.hpp b/modules/nvidia_plugin/src/ops/fully_connected.hpp index c60e7d6ad..72e249e67 100644 --- a/modules/nvidia_plugin/src/ops/fully_connected.hpp +++ b/modules/nvidia_plugin/src/ops/fully_connected.hpp @@ -26,7 +26,7 @@ class FullyConnectedOp : public OperationCuBlas { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: MatMulOp matmul_op_; diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp index be671ac98..d50d1f499 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp @@ -77,7 +77,9 @@ void FusedConvolutionBackpropDataOp::Execute(const InferenceRequestContext& cont outputs[ArgIndices3Ins::dinput].get())); } -bool FusedConvolutionBackpropDataOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FusedConvolutionBackpropDataOp::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} void FusedConvolutionBackpropDataOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp index 8edfd8035..97625cf3d 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp @@ -26,7 +26,7 @@ class FusedConvolutionBackpropDataOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp index f103ece4e..0c460ed4c 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp @@ -95,7 +95,7 @@ void FusedConvolutionCuDnn::Execute(const InferenceRequestContext& context, outputs[ArgIndices::output].get())); } -bool FusedConvolutionCuDnn::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FusedConvolutionCuDnn::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest FusedConvolutionCuDnn::GetWorkBufferRequest() const { if (conv_descs_->Algo().memory != 0) diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp index ddf7c5d59..b9fc013da 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp +++ 
b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp @@ -35,7 +35,7 @@ class FusedConvolutionCuDnn : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override {} WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp index c918434d9..175346b81 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp @@ -326,7 +326,9 @@ void FusedConvolutionCuDnnBE::Execute(const InferenceRequestContext& context, throwIfError(::cudnnBackendExecute(context.getThreadContext().dnnHandle().get(), plan->get(), variantPack->get())); } -bool FusedConvolutionCuDnnBE::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility FusedConvolutionCuDnnBE::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::NONE; +} std::shared_ptr FusedConvolutionCuDnnBE::MakeTensorDescriptor( int64_t id, diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp index e2ddb74ab..910887819 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp @@ -34,7 +34,7 @@ class FusedConvolutionCuDnnBE : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; private: diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp index 9318a9d93..ae7fed3d7 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp @@ -84,7 +84,9 @@ void FusedConvolutionCuDnnDecomposed::Execute(const InferenceRequestContext& con } } -bool FusedConvolutionCuDnnDecomposed::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FusedConvolutionCuDnnDecomposed::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} WorkbufferRequest FusedConvolutionCuDnnDecomposed::GetWorkBufferRequest() const { if (conv_descs_->Algo().memory != 0) { diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp index e92be760a..b426008ed 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp @@ -37,7 +37,7 @@ class FusedConvolutionCuDnnDecomposed : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override {} WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/gather.cpp b/modules/nvidia_plugin/src/ops/gather.cpp index f666cc35e..b954135d3 100644 --- 
a/modules/nvidia_plugin/src/ops/gather.cpp +++ b/modules/nvidia_plugin/src/ops/gather.cpp @@ -178,7 +178,7 @@ void GatherOp::Execute(const InferenceRequestContext& context, (*gather_kernel_)(context.getThreadContext().stream().get(), inputs[0].get(), inputs[1].get(), outputs[0].get()); } -bool GatherOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility GatherOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(GatherOp, Gather); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/gather.hpp b/modules/nvidia_plugin/src/ops/gather.hpp index 6adb48be8..9753a8fc2 100644 --- a/modules/nvidia_plugin/src/ops/gather.hpp +++ b/modules/nvidia_plugin/src/ops/gather.hpp @@ -22,7 +22,7 @@ class GatherOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional gather_kernel_; diff --git a/modules/nvidia_plugin/src/ops/group_convolution.cpp b/modules/nvidia_plugin/src/ops/group_convolution.cpp index b381bb47f..280fa7337 100644 --- a/modules/nvidia_plugin/src/ops/group_convolution.cpp +++ b/modules/nvidia_plugin/src/ops/group_convolution.cpp @@ -25,7 +25,7 @@ void GroupConvolutionOp::Execute(const InferenceRequestContext &context, convolution_.Execute(context, inputTensors, outputTensors, buffers); } -bool GroupConvolutionOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility GroupConvolutionOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest GroupConvolutionOp::GetWorkBufferRequest() const { return convolution_.GetWorkBufferRequest(); } diff --git a/modules/nvidia_plugin/src/ops/group_convolution.hpp b/modules/nvidia_plugin/src/ops/group_convolution.hpp index f44ac6936..5b1bb3d1a 100644 --- a/modules/nvidia_plugin/src/ops/group_convolution.hpp +++ b/modules/nvidia_plugin/src/ops/group_convolution.hpp @@ -27,7 +27,7 @@ class GroupConvolutionOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override final; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override final; private: diff --git a/modules/nvidia_plugin/src/ops/gru_cell.cpp b/modules/nvidia_plugin/src/ops/gru_cell.cpp index b8bf2736d..43d0b0f47 100644 --- a/modules/nvidia_plugin/src/ops/gru_cell.cpp +++ b/modules/nvidia_plugin/src/ops/gru_cell.cpp @@ -61,7 +61,7 @@ void GRUCellOp::Execute(const InferenceRequestContext& context, nullptr); } -bool GRUCellOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility GRUCellOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void GRUCellOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1 || buffers.size() == 2, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/gru_cell.hpp b/modules/nvidia_plugin/src/ops/gru_cell.hpp index 6e00cd5b4..2fcebcb0d 100644 --- a/modules/nvidia_plugin/src/ops/gru_cell.hpp +++ b/modules/nvidia_plugin/src/ops/gru_cell.hpp @@ -27,7 +27,7 @@ class GRUCellOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const 
IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/gru_sequence.cpp b/modules/nvidia_plugin/src/ops/gru_sequence.cpp index 7317aef73..84a6893b4 100644 --- a/modules/nvidia_plugin/src/ops/gru_sequence.cpp +++ b/modules/nvidia_plugin/src/ops/gru_sequence.cpp @@ -19,7 +19,9 @@ GRUSequenceOp::GRUSequenceOp(const CreationContext& context, : OperationCuDnn(context, node, std::move(inputIds), std::move(outputIds)), params_{node}, descs_{context, params_, config()}, - is_cuda_graph_compatible_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device())} { + graph_compatibility_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device()) + ? CudaGraphCompatibility::FULL + : CudaGraphCompatibility::NONE} { ib_seq_lengths_.addRequest(immut_sizes_, descs_.seqLengthArraySizeBytes()); ib_weight_space_.addRequest(immut_sizes_, descs_.weightSpaceSize()); @@ -71,7 +73,7 @@ void GRUSequenceOp::Execute(const InferenceRequestContext& context, nullptr); } -bool GRUSequenceOp::IsCudaGraphCompatible() const { return is_cuda_graph_compatible_; } +CudaGraphCompatibility GRUSequenceOp::GetCudaGraphCompatibility() const { return graph_compatibility_; } void GRUSequenceOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { descs_.initDevSeqLengthArray(CUDA::DevicePointer{ib_seq_lengths_.requiredPtr(buffers)}); diff --git a/modules/nvidia_plugin/src/ops/gru_sequence.hpp b/modules/nvidia_plugin/src/ops/gru_sequence.hpp index 6b2335901..00153a193 100644 --- a/modules/nvidia_plugin/src/ops/gru_sequence.hpp +++ b/modules/nvidia_plugin/src/ops/gru_sequence.hpp @@ -32,7 +32,7 @@ class GRUSequenceOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: static Config config(); @@ -50,7 +50,7 @@ class GRUSequenceOp : public OperationCuDnn { WorkbufferDesc ib_weight_space_; WorkbufferDesc mb_work_space_; - bool is_cuda_graph_compatible_; + CudaGraphCompatibility graph_compatibility_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp b/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp index 0fc3cfe91..a89764cee 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp +++ b/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp @@ -68,7 +68,7 @@ void InterpolateCubicOp::Execute(const InferenceRequestContext& context, (*interpolate_)(context.getThreadContext().stream().get(), inputs[0].get(), outputs[0].get()); } -bool InterpolateCubicOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility InterpolateCubicOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest InterpolateCubicOp::GetWorkBufferRequest() const { return {interpolate_->immutableWorkbufferSizes(), {}}; diff --git a/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp b/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp index 32e06cc00..ee5348657 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp +++ b/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp @@ -25,7 +25,7 @@ class InterpolateCubicOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void 
InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/interpolate_linear.cpp b/modules/nvidia_plugin/src/ops/interpolate_linear.cpp index b19d1228f..2c70981ed 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_linear.cpp +++ b/modules/nvidia_plugin/src/ops/interpolate_linear.cpp @@ -69,7 +69,7 @@ void InterpolateLinearOp::Execute(const InferenceRequestContext& context, (*interpolate_)(context.getThreadContext().stream().get(), inputs[0].get(), outputs[0].get()); } -bool InterpolateLinearOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility InterpolateLinearOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest InterpolateLinearOp::GetWorkBufferRequest() const { return {interpolate_->immutableWorkbufferSizes(), {}}; diff --git a/modules/nvidia_plugin/src/ops/interpolate_linear.hpp b/modules/nvidia_plugin/src/ops/interpolate_linear.hpp index eb5e6539a..22b911040 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_linear.hpp +++ b/modules/nvidia_plugin/src/ops/interpolate_linear.hpp @@ -25,7 +25,7 @@ class InterpolateLinearOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp b/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp index 89caf4b47..6367e38fc 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp +++ b/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp @@ -158,7 +158,7 @@ void InterpolateNearestOp::Execute(const InferenceRequestContext& context, dst); } -bool InterpolateNearestOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility InterpolateNearestOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } template static auto size_in_bytes(const std::vector& v) noexcept { diff --git a/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp b/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp index 311b09177..487ef3829 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp +++ b/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp @@ -26,7 +26,7 @@ class InterpolateNearestOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/logical_not.cpp b/modules/nvidia_plugin/src/ops/logical_not.cpp index 8c1e7e44d..6c75dd1a6 100644 --- a/modules/nvidia_plugin/src/ops/logical_not.cpp +++ b/modules/nvidia_plugin/src/ops/logical_not.cpp @@ -36,7 +36,7 @@ void LogicalNotOp::Execute(const InferenceRequestContext& context, throwIfError(cudaPeekAtLastError()); } -bool LogicalNotOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility LogicalNotOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(LogicalNotOp, LogicalNot); diff --git a/modules/nvidia_plugin/src/ops/logical_not.hpp b/modules/nvidia_plugin/src/ops/logical_not.hpp index 8c95dc415..681737c7d 100644 --- 
a/modules/nvidia_plugin/src/ops/logical_not.hpp +++ b/modules/nvidia_plugin/src/ops/logical_not.hpp @@ -21,7 +21,7 @@ class LogicalNotOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: kernel::LogicalNot kernel_; diff --git a/modules/nvidia_plugin/src/ops/lstm_cell.cpp b/modules/nvidia_plugin/src/ops/lstm_cell.cpp index b53232793..5d7f45cb4 100644 --- a/modules/nvidia_plugin/src/ops/lstm_cell.cpp +++ b/modules/nvidia_plugin/src/ops/lstm_cell.cpp @@ -57,7 +57,7 @@ void LSTMCellOp::Execute(const InferenceRequestContext& context, nullptr); } -bool LSTMCellOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility LSTMCellOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void LSTMCellOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1 || buffers.size() == 2, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/lstm_cell.hpp b/modules/nvidia_plugin/src/ops/lstm_cell.hpp index b36a4b36b..348b92116 100644 --- a/modules/nvidia_plugin/src/ops/lstm_cell.hpp +++ b/modules/nvidia_plugin/src/ops/lstm_cell.hpp @@ -27,7 +27,7 @@ class LSTMCellOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp b/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp index 5873378cc..ab5e35a5d 100644 --- a/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp +++ b/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp @@ -20,7 +20,9 @@ LSTMSequenceOpBase::LSTMSequenceOpBase(const CreationContext& context, : OperationCuDnn(context, node, std::move(inputIds), std::move(outputIds)), params_{params}, descs_{context, params_, config}, - is_cuda_graph_compatible_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device())} { + graph_compatibility_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device()) + ? 
CudaGraphCompatibility::FULL + : CudaGraphCompatibility::NONE} { ib_seq_lengths_.addRequest(immut_sizes_, descs_.seqLengthArraySizeBytes()); ib_weight_space_.addRequest(immut_sizes_, descs_.weightSpaceSize()); @@ -76,7 +78,7 @@ void LSTMSequenceOpBase::Execute(const InferenceRequestContext& context, if (cy_adapter) cy_adapter->execute(context, mb, outputs[ArgIndices::cell_output]); } -bool LSTMSequenceOpBase::IsCudaGraphCompatible() const { return is_cuda_graph_compatible_; } +CudaGraphCompatibility LSTMSequenceOpBase::GetCudaGraphCompatibility() const { return graph_compatibility_; } void LSTMSequenceOpBase::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { descs_.initDevSeqLengthArray(CUDA::DevicePointer{ib_seq_lengths_.requiredPtr(buffers)}); diff --git a/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp b/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp index 6046c1753..6a459cf45 100644 --- a/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp +++ b/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp @@ -30,7 +30,7 @@ class LSTMSequenceOpBase : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; @@ -59,7 +59,7 @@ class LSTMSequenceOpBase : public OperationCuDnn { OutputTensorAdapterPtr cy_adapter; private: - bool is_cuda_graph_compatible_; + CudaGraphCompatibility graph_compatibility_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/matmul.cpp b/modules/nvidia_plugin/src/ops/matmul.cpp index 85f97adad..52de989f1 100644 --- a/modules/nvidia_plugin/src/ops/matmul.cpp +++ b/modules/nvidia_plugin/src/ops/matmul.cpp @@ -226,7 +226,7 @@ void MatMulOp::Execute(const InferenceRequestContext& context, CUBLAS_GEMM_DEFAULT)); } -bool MatMulOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility MatMulOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(MatMulOp, MatMul); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/matmul.hpp b/modules/nvidia_plugin/src/ops/matmul.hpp index 10dbe2884..d30b46bfa 100644 --- a/modules/nvidia_plugin/src/ops/matmul.hpp +++ b/modules/nvidia_plugin/src/ops/matmul.hpp @@ -29,7 +29,7 @@ class MatMulOp : public OperationCuBlas { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; int GetBatchCount() const { return batch_count_; } diff --git a/modules/nvidia_plugin/src/ops/maxpool.cpp b/modules/nvidia_plugin/src/ops/maxpool.cpp index f6a696112..1b9a5c58d 100644 --- a/modules/nvidia_plugin/src/ops/maxpool.cpp +++ b/modules/nvidia_plugin/src/ops/maxpool.cpp @@ -30,7 +30,7 @@ void MaxPoolOp::Execute(const InferenceRequestContext& context, outputs[PoolingImpl::output_index].get()); } -bool MaxPoolOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility MaxPoolOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(MaxPoolOp, MaxPool); diff --git a/modules/nvidia_plugin/src/ops/maxpool.hpp b/modules/nvidia_plugin/src/ops/maxpool.hpp index a43554c42..b12e39525 100644 --- a/modules/nvidia_plugin/src/ops/maxpool.hpp +++ b/modules/nvidia_plugin/src/ops/maxpool.hpp @@ -23,7 
+23,7 @@ class MaxPoolOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: PoolingImpl impl_; diff --git a/modules/nvidia_plugin/src/ops/mvn.cpp b/modules/nvidia_plugin/src/ops/mvn.cpp index 87c5d271c..8e61213bd 100644 --- a/modules/nvidia_plugin/src/ops/mvn.cpp +++ b/modules/nvidia_plugin/src/ops/mvn.cpp @@ -99,7 +99,7 @@ void MvnOp::Execute(const InferenceRequestContext& context, {tensor_desc_, outputTensors[0]}); } -bool MvnOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility MvnOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void MvnOp::Context::reduceMean(ConstTensor input, Tensor output) { context.getThreadContext().dnnHandle().reduceTensor(op.reduce_mean_desc_, diff --git a/modules/nvidia_plugin/src/ops/mvn.hpp b/modules/nvidia_plugin/src/ops/mvn.hpp index bb2c3e228..9420a42ea 100644 --- a/modules/nvidia_plugin/src/ops/mvn.hpp +++ b/modules/nvidia_plugin/src/ops/mvn.hpp @@ -25,7 +25,7 @@ class MvnOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; private: diff --git a/modules/nvidia_plugin/src/ops/nop_op.hpp b/modules/nvidia_plugin/src/ops/nop_op.hpp index dcb555fb3..a734d7351 100644 --- a/modules/nvidia_plugin/src/ops/nop_op.hpp +++ b/modules/nvidia_plugin/src/ops/nop_op.hpp @@ -39,7 +39,7 @@ class NopOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override {} - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/pad.cpp b/modules/nvidia_plugin/src/ops/pad.cpp index 6bf760670..da33feaca 100644 --- a/modules/nvidia_plugin/src/ops/pad.cpp +++ b/modules/nvidia_plugin/src/ops/pad.cpp @@ -58,7 +58,7 @@ void PadOp::Execute(const InferenceRequestContext& context, inputTensors[InputIndex::kPadValue].get()); } -bool PadOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility PadOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest PadOp::GetWorkBufferRequest() const { auto rank = src_shape_.size(); diff --git a/modules/nvidia_plugin/src/ops/pad.hpp b/modules/nvidia_plugin/src/ops/pad.hpp index c819118c2..846a65feb 100644 --- a/modules/nvidia_plugin/src/ops/pad.hpp +++ b/modules/nvidia_plugin/src/ops/pad.hpp @@ -23,7 +23,7 @@ class PadOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers&) override; diff --git a/modules/nvidia_plugin/src/ops/parameter.cpp b/modules/nvidia_plugin/src/ops/parameter.cpp index 54c5dbe14..233c8dec0 100644 --- a/modules/nvidia_plugin/src/ops/parameter.cpp +++ b/modules/nvidia_plugin/src/ops/parameter.cpp @@ -32,7 +32,7 @@ void ParameterOp::Execute(const InferenceRequestContext& context, context.getThreadContext().stream().upload(outputs[0], tensor->data(), 
tensor->get_byte_size()); } -bool ParameterOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ParameterOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } std::string ParameterOp::GetInputTensorName(const ov::Node& node) { return node.get_friendly_name(); } diff --git a/modules/nvidia_plugin/src/ops/parameter.hpp b/modules/nvidia_plugin/src/ops/parameter.hpp index 4cbbe40dc..decd83aff 100644 --- a/modules/nvidia_plugin/src/ops/parameter.hpp +++ b/modules/nvidia_plugin/src/ops/parameter.hpp @@ -27,7 +27,7 @@ class ParameterOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; static std::string GetInputTensorName(const ov::Node& node); private: diff --git a/modules/nvidia_plugin/src/ops/range.cpp b/modules/nvidia_plugin/src/ops/range.cpp index f8df8a09a..df08e04d4 100644 --- a/modules/nvidia_plugin/src/ops/range.cpp +++ b/modules/nvidia_plugin/src/ops/range.cpp @@ -64,7 +64,7 @@ void RangeOp::Execute(const InferenceRequestContext& context, outputs[OUTPUT_INDX].get()); } -bool RangeOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility RangeOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(RangeOp, Range); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/range.hpp b/modules/nvidia_plugin/src/ops/range.hpp index 89da68a6a..67a90f15c 100644 --- a/modules/nvidia_plugin/src/ops/range.hpp +++ b/modules/nvidia_plugin/src/ops/range.hpp @@ -26,7 +26,7 @@ class RangeOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: size_t output_size_; diff --git a/modules/nvidia_plugin/src/ops/reduce.cpp b/modules/nvidia_plugin/src/ops/reduce.cpp index d57d75f17..36b7df0ff 100644 --- a/modules/nvidia_plugin/src/ops/reduce.cpp +++ b/modules/nvidia_plugin/src/ops/reduce.cpp @@ -58,7 +58,7 @@ void ReduceOp::Execute(const InferenceRequestContext& context, outputTensors[0]); } -bool ReduceOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ReduceOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/reduce.hpp b/modules/nvidia_plugin/src/ops/reduce.hpp index 35d9b4822..275eefbd3 100644 --- a/modules/nvidia_plugin/src/ops/reduce.hpp +++ b/modules/nvidia_plugin/src/ops/reduce.hpp @@ -22,7 +22,7 @@ class ReduceOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; static cudnnDataType_t reduceCompType(const ov::Node& node); diff --git a/modules/nvidia_plugin/src/ops/result.cpp b/modules/nvidia_plugin/src/ops/result.cpp index 2b867804b..65eb7faa4 100644 --- a/modules/nvidia_plugin/src/ops/result.cpp +++ b/modules/nvidia_plugin/src/ops/result.cpp @@ -42,7 +42,7 @@ void ResultOp::Execute(const InferenceRequestContext& context, context.getThreadContext().stream().download(tensor->data(), inputs[0], tensor->get_byte_size()); } -bool ResultOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility 
ResultOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } std::optional ResultOp::GetOutputTensorSubIndex(const ov::Output& node) { const auto& opRegistry = OperationRegistry::getInstance(); diff --git a/modules/nvidia_plugin/src/ops/result.hpp b/modules/nvidia_plugin/src/ops/result.hpp index 7e66794a2..275074583 100644 --- a/modules/nvidia_plugin/src/ops/result.hpp +++ b/modules/nvidia_plugin/src/ops/result.hpp @@ -29,7 +29,7 @@ class ResultOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; static std::vector GetOutputTensorName(const ov::op::v0::Result& node); diff --git a/modules/nvidia_plugin/src/ops/round.cpp b/modules/nvidia_plugin/src/ops/round.cpp index 4edd12e55..9fa98f216 100644 --- a/modules/nvidia_plugin/src/ops/round.cpp +++ b/modules/nvidia_plugin/src/ops/round.cpp @@ -48,7 +48,7 @@ void RoundOp::Execute(const InferenceRequestContext& context, (*kernel_)(context.getThreadContext().stream().get(), inputTensors[0].get(), outputTensors[0].get()); } -bool RoundOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility RoundOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(RoundOp, Round); diff --git a/modules/nvidia_plugin/src/ops/round.hpp b/modules/nvidia_plugin/src/ops/round.hpp index 86762f3cc..baa62db95 100644 --- a/modules/nvidia_plugin/src/ops/round.hpp +++ b/modules/nvidia_plugin/src/ops/round.hpp @@ -25,7 +25,7 @@ class RoundOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp b/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp index bd5b72eb4..bb00b459a 100644 --- a/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp +++ b/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp @@ -100,7 +100,7 @@ void ScatterNDUpdateOp::Execute(const InferenceRequestContext& context, outputs[0].get()); } -bool ScatterNDUpdateOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ScatterNDUpdateOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } template static auto size_in_bytes(const std::vector& v) noexcept { diff --git a/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp b/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp index 394413f0f..778a16c55 100644 --- a/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp +++ b/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp @@ -22,7 +22,7 @@ class ScatterNDUpdateOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/select.cpp b/modules/nvidia_plugin/src/ops/select.cpp index c8dc04f01..708b57c74 100644 --- a/modules/nvidia_plugin/src/ops/select.cpp +++ b/modules/nvidia_plugin/src/ops/select.cpp @@ -90,7 +90,7 @@ void SelectOp::Execute(const InferenceRequestContext& context, outputs[0].get()); } -bool SelectOp::IsCudaGraphCompatible() const { return true; } 
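Not every op can decide its compatibility statically: the GRU and LSTM sequence hunks above replace the is_cuda_graph_compatible_ bool with a graph_compatibility_ member, mapping an existing device-capability query onto the enum in the constructor's initializer list. Reduced to its essentials, and with the query result passed in as a plain bool rather than the RNN::Details helper, that mapping looks like the sketch below.

// Sketch of the bool-to-enum mapping used by the RNN sequence constructors in this patch.
enum class CudaGraphCompatibility { NONE, FULL, SPECIAL };  // simplified stand-in

CudaGraphCompatibility rnnSequenceCompatibility(bool device_supports_graph_capture) {
    // Device-dependent ops cache this value once at construction time instead of a bool flag.
    return device_supports_graph_capture ? CudaGraphCompatibility::FULL
                                         : CudaGraphCompatibility::NONE;
}
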
+CudaGraphCompatibility SelectOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest SelectOp::GetWorkBufferRequest() const { return {std::vector(SIZES + 1, kOffsetBufferSize), {}}; diff --git a/modules/nvidia_plugin/src/ops/select.hpp b/modules/nvidia_plugin/src/ops/select.hpp index 61287a269..8eb0e800e 100644 --- a/modules/nvidia_plugin/src/ops/select.hpp +++ b/modules/nvidia_plugin/src/ops/select.hpp @@ -26,7 +26,7 @@ class SelectOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/softmax.cpp b/modules/nvidia_plugin/src/ops/softmax.cpp index e3ecedaa2..0019cd7db 100644 --- a/modules/nvidia_plugin/src/ops/softmax.cpp +++ b/modules/nvidia_plugin/src/ops/softmax.cpp @@ -192,7 +192,7 @@ void SoftmaxOp::Execute(const InferenceRequestContext& context, outputs[0].get())); } -bool SoftmaxOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility SoftmaxOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(SoftmaxOp, Softmax); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/softmax.hpp b/modules/nvidia_plugin/src/ops/softmax.hpp index 608e1e657..abafaed63 100644 --- a/modules/nvidia_plugin/src/ops/softmax.hpp +++ b/modules/nvidia_plugin/src/ops/softmax.hpp @@ -27,7 +27,7 @@ class SoftmaxOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: void mapRankAxis(const ov::Shape& shape, int axis); diff --git a/modules/nvidia_plugin/src/ops/split.cpp b/modules/nvidia_plugin/src/ops/split.cpp index b53aeadce..b5f66cbad 100644 --- a/modules/nvidia_plugin/src/ops/split.cpp +++ b/modules/nvidia_plugin/src/ops/split.cpp @@ -89,7 +89,7 @@ void SplitOp::Execute(const InferenceRequestContext& context, (*split_kernel_)(stream.get(), reinterpret_cast(in.get()), reinterpret_cast(outputPtrs.get())); } -bool SplitOp::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility SplitOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } OPERATION_REGISTER(SplitOp, Split); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/split.hpp b/modules/nvidia_plugin/src/ops/split.hpp index b38be6a0d..f6eda60bc 100644 --- a/modules/nvidia_plugin/src/ops/split.hpp +++ b/modules/nvidia_plugin/src/ops/split.hpp @@ -26,7 +26,7 @@ class SplitOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/strided_slice.cpp b/modules/nvidia_plugin/src/ops/strided_slice.cpp index 992eab97b..5d028ceb6 100644 --- a/modules/nvidia_plugin/src/ops/strided_slice.cpp +++ b/modules/nvidia_plugin/src/ops/strided_slice.cpp @@ -103,7 +103,9 @@ void StridedSliceOp::Execute(const InferenceRequestContext& context, } template -bool StridedSliceOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility 
StridedSliceOp::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} template WorkbufferRequest StridedSliceOp::GetWorkBufferRequest() const { diff --git a/modules/nvidia_plugin/src/ops/strided_slice.hpp b/modules/nvidia_plugin/src/ops/strided_slice.hpp index f154e8967..9754cafb3 100644 --- a/modules/nvidia_plugin/src/ops/strided_slice.hpp +++ b/modules/nvidia_plugin/src/ops/strided_slice.hpp @@ -39,7 +39,7 @@ class StridedSliceOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/subgraph.cpp b/modules/nvidia_plugin/src/ops/subgraph.cpp index 012cbb7a4..b24a80768 100644 --- a/modules/nvidia_plugin/src/ops/subgraph.cpp +++ b/modules/nvidia_plugin/src/ops/subgraph.cpp @@ -6,6 +6,7 @@ #include +#include #include #include #include @@ -24,23 +25,29 @@ SubGraph::SubGraph(const CreationContext& context, const SubGraphOp& op, IndexCollection&& inputIds, IndexCollection&& outputIds) - : OperationBase(context, op, std::move(inputIds), std::move(outputIds)), model_{op.get_function()} { + : OperationBase(context, op, std::move(inputIds), std::move(outputIds)), + model_{op.get_function()}, + creation_context_{context} { const bool isStableParamsAndResultsNeeded = nullptr != dynamic_cast(&op); - initExecuteSequence(context, isStableParamsAndResultsNeeded, isStableParamsAndResultsNeeded); + initExecuteSequence(isStableParamsAndResultsNeeded, isStableParamsAndResultsNeeded); } SubGraph::SubGraph(const CreationContext& context, const std::shared_ptr& model) - : OperationBase(context, nullptr), model_{model} { - initExecuteSequence(context, false, false); + : OperationBase(context, nullptr), model_{model}, creation_context_{context} { + initExecuteSequence(false, false); } SubGraph::SubGraph(const CreationContext& context, const std::shared_ptr& model, - ExecSequence&& sequence, - std::shared_ptr memoryManager) - : OperationBase{context, nullptr}, model_{model}, exec_sequence_{sequence}, memory_manager_{memoryManager} {} - -void SubGraph::initExecuteSequence(const CreationContext& context, bool isStableParams, bool isStableResults) { + const ExecSequence& sequence, + const std::shared_ptr& memoryManager) + : OperationBase{context, nullptr}, + memory_manager_{memoryManager}, + exec_sequence_{sequence}, + model_{model}, + creation_context_{context} {} + +void SubGraph::initExecuteSequence(bool isStableParams, bool isStableResults) { static constexpr auto InitNeeded = IOperationExec::WorkbufferStatus::InitNeeded; if (!model_) { @@ -65,7 +72,7 @@ void SubGraph::initExecuteSequence(const CreationContext& context, bool isStable } auto inIds = opBuffersExtractor.inputTensorIds(*node); auto outIds = opBuffersExtractor.outputTensorIds(*node); - auto operation = OperationRegistry::getInstance().createOperation(context, node, move(inIds), move(outIds)); + auto operation = OperationRegistry::getInstance().createOperation(creation_context_, node, move(inIds), move(outIds)); if (dynamic_cast(operation.get())) { continue; } @@ -110,6 +117,20 @@ std::unique_ptr SubGraph::createMemoryManager(const OperationBuff return std::make_unique(shared_constants_blob, memory_model, immutable_workbuffers); } +std::size_t SubGraph::GetCudaGraphsCount() const { + if (!hasTopologyRunners()) { + 
return graph_compatibility_ == CudaGraphCompatibility::NONE ? 0 : 1; + } + auto count = runner_ ? runner_->GetCudaGraphsCount() : static_cast(0); + for (const auto& op : exec_sequence_) { + const auto sg = std::dynamic_pointer_cast(op); + if (sg) { + count += sg->GetCudaGraphsCount(); + } + } + return count; +} + void SubGraph::initSharedImmutableWorkbuffers(const std::vector& init_sequence) { for (auto op : init_sequence) { op->InitSharedImmutableWorkbuffers(getSharedWorkbuffers(*op)); @@ -128,43 +149,63 @@ std::vector> SubGraph::getSharedWorkbuffers(const IOperatio return result; } -void SubGraph::Capture(InferenceRequestContext &context, Inputs, Outputs, - const Workbuffers &workbuffers) const { +void SubGraph::Execute(const InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const { const auto& stream = context.getThreadContext().stream(); const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); - executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context); + executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); } -WorkbufferRequest SubGraph::GetWorkBufferRequest() const { - const auto memoryBlockSize = memory_manager_->mutableTensorsMemoryModel()->deviceMemoryBlockSize(); - return {{}, {memoryBlockSize}}; +CudaGraphCompatibility SubGraph::GetCudaGraphCompatibility() const { + if (!is_compatibility_analyzed_) { + graph_compatibility_ = CudaGraphCompatibility::FULL; + for (const auto& op : exec_sequence_) { + auto opCompatability = op->GetCudaGraphCompatibility(); + if (opCompatability == CudaGraphCompatibility::SPECIAL) { + graph_compatibility_ = opCompatability; + } else if (opCompatability == CudaGraphCompatibility::NONE) { + graph_compatibility_ = opCompatability; + break; + } + } + is_compatibility_analyzed_ = true; + } + return graph_compatibility_; } -void SubGraph::Execute(const InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const { +void SubGraph::Capture(InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const { const auto& stream = context.getThreadContext().stream(); const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); - executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); + executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context); } -bool SubGraph::IsCudaGraphCompatible() const { - if (is_cuda_graph_compatible_ == CompatibleState::NOT_INITIALIZED) { - is_cuda_graph_compatible_ = CompatibleState::COMPATIBLE; - for (const auto& op : exec_sequence_) { - if (!op->IsCudaGraphCompatible()) { - is_cuda_graph_compatible_ = CompatibleState::NOT_COMPATIBLE; - break; - } - } - } - return is_cuda_graph_compatible_ == CompatibleState::COMPATIBLE; +void SubGraph::ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); + const auto& memoryManager = *memory_manager_; + auto& mutableBuffer = workbuffers.mutable_buffers.at(0); + + auto& executionDelegator = context.getExecutionDelegator(); + executionDelegator.set_stream(stream); + 
executionDelegator.execute_graph_sequence(this, memoryManager, mutableBuffer, context); +} + +void SubGraph::initializeRunner() { + runner_ = std::make_shared(creation_context_, model_, exec_sequence_, memory_manager_); +} + +WorkbufferRequest SubGraph::GetWorkBufferRequest() const { + const auto memoryBlockSize = memory_manager_->mutableTensorsMemoryModel()->deviceMemoryBlockSize(); + return {{}, {memoryBlockSize}}; } } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/subgraph.hpp b/modules/nvidia_plugin/src/ops/subgraph.hpp index 443aeea96..8052ab958 100644 --- a/modules/nvidia_plugin/src/ops/subgraph.hpp +++ b/modules/nvidia_plugin/src/ops/subgraph.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -22,8 +23,8 @@ class SubGraph : public OperationBase { SubGraph(const CreationContext& context, const std::shared_ptr& model, - ExecSequence&& sequence, - std::shared_ptr memoryManager); + const ExecSequence& sequence, + const std::shared_ptr& memoryManager); virtual ~SubGraph() = default; @@ -32,12 +33,19 @@ class SubGraph : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; + void Capture(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + void ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const override; + + virtual void initializeRunner(); inline std::shared_ptr memoryManager() const { return memory_manager_; } @@ -48,13 +56,35 @@ class SubGraph : public OperationBase { const std::vector& getParams() const; const std::vector& getResults() const; + bool hasTopologyRunners() const { + if (runners_status_ == NestedRunnersStatus::UNKNOWN) { + if (runner_ != nullptr) { + runners_status_ = NestedRunnersStatus::PRESENT; + } else { + runners_status_ = NestedRunnersStatus::ABSENT; + for (const auto& op : exec_sequence_) { + const auto sg = std::dynamic_pointer_cast(op); + if (sg && sg->hasTopologyRunners()) { + runners_status_ = NestedRunnersStatus::PRESENT; + break; + } + } + } + } + return runners_status_ == NestedRunnersStatus::PRESENT; + } + + virtual std::size_t GetCudaGraphsCount() const; + private: void initSharedImmutableWorkbuffers(const std::vector& init_sequence); - void initExecuteSequence(const CreationContext& context, bool isStableParams, bool isStableResults); + void initExecuteSequence(bool isStableParams, bool isStableResults); static std::unique_ptr createMemoryManager(const OperationBuffersExtractor& opBuffersExtractor); std::vector> getSharedWorkbuffers(const IOperationExec& operation); protected: + enum class NestedRunnersStatus { UNKNOWN = -1, ABSENT, PRESENT }; + using SubGraphOp = ov::op::util::SubGraphOp; SubGraph(const CreationContext& context, @@ -78,8 +108,6 @@ class SubGraph : public OperationBase { ov::Shape shape_{}; }; - enum class CompatibleState { NOT_INITIALIZED = -1, NOT_COMPATIBLE, COMPATIBLE }; - std::shared_ptr memory_manager_; std::vector params_; std::vector params_info_; @@ -88,7 +116,12 @@ class SubGraph : public OperationBase { std::vector results_info_; std::shared_ptr model_; - mutable CompatibleState is_cuda_graph_compatible_ = CompatibleState::NOT_INITIALIZED; + const CreationContext& creation_context_; + std::shared_ptr runner_ = nullptr; + + mutable CudaGraphCompatibility graph_compatibility_; + 
mutable bool is_compatibility_analyzed_ = false; + mutable NestedRunnersStatus runners_status_{NestedRunnersStatus::UNKNOWN}; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/swish.cpp b/modules/nvidia_plugin/src/ops/swish.cpp index 61244cef9..7308d7e07 100644 --- a/modules/nvidia_plugin/src/ops/swish.cpp +++ b/modules/nvidia_plugin/src/ops/swish.cpp @@ -69,7 +69,7 @@ void SwishOp::Execute(const InferenceRequestContext& context, (*kernel_)(stream.get(), inputTensors[0].get(), outputTensors[0].get()); } -bool SwishOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility SwishOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(SwishOp, Swish); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/swish.hpp b/modules/nvidia_plugin/src/ops/swish.hpp index 26a353850..8f1ddbb3a 100644 --- a/modules/nvidia_plugin/src/ops/swish.hpp +++ b/modules/nvidia_plugin/src/ops/swish.hpp @@ -24,7 +24,7 @@ class SwishOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp index c92238aec..0f4ddb230 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp @@ -133,6 +133,28 @@ TensorIteratorOp::TensorIteratorOp(const CreationContext& context, } updateExecSequence(); + + // Input mapping of ports + slices_.reserve(portmap_inputs_.size()); + for (const auto& it : portmap_inputs_) { + const auto& inputIdx = it.first; + const auto& paramIdx = inputs_parameters_map_.at(inputIdx); + slices_.emplace_back(*this, inputIdx, paramIdx); + } + + // Back-edge mapping + transfers_.reserve(results_parameters_map_.size()); + for (const auto& [resultIdx, paramIdx] : results_parameters_map_) { + transfers_.emplace_back(*this, resultIdx, paramIdx); + } + + // Output mapping of ports + inserts_.reserve(results_outputs_map_.size()); + for (const auto& [resultIdx, outputIdx] : results_outputs_map_) { + if (portmap_outputs_.count(outputIdx) > 0) { + inserts_.emplace_back(*this, resultIdx, outputIdx); + } + } } void TensorIteratorOp::Execute(const InferenceRequestContext& context, @@ -142,64 +164,154 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context, const auto& stream = context.getThreadContext().stream(); const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); - auto& cancellationToken = context.getCancellationToken(); auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); // First iteration for (const auto inputIdx : invariant_inputs_) { const auto paramIdx = inputs_parameters_map_.at(inputIdx); - copyParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); } for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) { if (portmap_inputs_.count(inputIdx) == 0) { - copyParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); } } for (int64_t iter = 0; iter < num_iterations_; ++iter) { - // Input mapping of ports - for (auto& it : portmap_inputs_) { - const auto& inputIdx = it.first; - 
const auto& paramIdx = inputs_parameters_map_.at(inputIdx); - copyParam(stream, mutableBuffer, inputTensors, iter, inputIdx, paramIdx); + for (const auto& slice : slices_) { + slice(stream, inputTensors, mutableBuffer, iter); } // Inner loop executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); // Back-edge mapping - for (auto& [resultIdx, paramIdx] : results_parameters_map_) { - copyBackEdge(stream, mutableBuffer, resultIdx, paramIdx); + for (const auto& transfer : transfers_) { + transfer(stream, mutableBuffer); } // Output mapping of ports - for (const auto& [resultIdx, outputIdx] : results_outputs_map_) { - if (portmap_outputs_.count(outputIdx) > 0) { - copyResult(stream, mutableBuffer, outputTensors, iter, resultIdx, outputIdx); - } + for (const auto& insert : inserts_) { + insert(stream, mutableBuffer, outputTensors, iter); } // Copy data to output if (iterations_results_map_.count(iter) > 0) { for (const auto& resultIdx : iterations_results_map_.at(iter)) { const auto& outputIdx = results_outputs_map_.at(resultIdx); - copyResult(stream, mutableBuffer, outputTensors, iter, resultIdx, outputIdx); + transferResult(stream, mutableBuffer, outputTensors, iter, resultIdx, outputIdx); + } + } + } +} + +CudaGraphCompatibility TensorIteratorOp::GetCudaGraphCompatibility() const { + // This implementation is CUDA graph compatible only if this is the standard TI with output only of the last + // iteration (which is handled outside of the iterations loop) + if (iterations_results_map_.size() != 1 || iterations_results_map_.count(num_iterations_ - 1) == 0) { + return CudaGraphCompatibility::NONE; + } + if (!is_compatibility_analyzed_) { + graph_compatibility_ = CudaGraphCompatibility::NONE; + for (const auto& op : exec_sequence_) { + auto opCompatability = op->GetCudaGraphCompatibility(); + if (opCompatability == CudaGraphCompatibility::SPECIAL || opCompatability == CudaGraphCompatibility::FULL) { + graph_compatibility_ = CudaGraphCompatibility::SPECIAL; + break; } } + is_compatibility_analyzed_ = true; } + return graph_compatibility_; } -// TODO: Investigate problem with multi-graphs in some networks -// benchmark_app may hang in throughput mode -bool TensorIteratorOp::IsCudaGraphCompatible() const { return false; } +void TensorIteratorOp::initializeRunner() { + // For better performance nested topology runners are not used if all operations in execution sequence are + // fully CUDA graph compatible + if (std::any_of(exec_sequence_.begin(), exec_sequence_.end(), [](const auto& op) { + return op->GetCudaGraphCompatibility() != CudaGraphCompatibility::FULL; + })) { + SubGraph::initializeRunner(); + } +} -void TensorIteratorOp::Capture(InferenceRequestContext& context, - Inputs inputTensors, - Outputs outputTensors, - const Workbuffers& workbuffers) const { - Execute(context, inputTensors, outputTensors, workbuffers); +TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx) + : input_idx_{inputIdx}, + param_{*ti.params_[paramIdx]}, + memory_manager_{*ti.memory_manager_}, + slice_{ti.kernelmap_inputs_.at(inputIdx)} { + OPENVINO_ASSERT(ti.portmap_inputs_.count(inputIdx) != 0, "Node name: ", ti.GetName()); + const auto& portMap = ti.portmap_inputs_.at(input_idx_); + const auto& inputShape = ti.inputs_info_[input_idx_].shape_; + start_ = portMap.start < 0 ? 
inputShape[portMap.axis] + portMap.start : portMap.start; + stride_ = portMap.stride; +} + +void TensorIteratorOp::SliceLauncher::addKernelNode(ICudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors) { + const auto* src = inputTensors[input_idx_].get(); + auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); + info.add_kernel(stream, + slice_.getKernel(), + slice_.getNumBlocks(), + slice_.getThreadsPerBlock(), + slice_.getPropsPtr(), + start_, + slice_.getSize(), + src, + dst); +} + +TensorIteratorOp::TransferLauncher::TransferLauncher(const TensorIteratorOp& ti, uint64_t resultIdx, uint64_t paramIdx) + : param_{*ti.params_[paramIdx]}, result_{*ti.results_[resultIdx]}, memory_manager_{*ti.memory_manager_} { + param_size_ = ti.params_info_[paramIdx].size_; + const auto resultSize = ti.results_info_[resultIdx].size_; + OPENVINO_ASSERT(param_size_ == resultSize, "Node name: ", ti.GetName()); +} + +void TensorIteratorOp::TransferLauncher::addTransferNode(ICudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer) { + const auto& paramTensors = memory_manager_.outputTensorPointers(param_, mutableBuffer); + auto dst = paramTensors[0]; + const auto& resultTensors = memory_manager_.inputTensorPointers(result_, mutableBuffer); + const auto src = resultTensors[0]; + info.add_transfer(stream, dst, src, param_size_); +} + +TensorIteratorOp::InsertLauncher::InsertLauncher(const TensorIteratorOp& ti, + const std::size_t resultIdx, + const std::size_t outputIdx) + : output_idx_{outputIdx}, + result_{*ti.results_[resultIdx]}, + memory_manager_{*ti.memory_manager_}, + insert_{ti.kernelmap_outputs_.at(outputIdx)} { + OPENVINO_ASSERT(ti.portmap_outputs_.count(outputIdx) != 0, "Node name: ", ti.GetName()); + const auto& portMap = ti.portmap_outputs_.at(output_idx_); + const auto& outputShape = ti.outputs_info_[output_idx_].shape_; + start_ = portMap.start < 0 ? 
outputShape[portMap.axis] + portMap.start : portMap.start; + stride_ = portMap.stride; +} + +void TensorIteratorOp::InsertLauncher::addKernelNode(ICudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors) { + const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get(); + auto* dst = outputTensors[output_idx_].get(); + info.add_kernel(stream, + insert_.getKernel(), + insert_.getNumBlocks(), + insert_.getThreadsPerBlock(), + insert_.getPropsPtr(), + start_, + insert_.getSize(), + src, + dst); } WorkbufferRequest TensorIteratorOp::GetWorkBufferRequest() const { @@ -227,87 +339,226 @@ void TensorIteratorOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { } } -void TensorIteratorOp::copyParam(const CUDA::Stream& stream, - const CUDA::DevicePointer mutableBuffer, - const IOperationExec::Inputs& inputTensors, - const std::int64_t iter, - const uint64_t inputIdx, - const uint64_t paramIdx) const { - auto& memoryManager = *memory_manager_; - const std::size_t inputSize = inputs_info_[inputIdx].size_; - const std::size_t paramSize = params_info_[paramIdx].size_; - if (portmap_inputs_.count(inputIdx) == 0) { - auto& input = inputTensors[inputIdx]; - const auto& param = params_[paramIdx]; - auto outputTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); - OPENVINO_ASSERT(inputSize == paramSize, "Node name: ", GetName()); - stream.transfer(outputTensors[0], input, inputSize); - } else { - const auto& portMap = portmap_inputs_.at(inputIdx); - const auto& param = params_[paramIdx]; - auto outputTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); - const auto inputShape = inputs_info_[inputIdx].shape_; +void TensorIteratorOp::CaptureSingle(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); + const auto& memoryManager = *memory_manager_; + auto& mutableBuffer = workbuffers.mutable_buffers.at(0); + auto& executionDelegator = context.getExecutionDelegator(); + executionDelegator.set_stream(stream); + // auto& graphInfo = context.getCudaGraphContext().get_current_graph(); + auto& graphInfo = context.getCurrentCudaGraphInfo(); + OPENVINO_ASSERT(!graphInfo.is_nested(), "For single-graph mode graphInfo shouldn't be nested"); + + CUDA::GraphCapture capture{stream}; + { + auto scope = capture.getScope(); + // Input mapping of ports + for (auto& slice : slices_) { + slice.addKernelNode(graphInfo, stream, mutableBuffer, inputTensors); + } + + // Inner loop + executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context); + + // Back-edge mapping + for (auto& transfer : transfers_) { + transfer.addTransferNode(graphInfo, stream, mutableBuffer); + } - const auto& slice = kernelmap_inputs_.at(inputIdx); - std::size_t start; - if (portMap.start < 0) { - start = inputShape[portMap.axis] + portMap.start; - } else { - start = portMap.start; + // Output mapping of ports + for (auto& insert : inserts_) { + insert.addKernelNode(graphInfo, stream, mutableBuffer, outputTensors); } - start += iter * portMap.stride; - auto input = inputTensors[inputIdx]; - slice(stream.get(), input.get(), outputTensors[0].get(), start); } + graphInfo.set_current_graph(capture.getGraph()); } -void TensorIteratorOp::copyBackEdge(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const uint64_t resultIdx, - const uint64_t paramIdx) const { 
+void TensorIteratorOp::ExecuteGraphSingle(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); + const auto& memoryManager = *memory_manager_; + const auto& mutableBuffer = workbuffers.mutable_buffers.at(0); + + // First iteration + for (const auto inputIdx : invariant_inputs_) { + const auto paramIdx = inputs_parameters_map_.at(inputIdx); + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + } + for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) { + if (portmap_inputs_.count(inputIdx) == 0) { + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + } + } + + // auto& graphInfo = context.getCudaGraphContext().get_current_graph(); + auto& graphInfo = context.getCurrentCudaGraphInfo(); + OPENVINO_ASSERT(graphInfo.get_kernels_count() == slices_.size() + inserts_.size(), + "CudaGraphContext/TensorIteratorOp slices or inserts count incosistency"); + + // TI body loop + for (int64_t iter = 0; iter < num_iterations_; ++iter) { + for (std::size_t i = 0; i < slices_.size(); ++i) { + slices_[i].updateKernelNode(graphInfo, i, mutableBuffer, inputTensors, iter); + } + for (std::size_t i = 0; i < inserts_.size(); ++i) { + inserts_[i].updateKernelNode(graphInfo, i + slices_.size(), mutableBuffer, outputTensors, iter); + } + graphInfo.launch(stream); + } + + // Copy data to output + if (iterations_results_map_.count(num_iterations_ - 1) > 0) { + for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) { + const auto& outputIdx = results_outputs_map_.at(resultIdx); + transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx); + } + } +} + +void TensorIteratorOp::CaptureMulti(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); + const auto& memoryManager = *memory_manager_; + auto& mutableBuffer = workbuffers.mutable_buffers.at(0); + auto& executionDelegator = context.getExecutionDelegator(); + executionDelegator.set_stream(stream); + auto& graphPack = context.getCurrentCudaGraphInfo(); + OPENVINO_ASSERT(graphPack.is_nested(), "For multi-graph mode graphPack should be nested"); + + graphPack.add(CudaGraphInfo::create()); + CUDA::GraphCapture capture{stream}; + { + auto scope = capture.getScope(); + // Input mapping of ports + for (auto& slice : slices_) { + slice.addKernelNode(graphPack, stream, mutableBuffer, inputTensors); + } + } + graphPack.set_current_graph(capture.getGraph()); + + auto& bodyGraphInfo = graphPack.add(CudaGraphPack::create()); + context.setCurrentCudaGraphInfo(bodyGraphInfo); + // Inner loop + runner_->Capture(context, workbuffers); + + graphPack.add(CudaGraphInfo::create()); + CUDA::GraphCapture capture2{stream}; + { + auto scope = capture2.getScope(); + // Back-edge mapping + for (auto& transfer : transfers_) { + transfer.addTransferNode(graphPack, stream, mutableBuffer); + } + + // Output mapping of ports + for (auto& insert : inserts_) { + insert.addKernelNode(graphPack, stream, mutableBuffer, outputTensors); + } + } + graphPack.set_current_graph(capture2.getGraph()); +} + +void TensorIteratorOp::ExecuteGraphMulti(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); + const 
auto& memoryManager = *memory_manager_; + const auto& mutableBuffer = workbuffers.mutable_buffers.at(0); + + // First iteration + for (const auto inputIdx : invariant_inputs_) { + const auto paramIdx = inputs_parameters_map_.at(inputIdx); + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + } + for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) { + if (portmap_inputs_.count(inputIdx) == 0) { + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + } + } + + auto& graphPack = context.getCurrentCudaGraphInfo(); + OPENVINO_ASSERT(graphPack.get_kernels_count() == slices_.size() + inserts_.size(), + "CudaGraphContext/TensorIteratorOp slices or inserts count incosistency"); + + OPENVINO_ASSERT(graphPack.get_graphs_count() == 3, "Current graphPack should contain 3 sub-elements"); + + graphPack.select_current_graph(0); + auto& preInfo = graphPack.get_current_graph(); + + graphPack.select_current_graph(1); + auto& bodyContext = graphPack.get_current_graph(); + + graphPack.select_current_graph(2); + auto& postInfo = graphPack.get_current_graph(); + + // TI body loop + for (int64_t iter = 0; iter < num_iterations_; ++iter) { + for (std::size_t i = 0; i < slices_.size(); ++i) { + slices_[i].updateKernelNode(preInfo, i, mutableBuffer, inputTensors, iter); + } + preInfo.launch(stream); + + context.setCurrentCudaGraphInfo(bodyContext); + runner_->Run(context, workbuffers); + + for (std::size_t i = 0; i < inserts_.size(); ++i) { + inserts_[i].updateKernelNode(postInfo, i, mutableBuffer, outputTensors, iter); + } + postInfo.launch(stream); + } + // Copy data to output + if (iterations_results_map_.count(num_iterations_ - 1) > 0) { + for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) { + const auto& outputIdx = results_outputs_map_.at(resultIdx); + transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx); + } + } +} + +void TensorIteratorOp::transferParam(const CUDA::Stream& stream, + const CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors, + const std::int64_t iter, + const uint64_t inputIdx, + const uint64_t paramIdx) const { + OPENVINO_ASSERT(portmap_inputs_.count(inputIdx) == 0, "Node name: ", GetName()); auto& memoryManager = *memory_manager_; - const auto& result = results_[resultIdx]; - const auto& param = params_[paramIdx]; - auto paramTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); - auto resultTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); + const std::size_t inputSize = inputs_info_[inputIdx].size_; const std::size_t paramSize = params_info_[paramIdx].size_; - const std::size_t resultSize = results_info_[resultIdx].size_; - OPENVINO_ASSERT(paramSize == resultSize, "Node name: ", GetName()); - stream.transfer(paramTensors[0], resultTensors[0], paramSize); + + auto& input = inputTensors[inputIdx]; + const auto& param = params_[paramIdx]; + auto outputTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); + OPENVINO_ASSERT(inputSize == paramSize, "Node name: ", GetName()); + + stream.transfer(outputTensors[0], input, inputSize); } -void TensorIteratorOp::copyResult(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const IOperationExec::Outputs& outputTensors, - const std::int64_t iter, - const std::size_t resultIdx, - const std::size_t outputIdx) const { +void TensorIteratorOp::transferResult(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const 
IOperationExec::Outputs& outputTensors, + const std::int64_t iter, + const std::size_t resultIdx, + const std::size_t outputIdx) const { + OPENVINO_ASSERT(portmap_outputs_.count(outputIdx) == 0, "Node name: ", GetName()); auto& memoryManager = *memory_manager_; const auto resultSize = results_info_[resultIdx].size_; const std::size_t outputSize = outputs_info_[outputIdx].size_; - if (portmap_outputs_.count(outputIdx) == 0) { - const auto result = results_[resultIdx]; - auto inTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); - const auto output = outputTensors[outputIdx]; - OPENVINO_ASSERT(resultSize == outputSize, "Node name: ", GetName()); - stream.transfer(output, inTensors[0], outputSize); - } else { - auto output = outputTensors[outputIdx]; - const auto& result = results_[resultIdx]; - auto inputTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); - const auto portMap = portmap_outputs_.at(outputIdx); - const auto outputShape = outputs_info_[outputIdx].shape_; - - const auto& insert = kernelmap_outputs_.at(outputIdx); - std::size_t start; - if (portMap.start < 0) { - start = outputShape[portMap.axis] + portMap.start; - } else { - start = portMap.start; - } - start += iter * portMap.stride; - insert(stream.get(), inputTensors[0].get(), output.get(), start); - } + + const auto result = results_[resultIdx]; + auto inTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); + const auto output = outputTensors[outputIdx]; + OPENVINO_ASSERT(resultSize == outputSize, "Node name: ", GetName()); + + stream.transfer(output, inTensors[0], outputSize); } void TensorIteratorOp::updateExecSequence() { diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp index d172f360f..7e568a1e7 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -27,12 +28,38 @@ class TensorIteratorOp : public SubGraph { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void Capture(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, - const Workbuffers& workbuffers) const override; + const Workbuffers& workbuffers) const override { + if (hasTopologyRunners()) { + CaptureMulti(context, inputTensors, outputTensors, workbuffers); + } else { + CaptureSingle(context, inputTensors, outputTensors, workbuffers); + } + } + + void ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const override { + if (hasTopologyRunners()) { + ExecuteGraphMulti(context, inputTensors, outputTensors, workbuffers); + } else { + ExecuteGraphSingle(context, inputTensors, outputTensors, workbuffers); + } + } + + void initializeRunner() override; + + std::size_t GetCudaGraphsCount() const override { + if (hasTopologyRunners()) { + return 3; + } + return 1; + } private: struct PortMap { @@ -43,25 +70,139 @@ class TensorIteratorOp : public SubGraph { int64_t axis{0}; }; + class SliceLauncher { + public: + SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx); + + void operator()(const CUDA::Stream& stream, + 
const IOperationExec::Inputs& inputTensors, + CUDA::DevicePointer mutableBuffer, + int64_t iter) const { + const auto* src = inputTensors[input_idx_].get(); + auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); + slice_(stream.get(), src, dst, start_ + iter * stride_); + } + + void addKernelNode(ICudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors); + + void updateKernelNode(ICudaGraphInfo& info, + std::size_t index, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors, + int64_t iter) { + const auto* src = inputTensors[input_idx_].get(); + auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); + info.update_kernel(index, slice_.getPropsPtr(), start_ + iter * stride_, slice_.getSize(), src, dst); + } + + private: + uint64_t input_idx_; + const OperationBase& param_; + const MemoryManager& memory_manager_; + const kernel::Slice& slice_; + size_t start_; + int64_t stride_; + }; + + class TransferLauncher { + public: + TransferLauncher(const TensorIteratorOp& ti, uint64_t resultIdx, uint64_t paramIdx); + + void operator()(const CUDA::Stream& stream, CUDA::DevicePointer mutableBuffer) const { + const auto& paramTensors = memory_manager_.outputTensorPointers(param_, mutableBuffer); + const auto& resultTensors = memory_manager_.inputTensorPointers(result_, mutableBuffer); + auto* dst = paramTensors[0].get(); + const auto* src = resultTensors[0].get(); + throwIfError(cudaMemcpyAsync(dst, src, param_size_, cudaMemcpyDeviceToDevice, stream.get())); + } + + void addTransferNode(ICudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer); + + private: + const OperationBase& param_; + const OperationBase& result_; + const MemoryManager& memory_manager_; + std::size_t param_size_; + }; + + class InsertLauncher { + public: + InsertLauncher(const TensorIteratorOp& ti, const std::size_t resultIdx, const std::size_t outputIdx); + + void operator()(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + int64_t iter) const { + const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get(); + auto* dst = outputTensors[output_idx_].get(); + insert_(stream.get(), src, dst, start_ + iter * stride_); + } + + void addKernelNode(ICudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors); + + void updateKernelNode(ICudaGraphInfo& info, + std::size_t index, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + int64_t iter) { + const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get(); + auto* dst = outputTensors[output_idx_].get(); + info.update_kernel(index, insert_.getPropsPtr(), start_ + iter * stride_, insert_.getSize(), src, dst); + } + + private: + uint64_t output_idx_; + const OperationBase& result_; + const MemoryManager& memory_manager_; + size_t start_; + int64_t stride_; + const kernel::Insert& insert_; + }; + WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; - void copyParam(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const IOperationExec::Inputs& inputTensors, - std::int64_t iter, - uint64_t inputIdx, - uint64_t paramIdx) const; - void copyBackEdge(const CUDA::Stream& stream, - 
CUDA::DevicePointer mutableBuffer, - uint64_t resultIdx, - uint64_t paramIdx) const; - void copyResult(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const IOperationExec::Outputs& outputTensors, - int64_t iter, - std::size_t resultIdx, - std::size_t outputIdx) const; + void CaptureSingle(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const; + + void ExecuteGraphSingle(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const; + + void CaptureMulti(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const; + + void ExecuteGraphMulti(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const; + + void transferParam(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors, + std::int64_t iter, + uint64_t inputIdx, + uint64_t paramIdx) const; + + void transferResult(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + int64_t iter, + std::size_t resultIdx, + std::size_t outputIdx) const; void updateExecSequence(); @@ -78,6 +219,10 @@ class TensorIteratorOp : public SubGraph { std::unordered_map portmap_outputs_; std::unordered_map kernelmap_outputs_; std::unordered_map results_parameters_map_; + + mutable std::vector slices_; + mutable std::vector transfers_; + mutable std::vector inserts_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/topk.cpp b/modules/nvidia_plugin/src/ops/topk.cpp index 7b72deaa6..83af29d21 100644 --- a/modules/nvidia_plugin/src/ops/topk.cpp +++ b/modules/nvidia_plugin/src/ops/topk.cpp @@ -172,7 +172,7 @@ void TopKOp::Execute(const InferenceRequestContext& context, static_cast(kernel_param.get())); } -bool TopKOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility TopKOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void TopKOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/topk.hpp b/modules/nvidia_plugin/src/ops/topk.hpp index bf311c1ed..aeb1ea01d 100644 --- a/modules/nvidia_plugin/src/ops/topk.hpp +++ b/modules/nvidia_plugin/src/ops/topk.hpp @@ -25,7 +25,7 @@ class TopKOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/transpose.cpp b/modules/nvidia_plugin/src/ops/transpose.cpp index 1ab7085e0..0cbe59947 100644 --- a/modules/nvidia_plugin/src/ops/transpose.cpp +++ b/modules/nvidia_plugin/src/ops/transpose.cpp @@ -113,7 +113,7 @@ void TransposeOp::Execute(const InferenceRequestContext& context, context.getThreadContext().stream().get())); } -bool TransposeOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility TransposeOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } std::vector TransposeOp::extractInputExtents(const ov::Node& node) { std::vector result; diff --git a/modules/nvidia_plugin/src/ops/transpose.hpp 
b/modules/nvidia_plugin/src/ops/transpose.hpp index cab45c730..bfb9fd099 100644 --- a/modules/nvidia_plugin/src/ops/transpose.hpp +++ b/modules/nvidia_plugin/src/ops/transpose.hpp @@ -24,7 +24,7 @@ class TransposeOp : public OperationCuTensor { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: using ExtentsMap = std::unordered_map; diff --git a/modules/nvidia_plugin/src/ops/variadic_split.cpp b/modules/nvidia_plugin/src/ops/variadic_split.cpp index 807c2bdca..e83ba5ec5 100644 --- a/modules/nvidia_plugin/src/ops/variadic_split.cpp +++ b/modules/nvidia_plugin/src/ops/variadic_split.cpp @@ -199,7 +199,7 @@ void VariadicSplitOp::Execute(const InferenceRequestContext& context, static_cast(axis_offset_sizes.get())); } -bool VariadicSplitOp::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility VariadicSplitOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } OPERATION_REGISTER(VariadicSplitOp, VariadicSplit); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/variadic_split.hpp b/modules/nvidia_plugin/src/ops/variadic_split.hpp index 82580d546..0ede2a7dc 100644 --- a/modules/nvidia_plugin/src/ops/variadic_split.hpp +++ b/modules/nvidia_plugin/src/ops/variadic_split.hpp @@ -28,7 +28,7 @@ class VariadicSplitOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: enum { kOutputPtrsMWBIdx = 0, kNumberOfMWBIdx }; diff --git a/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp b/modules/nvidia_plugin/tests/unit/cuda_graph_compatibility.cpp similarity index 95% rename from modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp rename to modules/nvidia_plugin/tests/unit/cuda_graph_compatibility.cpp index 02ab32e2f..08766acdb 100644 --- a/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp +++ b/modules/nvidia_plugin/tests/unit/cuda_graph_compatibility.cpp @@ -22,7 +22,7 @@ using namespace ov::nvidia_gpu; using DevPtr = CUDA::DevicePointer; using CDevPtr = CUDA::DevicePointer; -struct IsCudaGraphCompatibleTest : testing::Test { +struct CudaGraphCompatibilityTest : testing::Test { template static void generate(C& c) { std::random_device randDevice; @@ -40,7 +40,7 @@ struct IsCudaGraphCompatibleTest : testing::Test { OperationBase::Outputs outputs, const Workbuffers& workbuffers) { auto& stream = context.getThreadContext().stream(); - if (operation->IsCudaGraphCompatible()) { + if (operation->GetCudaGraphCompatibility() == CudaGraphCompatibility::FULL) { stream.synchronize(); CUDA::GraphCapture capture{stream}; { @@ -59,7 +59,7 @@ struct IsCudaGraphCompatibleTest : testing::Test { } }; -struct ReluIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { +struct ReluCudaGraphCompatibilityTest : CudaGraphCompatibilityTest { void run() { using ElementType = float; @@ -130,9 +130,9 @@ struct ReluIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { } }; -TEST_F(ReluIsCudaGraphCompatibleTest, Compatibile) { run(); } +TEST_F(ReluCudaGraphCompatibilityTest, Compatibile) { run(); } -struct ConcatIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { +struct ConcatCudaGraphCompatibilityTest : CudaGraphCompatibilityTest { void run() { using ElementType = float; @@ -228,6 +228,6 @@ struct 
ConcatIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { } }; -TEST_F(ConcatIsCudaGraphCompatibleTest, NotCompatible) { run(); } +TEST_F(ConcatCudaGraphCompatibilityTest, NotCompatible) { run(); } } // namespace diff --git a/modules/nvidia_plugin/tests/unit/cuda_graph_topology_runner_test.cpp b/modules/nvidia_plugin/tests/unit/cuda_graph_topology_runner_test.cpp index 3ee08029a..32ccf1126 100644 --- a/modules/nvidia_plugin/tests/unit/cuda_graph_topology_runner_test.cpp +++ b/modules/nvidia_plugin/tests/unit/cuda_graph_topology_runner_test.cpp @@ -92,7 +92,14 @@ TEST_F(CudaGraphTopologyRunnerTest, CheckMemcpyNodesArePopulated) { TEST_F(CudaGraphTopologyRunnerTest, CheckMemcpyNodesAreUpdated) { runner_.UpdateContext(inferRequestContext_, deviceMemBlock_); - const auto oldCudaGraphContext = cudaGraphContext_; + cudaGraphContext_.select_current_graph(0); + const auto& oldCurrentGraph = cudaGraphContext_.get_current_graph(); + ASSERT_FALSE(oldCurrentGraph.is_nested()); + + const auto& oldInfo = dynamic_cast(oldCurrentGraph); + const auto oldParamNodes = std::map{oldInfo.get_parameter_nodes()}; + const auto oldResultNodes = std::map{oldInfo.get_result_nodes()}; + std::vector> inputTensors{PopulateTensors(model_->inputs())}; std::vector> outputTensors{PopulateTensors(model_->outputs())}; InferenceRequestContext inferRequestContext{inputTensors, @@ -105,12 +112,29 @@ TEST_F(CudaGraphTopologyRunnerTest, CheckMemcpyNodesAreUpdated) { cudaGraphContext_, false}; runner_.UpdateContext(inferRequestContext, deviceMemBlock_); - EXPECT_NE(cudaGraphContext_, oldCudaGraphContext); + + cudaGraphContext_.select_current_graph(0); + const auto& newCurrentGraph = cudaGraphContext_.get_current_graph(); + ASSERT_FALSE(newCurrentGraph.is_nested()); + + const auto& newInfo = dynamic_cast(newCurrentGraph); + const auto& newParamNodes = newInfo.get_parameter_nodes(); + const auto& newResultNodes = newInfo.get_result_nodes(); + + EXPECT_NE(newParamNodes, oldParamNodes); + EXPECT_NE(newResultNodes, oldResultNodes); } TEST_F(CudaGraphTopologyRunnerTest, CheckMemcpyNodesAreNotUpdatedIfPointersUnchanged) { runner_.UpdateContext(inferRequestContext_, deviceMemBlock_); - const auto oldCudaGraphContext = cudaGraphContext_; + cudaGraphContext_.select_current_graph(0); + const auto& oldCurrentGraph = cudaGraphContext_.get_current_graph(); + ASSERT_FALSE(oldCurrentGraph.is_nested()); + + const auto& oldInfo = dynamic_cast(oldCurrentGraph); + const auto oldParamNodes = std::map{oldInfo.get_parameter_nodes()}; + const auto oldResultNodes = std::map{oldInfo.get_result_nodes()}; + InferenceRequestContext inferRequestContext{inputTensors_, inputIndeces_, outputTensors_, @@ -121,5 +145,14 @@ TEST_F(CudaGraphTopologyRunnerTest, CheckMemcpyNodesAreNotUpdatedIfPointersUncha cudaGraphContext_, false}; runner_.UpdateContext(inferRequestContext, deviceMemBlock_); - EXPECT_EQ(cudaGraphContext_, oldCudaGraphContext); + cudaGraphContext_.select_current_graph(0); + const auto& newCurrentGraph = cudaGraphContext_.get_current_graph(); + ASSERT_FALSE(newCurrentGraph.is_nested()); + + const auto& newInfo = dynamic_cast(newCurrentGraph); + const auto& newParamNodes = newInfo.get_parameter_nodes(); + const auto& newResultNodes = newInfo.get_result_nodes(); + + EXPECT_EQ(newParamNodes, oldParamNodes); + EXPECT_EQ(newResultNodes, oldResultNodes); } diff --git a/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp index db3fadb62..4b7188926 100644 --- 
a/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp +++ b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp @@ -83,7 +83,7 @@ class AddMul { static void checkSubGraph(const SubGraph& subGraph) { // Original SubGraph for AddMul network should be CUDA Graph compatible - EXPECT_TRUE(subGraph.IsCudaGraphCompatible()); + EXPECT_EQ(subGraph.GetCudaGraphCompatibility(), CudaGraphCompatibility::FULL); } static std::vector> calcRefs( @@ -129,7 +129,7 @@ class AddConcat { static void checkSubGraph(const SubGraph& subGraph) { // Original SubGraph for AddConcat network should not be CUDA Graph compatible - EXPECT_FALSE(subGraph.IsCudaGraphCompatible()); + EXPECT_EQ(subGraph.GetCudaGraphCompatibility(), CudaGraphCompatibility::NONE); } static std::vector> calcRefs( diff --git a/modules/nvidia_plugin/tests/unit/cuda_multi_graph_ti_test.cpp b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_ti_test.cpp new file mode 100644 index 000000000..5c06d28da --- /dev/null +++ b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_ti_test.cpp @@ -0,0 +1,376 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "cuda_graph_topology_runner.hpp" +#include "cuda_simple_execution_delegator.hpp" +#include "ops/parameter.hpp" +#include "ops/result.hpp" +#include "ov_models/builders.hpp" +#include "ov_models/utils/data_utils.hpp" + +using namespace ov::nvidia_gpu; +using namespace testing; +using ov::test::utils::EltwiseTypes; + +namespace { + +constexpr int TO = 10; +constexpr int FROM = 0; +constexpr int SEED = 1; + +constexpr std::size_t INPUTS_COUNT = 2; +constexpr int64_t CONCAT_AXIS = 0; + +constexpr float THRESHOLD = 0.01f; + +using CalcType = float; +constexpr auto CALC_ELEMENT_TYPE = ov::element::Type_t::f32; + +inline CalcType* getMutablePtr(ov::Tensor& tensor) { return static_cast(tensor.data()); } + +inline const CalcType* getConstPtr(const ov::Tensor& tensor) { return static_cast(tensor.data()); } + +void generateInput(ov::Tensor& tensor, int to = TO, int from = FROM, int seed = SEED) { + EXPECT_EQ(tensor.get_element_type(), CALC_ELEMENT_TYPE); + auto* ptr = getMutablePtr(tensor); + std::mt19937 engine(seed); + std::uniform_real_distribution dist(from, to); + std::generate(ptr, ptr + tensor.get_size(), [&dist, &engine]() { return CalcType{dist(engine)}; }); +} + +std::vector> calcRefs(std::shared_ptr model, + const std::vector>& inputs) { + auto refModel = model->clone(); + + auto referenceInputs = std::vector>(inputs.size()); + auto refInputsTypes = std::vector(inputs.size()); + for (std::size_t i = 0; i < inputs.size(); ++i) { + const auto& input = inputs[i]; + const auto inputSize = input->get_byte_size(); + + auto& referenceInput = referenceInputs[i]; + referenceInput.resize(inputSize); + + const auto* buffer = static_cast(input->data()); + std::copy(buffer, buffer + inputSize, referenceInput.data()); + + refInputsTypes[i] = CALC_ELEMENT_TYPE; + } + + const auto expectedOutputs = ngraph::helpers::interpreterFunction(refModel, referenceInputs, refInputsTypes); + + std::vector> res(expectedOutputs.size()); + for (std::size_t i = 0; i < expectedOutputs.size(); ++i) { + EXPECT_EQ(expectedOutputs[i].first, CALC_ELEMENT_TYPE); + const auto& expOut = expectedOutputs[i].second; + auto& resOut = res[i]; + const auto resOutSize = expOut.size() / sizeof(CalcType); + resOut.resize(resOutSize); + + const auto* buffer = static_cast(static_cast(expOut.data())); + std::copy(buffer, buffer + resOutSize, resOut.data()); + } + return res; +} + 
+void validateOutput(const ov::Tensor& tensor, const std::vector& refVector, float threshold) { + EXPECT_EQ(tensor.get_element_type(), CALC_ELEMENT_TYPE); + const auto size = tensor.get_size(); + EXPECT_EQ(size, refVector.size()); + const auto* ptr = getConstPtr(tensor); + bool areEqual = std::equal(ptr, ptr + size, refVector.cbegin(), [threshold](auto val1, auto val2) { + return std::abs(val1 - val2) < threshold; + }); + EXPECT_TRUE(areEqual); +} + +} // namespace + +class GRUTI { +public: + static std::shared_ptr createNetwork() { + constexpr size_t seqLengths = 20; + constexpr size_t batch = 1; + constexpr size_t hidden_size = 10; + constexpr size_t inputSize = 10; + constexpr size_t seqAxis = 1; + constexpr float clip = 0.0; + constexpr ov::element::Type ngPrc = CALC_ELEMENT_TYPE; + + auto tensorIterator = std::make_shared(); + auto axis = std::make_shared( + ov::element::Type_t::i64, ov::Shape{1}, std::vector{static_cast(seqAxis)}); + std::vector> inputShapes = { + {{batch, seqLengths, inputSize}, + {batch, hidden_size}, + {3 * hidden_size, inputSize}, + {3 * hidden_size, hidden_size}, + {3 * hidden_size}}, + }; + ov::ParameterVector outerParams{std::make_shared(ngPrc, ov::Shape(inputShapes[0])), + std::make_shared(ngPrc, ov::Shape(inputShapes[1]))}; + + inputShapes[0][seqAxis] = 1; // sliced dimension + ov::ParameterVector bodyParams{std::make_shared(ngPrc, ov::Shape(inputShapes[0])), + std::make_shared(ngPrc, ov::Shape(inputShapes[1]))}; + + std::vector WRB = {inputShapes[2], inputShapes[3], inputShapes[4]}; + auto squeeze = std::make_shared(bodyParams[0], axis); + ov::OutputVector out_vector = {squeeze, bodyParams[1]}; + auto gru_cell = + ngraph::builder::makeGRU(out_vector, WRB, hidden_size, {"sigmoid", "tanh"}, {}, {}, clip, false); + auto unsqueeze = std::make_shared(gru_cell->output(0), axis); + ov::ResultVector results{std::make_shared(gru_cell->output(0)), + std::make_shared(unsqueeze)}; + auto body = std::make_shared(results, bodyParams, "gru_cell"); + tensorIterator->set_function(body); + + tensorIterator->set_sliced_input(bodyParams[0], outerParams[0], -1, -1, 1, 0, seqAxis); + tensorIterator->get_concatenated_slices(results[1], -1, -1, 1, 0, seqAxis); + + tensorIterator->set_merged_input(bodyParams[1], outerParams[1], results[0]); + tensorIterator->get_iter_value(results[0]); + + return std::make_shared(ov::OutputVector{tensorIterator->output(0), tensorIterator->output(1)}, + outerParams); + } + + static void checkContext(CudaGraphContext& cudaGraphContext) { + // TI has always a separate graph in CudaGraphContext + // Single-graph TI version uses CudaGraphInfo object with 1 graph + // Total graph count should be 3 + EXPECT_EQ(cudaGraphContext.get_graphs_count(), 3); + EXPECT_TRUE(cudaGraphContext.is_nested()); + + cudaGraphContext.select_current_graph(1); + const auto& tiGraph = cudaGraphContext.get_current_graph(); + EXPECT_FALSE(tiGraph.is_nested()); + EXPECT_EQ(tiGraph.get_graphs_count(), 1); + } + + static void checkRunner(const CudaGraphTopologyRunner& runner) { + // CudaGraphTopologyRunner always puts a TI into a separate SubGraph + // Single-graph TI version doesn't use nested CudaGraphTopologyRunner objects and uses 1 graph + // Total graph count should be 3 + EXPECT_EQ(runner.GetSubGraph().GetCudaGraphCompatibility(), CudaGraphCompatibility::SPECIAL); + EXPECT_EQ(runner.GetCudaGraphsCount(), 3); + EXPECT_FALSE(runner.hasNestedRunners()); + } +}; + +class SplitConcatAddTI { +public: + static void createNetworkInternal(std::shared_ptr& model) { + constexpr size_t 
seqLengths = 20; + constexpr size_t batch = 1; + constexpr size_t inputSize = 10; + constexpr size_t seqAxis = 1; + constexpr float clip = 0.0; + ov::element::Type ngPrc = CALC_ELEMENT_TYPE; + + auto tensorIterator = std::make_shared(); + auto axisConstant = std::make_shared( + ov::element::Type_t::i64, ov::Shape{1}, std::vector{static_cast(seqAxis)}); + std::vector outerShape = {{batch, seqLengths, inputSize}}; + std::vector> bodyShapes; + for (std::size_t i = 0; i < INPUTS_COUNT; ++i) { + bodyShapes.emplace_back(std::vector{batch, 1, inputSize}); + } + ov::ParameterVector outerParams; + outerParams.emplace_back(std::make_shared(ngPrc, ov::Shape{outerShape})); + for (std::size_t i = 1; i < INPUTS_COUNT; ++i) { + outerParams.emplace_back(std::make_shared(ngPrc, ov::Shape{bodyShapes[i]})); + } + + for (std::size_t i = 0; i < INPUTS_COUNT; ++i) { + ASSERT_EQ(outerShape.size(), bodyShapes[i].size()); + for (std::size_t j = 0; j < bodyShapes[i].size(); ++j) { + if (j == seqAxis) { + ASSERT_EQ(bodyShapes[i][j], 1); + ASSERT_EQ(outerShape[j], seqLengths); + } else { + ASSERT_EQ(bodyShapes[i][j], outerShape[j]); + } + } + } + ov::ParameterVector bodyParams; + for (std::size_t i = 0; i < INPUTS_COUNT; ++i) { + bodyParams.emplace_back(std::make_shared(ngPrc, ov::Shape{bodyShapes[i]})); + } + + auto squeeze = std::make_shared(bodyParams[0], axisConstant); + const auto split = ngraph::builder::makeSplit(squeeze, CALC_ELEMENT_TYPE, 2, 1); + const auto concat = + std::make_shared(ov::OutputVector{split->output(0), split->output(1)}, 1); + const auto add0 = ngraph::builder::makeEltwise(concat->output(0), bodyParams[1], EltwiseTypes::ADD); + + auto unsqueeze = std::make_shared(add0->output(0), axisConstant); + ov::ResultVector results{std::make_shared(add0->output(0)), + std::make_shared(unsqueeze)}; + + auto body = std::make_shared(results, bodyParams, "AddConcat"); + tensorIterator->set_function(body); + + tensorIterator->set_sliced_input(bodyParams[0], outerParams[0], -1, -1, 1, 0, seqAxis); + tensorIterator->get_concatenated_slices(results[1], -1, -1, 1, 0, seqAxis); + + tensorIterator->set_merged_input(bodyParams[1], outerParams[1], results[0]); + tensorIterator->get_iter_value(results[0]); + + model = std::make_shared(ov::OutputVector{tensorIterator->output(0), tensorIterator->output(1)}, + outerParams); + } + + static std::shared_ptr createNetwork() { + std::shared_ptr model; + createNetworkInternal(model); + return model; + } + + static void checkContext(CudaGraphContext& cudaGraphContext) { + // TI has always a separate graph in CudaGraphContext + // Multi-graph TI version uses CudaGraphPack object with 3 graphs + // Total graph count should be 5 + EXPECT_EQ(cudaGraphContext.get_graphs_count(), 5); + EXPECT_TRUE(cudaGraphContext.is_nested()); + cudaGraphContext.select_current_graph(1); + const auto& tiGraph = cudaGraphContext.get_current_graph(); + EXPECT_TRUE(tiGraph.is_nested()); + EXPECT_EQ(tiGraph.get_graphs_count(), 3); + } + + static void checkRunner(const CudaGraphTopologyRunner& runner) { + // CudaGraphTopologyRunner always puts a TI into a separate SubGraph + // Multi-graph TI version uses nested CudaGraphTopologyRunner and uses 3 graphs + // Total graph count should be 5 + EXPECT_EQ(runner.GetSubGraph().GetCudaGraphCompatibility(), CudaGraphCompatibility::SPECIAL); + EXPECT_EQ(runner.GetCudaGraphsCount(), 5); + EXPECT_TRUE(runner.hasNestedRunners()); + } +}; + +template +class CudaMultiGraphTest : public Test { +protected: + static std::map populateInputIndices(std::shared_ptr model) { 
+ std::map inputIndices; + for (const auto& parameter : model->get_parameters()) { + const auto& parameter_index = model->get_parameter_index(parameter); + inputIndices.emplace(ParameterOp::GetInputTensorName(*parameter), parameter_index); + } + return inputIndices; + } + + static std::map populateOutputIndices(std::shared_ptr model) { + std::map outputIndices; + for (auto& result : model->get_results()) { + const auto& result_index = model->get_result_index(result->input_value(0)); + for (const auto& outputName : ResultOp::GetOutputTensorName(*result)) { + outputIndices.emplace(outputName, result_index); + } + } + return outputIndices; + } + + static std::vector> populateTensors(const std::vector>& nodes) { + std::vector> result; + for (const auto& node : nodes) { + result.push_back(std::make_shared(node.get_element_type(), node.get_shape())); + } + return result; + } + + void generateInputs() { + for (auto& input : inputTensors_) { + generateInput(*input, TO, FROM, currentSeed_); + ++currentSeed_; + } + } + + void updateContext() { runner_.UpdateContext(*inferRequestContext_, deviceMemBlock_); } + + void checkConditions() { + Network::checkContext(cudaGraphContext_); + Network::checkRunner(runner_); + } + + void run() { runner_.Run(*inferRequestContext_, deviceMemBlock_); } + + void calcRefs() { refOutputs_ = ::calcRefs(model_, inputTensors_); } + + void validate(float threshold = THRESHOLD) { + const auto size = outputTensors_.size(); + EXPECT_EQ(size, refOutputs_.size()); + for (std::size_t i = 0; i < size; ++i) { + validateOutput(*outputTensors_[i], refOutputs_[i], THRESHOLD); + } + } + + void updateTensors() { + inputTensors_ = {populateTensors(model_->inputs())}; + outputTensors_ = {populateTensors(model_->outputs())}; + inferRequestContext_ = std::make_unique(inputTensors_, + inputIndices_, + outputTensors_, + outputIndices_, + threadContext_, + cancellationToken_, + simpleExecutionDelegator_, + cudaGraphContext_, + false); + } + + void runTest() { + generateInputs(); + updateContext(); + checkConditions(); + run(); + calcRefs(); + validate(); + + updateTensors(); + generateInputs(); + updateContext(); + checkConditions(); + run(); + calcRefs(); + validate(); + } + + std::shared_ptr model_{Network::createNetwork()}; + CreationContext creationContext_{{}, false}; + ThreadContext threadContext_{{}}; + CancellationToken cancellationToken_{}; + CudaGraphContext cudaGraphContext_{}; + CudaGraphTopologyRunner runner_{creationContext_, model_}; + SimpleExecutionDelegator simpleExecutionDelegator_{}; + std::vector> inputTensors_{populateTensors(model_->inputs())}; + std::vector> outputTensors_{populateTensors(model_->outputs())}; + std::map inputIndices_{populateInputIndices(model_)}; + std::map outputIndices_{populateOutputIndices(model_)}; + std::unique_ptr inferRequestContext_ = + std::make_unique(inputTensors_, + inputIndices_, + outputTensors_, + outputIndices_, + threadContext_, + cancellationToken_, + simpleExecutionDelegator_, + cudaGraphContext_, + false); + DeviceMemBlock deviceMemBlock_{runner_.GetSubGraph().memoryManager()->mutableTensorsMemoryModel()}; + + std::vector> refOutputs_; + int currentSeed_ = SEED; +}; + +using GRUTIMultiGraphTest = CudaMultiGraphTest; + +TEST_F(GRUTIMultiGraphTest, CudaMultiGraphTest) { runTest(); } + +using SplitConcatAddTIMultiGraphTest = CudaMultiGraphTest; + +TEST_F(SplitConcatAddTIMultiGraphTest, CudaMultiGraphTest) { runTest(); }
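
For reference, the folding rule that SubGraph::GetCudaGraphCompatibility() applies to its execution sequence (see the subgraph.cpp hunk earlier in this patch) can be reproduced as a minimal standalone sketch. The aggregate() helper below is hypothetical and not part of the plugin; it only mirrors the behaviour shown above: NONE anywhere disables graph usage for the whole sub-graph, any SPECIAL node demotes FULL to SPECIAL, and FULL is the default otherwise.

#include <initializer_list>
#include <iostream>

enum class CudaGraphCompatibility { NONE, FULL, SPECIAL };

// Hypothetical helper mirroring the sub-graph compatibility folding rule.
CudaGraphCompatibility aggregate(std::initializer_list<CudaGraphCompatibility> ops) {
    auto result = CudaGraphCompatibility::FULL;
    for (const auto c : ops) {
        if (c == CudaGraphCompatibility::NONE) {
            return CudaGraphCompatibility::NONE;  // one incompatible op disables the graph
        }
        if (c == CudaGraphCompatibility::SPECIAL) {
            result = CudaGraphCompatibility::SPECIAL;  // special ops need special handling
        }
    }
    return result;
}

int main() {
    const auto r = aggregate({CudaGraphCompatibility::FULL, CudaGraphCompatibility::SPECIAL});
    std::cout << (r == CudaGraphCompatibility::SPECIAL) << std::endl;  // prints 1
    return 0;
}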
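
The TensorIterator changes above rely on capturing the loop body once and then re-launching the same executable graph with per-iteration kernel arguments (the slice/insert offsets). A minimal sketch of that pattern using the raw CUDA runtime API is shown below; the kernel, buffer sizes and variable names are invented for illustration, error checking is omitted, and this is not the plugin's CudaGraphInfo implementation, only the underlying capture-then-update idea.

#include <cuda_runtime.h>
#include <vector>

// Illustrative kernel: copy one slice of the input starting at 'start'.
__global__ void sliceKernel(const float* src, float* dst, size_t start, size_t n) {
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = src[start + i];
    }
}

int main() {
    constexpr int iterations = 16;
    size_t n = 256;
    float *src = nullptr, *dst = nullptr;
    cudaMalloc(&src, iterations * n * sizeof(float));
    cudaMalloc(&dst, n * sizeof(float));
    cudaMemset(src, 0, iterations * n * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Capture a single iteration of the body (analogous to CaptureSingle()).
    size_t start = 0;
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    sliceKernel<<<1, static_cast<unsigned>(n), 0, stream>>>(src, dst, start, n);
    cudaStreamEndCapture(stream, &graph);

    cudaGraphExec_t graphExec;
    cudaGraphInstantiateWithFlags(&graphExec, graph, 0);

    // Keep a handle to the captured kernel node; the plugin keeps similar handles
    // when SliceLauncher::addKernelNode() registers a kernel with the graph info.
    size_t nodeCount = 0;
    cudaGraphGetNodes(graph, nullptr, &nodeCount);
    std::vector<cudaGraphNode_t> nodes(nodeCount);
    cudaGraphGetNodes(graph, nodes.data(), &nodeCount);

    // Re-launch the instantiated graph every iteration, patching only the slice
    // offset, which is what updateKernelNode()/ExecuteGraphSingle() do per iteration.
    for (int iter = 0; iter < iterations; ++iter) {
        start = static_cast<size_t>(iter) * n;
        void* args[] = {&src, &dst, &start, &n};
        cudaKernelNodeParams params{};
        params.func = reinterpret_cast<void*>(sliceKernel);
        params.gridDim = dim3(1);
        params.blockDim = dim3(static_cast<unsigned>(n));
        params.sharedMemBytes = 0;
        params.kernelParams = args;
        cudaGraphExecKernelNodeSetParams(graphExec, nodes[0], &params);
        cudaGraphLaunch(graphExec, stream);
    }
    cudaStreamSynchronize(stream);

    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFree(src);
    cudaFree(dst);
    return 0;
}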