Skip to content

Commit

Permalink
[NVIDIA] TensorIterator Body as Multiple CUDA Graphs (openvinotoolkit#808)
Browse files Browse the repository at this point in the history

* [NVIDIA] Add operator==/!= to DevicePointer

* [NVIDIA] Add CUDA::NodeParams, CUDA::TransferNode, CUDA::KernelNode

* [NVIDIA] Add kernel args getters for Insert/Slice

* [NVIDIA] Add KernelNodeTest and TransferNodeTest

* [NVIDIA] Fix review issues

* [NVIDIA] Add launchers to TI, refactor Execute()

* [NVIDIA] Add TiCudaGraphInfo

* [NVIDIA] Update TI to support CUDA graph as a body of iterations loop

* [NVIDIA] Add operator== for dim3, KernelNode and NodeParams

* [NVIDIA] Update Run() of *TopologyRunners to take non-const context reference

* [NVIDIA] Remove TiCudaGraphInfo, add set_current_graph(), add_new_graph_info(), get_current_graph_info(), select_current_graph()

* [NVIDIA] Change IsCudaGraphCompatible() interface to GetCudaGraphCompatibility() using enum

* [NVIDIA] Add ExecuteGraph() to IOperationExec/OperationBase

* [NVIDIA] Remove paramsGraph_/resultsGraph_ from CudaGraphInfo

* [NVIDIA] Add multi-graph support for TI

* [NVIDIA] Add multi-graph TI tests

* [NVIDIA] Update CudaGraphTopologyRunnerTest

---------

Co-authored-by: Pavel Durandin <pavel.durandin@intel.com>
  • Loading branch information
apavliuk55 and p-durandin authored Jan 23, 2024
1 parent 09c02a5 commit 63a55e1
Show file tree
Hide file tree
Showing 127 changed files with 1,654 additions and 463 deletions.
3 changes: 3 additions & 0 deletions modules/nvidia_plugin/src/cuda/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,7 @@ bool CUDA::TransferNode::operator==(const TransferNode& rhs) const {
return size_ == rhs.size_ && src_.get() == rhs.src_.get() && dst_.get() == rhs.dst_.get() && node_ == rhs.node_;
}

// Two kernel nodes compare equal when they wrap the same cudaGraphNode_t
// handle and carry identical launch parameters.
bool KernelNode::operator==(const KernelNode& rhs) const {
    if (node_ != rhs.node_) {
        return false;
    }
    return node_params_ == rhs.node_params_;
}
} // namespace CUDA
2 changes: 2 additions & 0 deletions modules/nvidia_plugin/src/cuda/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ class KernelNode {
throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, &node_params_.get_knp()));
}

bool operator==(const KernelNode& rhs) const;

private:
KernelNode(cudaGraphNode_t node, CUDA::NodeParams&& params);

Expand Down
9 changes: 9 additions & 0 deletions modules/nvidia_plugin/src/cuda/node_params.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <cuda_runtime_api.h>

#include <cuda/utils.hpp>
#include <vector>

namespace CUDA {
Expand Down Expand Up @@ -33,9 +34,17 @@ struct NodeParams {

void reset_args() { ptrs_.clear(); }

friend bool operator==(const NodeParams& lhs, const NodeParams& rhs);

private:
std::vector<void*> ptrs_;
cudaKernelNodeParams knp_;
};

// Equality for NodeParams: same kernel function, same launch geometry, same
// shared-memory size, same `extra` options and the same argument pointers.
// `knp_.kernelParams` is not compared directly — presumably it points into
// `ptrs_`, which is compared element-wise instead (TODO confirm against the
// args setters).
// Fix: every term now reads `lhs.X == rhs.X`; the original swapped the
// operand order between terms, which was inconsistent and bug-prone to edit.
inline bool operator==(const NodeParams& lhs, const NodeParams& rhs) {
    return lhs.ptrs_ == rhs.ptrs_ && lhs.knp_.func == rhs.knp_.func && lhs.knp_.gridDim == rhs.knp_.gridDim &&
           lhs.knp_.blockDim == rhs.knp_.blockDim && lhs.knp_.sharedMemBytes == rhs.knp_.sharedMemBytes &&
           lhs.knp_.extra == rhs.knp_.extra;
}

} // namespace CUDA
15 changes: 15 additions & 0 deletions modules/nvidia_plugin/src/cuda/utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright (C) 2020-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cuda_runtime_api.h>

namespace CUDA {

inline bool operator==(dim3 rhs, dim3 lhs) { return rhs.x == lhs.x && rhs.y == lhs.y && rhs.z == lhs.z; }

inline bool operator!=(dim3 rhs, dim3 lhs) { return !(rhs == lhs); }

} // namespace CUDA
24 changes: 0 additions & 24 deletions modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp

This file was deleted.

17 changes: 14 additions & 3 deletions modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,23 @@ namespace nvidia_gpu {

// Topology runner that executes the whole model eagerly, without CUDA-graph
// capture: Run() delegates straight to SubGraph::Execute().
// NOTE(review): this span is a diff rendering with the +/- markers stripped,
// so some members appear twice (old out-of-line declaration followed by the
// new inline definition). The residue lines are flagged below.
class EagerTopologyRunner final : public SubGraph, public ITopologyRunner {
public:
// NOTE(review): removed-diff residue — superseded by the inline ctor below.
EagerTopologyRunner(const CreationContext& context, const std::shared_ptr<const ov::Model>& model);
// Builds the sub-graph for the whole model; no extra state is kept here.
EagerTopologyRunner(const CreationContext& context, const std::shared_ptr<const ov::Model>& model) : SubGraph(context, model) {}
~EagerTopologyRunner() override = default;

// NOTE(review): removed-diff residue — old const-context overload of Run().
void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
// Runs the whole topology in one shot: the device memory block is exposed as
// a single mutable workbuffer and execution is delegated to the sub-graph.
void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override {
Workbuffers workbuffers{};
workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data());
SubGraph::Execute(context, {}, {}, workbuffers);
}

// Empty body — presumably eager mode has no graph-based Run/Capture/Update
// path; verify against the ITopologyRunner contract.
void Run(InferenceRequestContext& context, const Workbuffers& workbuffers) const override{};

void Capture(InferenceRequestContext& context, const Workbuffers& workbuffers) const override{};
void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override{};
// NOTE(review): removed-diff residue — old out-of-line GetSubGraph() decl.
const SubGraph& GetSubGraph() const override;

// The runner itself is the (single) sub-graph.
const SubGraph& GetSubGraph() const override { return *this; }

// Eager execution uses no CUDA graphs.
std::size_t GetCudaGraphsCount() const override { return 0; }
};

} // namespace nvidia_gpu
Expand Down
179 changes: 102 additions & 77 deletions modules/nvidia_plugin/src/cuda_graph_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,126 +7,151 @@
namespace ov {
namespace nvidia_gpu {

void CudaGraphContext::reset() {
graphs_.clear();
currentGraphIndex_ = 0;
// Returns the info object to its pristine state: drops the captured graph,
// its executable instance, and every node handle recorded during capture.
void CudaGraphInfo::reset() {
    graph_.reset();
    graphExec_.reset();
    kernelNodes_.clear();
    transferNodes_.clear();
    resultNodes_.clear();
    parameterNodes_.clear();
}

void CudaGraphContext::start_next_graph_addition() {
currentGraphIndex_ = graphs_.size();
graphs_.emplace_back();
// Records a host->device upload node for the named input tensor so it can be
// re-pointed later via update_capture().
void CudaGraphInfo::add_parameter(const std::string& tensorName,
                                  const CUDA::Stream& stream,
                                  CUDA::DevicePointer<void*> dst,
                                  const void* src,
                                  std::size_t size) {
    parameterNodes_.emplace(tensorName, CUDA::CaptureInfo{stream}.addUploadNode(dst, src, size));
}

// Records a device->host download node for the named output tensor.
void CudaGraphInfo::add_result(const std::string& tensorName,
                               const CUDA::Stream& stream,
                               void* dst,
                               CUDA::DevicePointer<const void*> src,
                               std::size_t size) {
    resultNodes_.emplace(tensorName, CUDA::CaptureInfo{stream}.addDownloadNode(dst, src, size));
}

// Records a device-to-device copy node; transfers carry no tensor name, so
// they are kept in a plain sequence rather than a map.
void CudaGraphInfo::add_transfer(const CUDA::Stream& stream,
                                 CUDA::DevicePointer<void*> dst,
                                 CUDA::DevicePointer<const void*> src,
                                 std::size_t size) {
    transferNodes_.emplace_back(CUDA::CaptureInfo{stream}.addTransferNode(dst, src, size));
}

bool CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); }

// Re-points every recorded upload node at the current input tensor data and
// every download node at the current output tensor data, patching the
// already-instantiated executable graph in place.
void CudaGraphInfo::update_capture(const TensorMappingContext& context) {
    for (auto& entry : parameterNodes_) {
        entry.second.update_src(graphExec_.value(), context.get_input_tensor(entry.first)->data());
    }
    for (auto& entry : resultNodes_) {
        entry.second.update_dst(graphExec_.value(), context.get_output_tensor(entry.first)->data());
    }
}

std::size_t CudaGraphInfo::get_graphs_count() const { return is_initialized() ? 1 : 0; }

void CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); }

// Clears the whole pack and rewinds the cursor to the first slot.
void CudaGraphPack::reset() {
    currentGraphIndex_ = 0;
    graphs_.clear();
}

void CudaGraphContext::add_parameter(const std::string& tensorName,
void CudaGraphPack::add_parameter(const std::string& tensorName,
const CUDA::Stream& stream,
CUDA::DevicePointer<void*> dst,
const void* src,
std::size_t size) {
OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency");
graphs_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size);
graphs_[currentGraphIndex_]->add_parameter(tensorName, stream, dst, src, size);
}

void CudaGraphContext::add_result(const std::string& tensorName,
void CudaGraphPack::add_result(const std::string& tensorName,
const CUDA::Stream& stream,
void* dst,
CUDA::DevicePointer<const void*> src,
std::size_t size) {
OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency");
graphs_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size);
graphs_[currentGraphIndex_]->add_result(tensorName, stream, dst, src, size);
}

// Forwards the transfer-node registration to the graph currently under
// construction.
void CudaGraphPack::add_transfer(const CUDA::Stream& stream,
                                 CUDA::DevicePointer<void*> dst,
                                 CUDA::DevicePointer<const void*> src,
                                 std::size_t size) {
    auto& current = graphs_[currentGraphIndex_];
    current->add_transfer(stream, dst, src, size);
}

void CudaGraphContext::add_graph(const CUDA::Graph& graph) {
void CudaGraphPack::set_current_graph(const CUDA::Graph& graph) {
OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency");
graphs_[currentGraphIndex_].set_graph(graph);
graphs_[currentGraphIndex_]->set_current_graph(graph);
}

bool CudaGraphContext::is_initialized() const {
bool CudaGraphPack::is_initialized() const {
const auto size = graphs_.size();
return size != 0 && graphs_[size - 1].is_initialized();
return size != 0 && graphs_[size - 1]->is_initialized();
}

void CudaGraphContext::update_capture(const TensorMappingContext& context) {
void CudaGraphPack::update_capture(const TensorMappingContext& context) {
for (currentGraphIndex_ = 0; currentGraphIndex_ < graphs_.size(); ++currentGraphIndex_) {
graphs_[currentGraphIndex_].update_capture(context);
}
}

void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const {
currentGraphIndex_ = index;
OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency");
graphs_[currentGraphIndex_].launch(stream);
}

std::size_t CudaGraphContext::get_params_count() const {
std::size_t res = 0;
for (const auto& graph : graphs_) {
res += graph.get_params_count();
graphs_[currentGraphIndex_]->update_capture(context);
}
return res;
}

std::size_t CudaGraphContext::get_results_count() const {
std::size_t res = 0;
for (const auto& graph : graphs_) {
res += graph.get_results_count();
}
return res;
// Appends a new graph info to the pack, makes it the current one, and
// returns a reference to it.
// @param ptr shared ownership of the info to append (taken by value).
// @return reference to the newly appended element.
ICudaGraphInfo& CudaGraphPack::add(std::shared_ptr<ICudaGraphInfo> ptr) {
    currentGraphIndex_ = graphs_.size();
    // Move the by-value shared_ptr into the vector instead of copying it:
    // avoids a needless atomic ref-count increment/decrement pair.
    graphs_.emplace_back(std::move(ptr));
    return *graphs_.back();
}

std::size_t CudaGraphContext::get_graphs_count() const { return graphs_.size(); }
ICudaGraphInfo& CudaGraphPack::get_current_graph() { return *graphs_[currentGraphIndex_]; }

void CudaGraphContext::CudaGraphInfo::add_parameter(const std::string& tensorName,
const CUDA::Stream& stream,
CUDA::DevicePointer<void*> dst,
const void* src,
std::size_t size) {
CUDA::CaptureInfo captureInfo{stream};
parameterNodes_.emplace(tensorName, captureInfo.addUploadNode(dst, src, size));
// Moves the pack's cursor to the graph at `index`.
// @throws via OPENVINO_ASSERT when `index` is out of range.
void CudaGraphPack::select_current_graph(std::size_t index) {
    // Fix: corrected the typo "incosistency" in the assert message.
    OPENVINO_ASSERT(index < graphs_.size(), "Graph index/vector size inconsistency");
    currentGraphIndex_ = index;
}

void CudaGraphContext::CudaGraphInfo::add_result(const std::string& tensorName,
const CUDA::Stream& stream,
void* dst,
CUDA::DevicePointer<const void*> src,
std::size_t size) {
CUDA::CaptureInfo captureInfo{stream};
resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size));
// Total number of parameter (upload) nodes across all graphs in the pack.
std::size_t CudaGraphPack::get_params_count() const {
    std::size_t total = 0;
    for (const auto& graph : graphs_) {
        total += graph->get_params_count();
    }
    return total;
}

void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) {
graph_.emplace(graph);
graphExec_.emplace(graph);
// Total number of result (download) nodes across all graphs in the pack.
std::size_t CudaGraphPack::get_results_count() const {
    std::size_t total = 0;
    for (const auto& graph : graphs_) {
        total += graph->get_results_count();
    }
    return total;
}

bool CudaGraphContext::CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); }

void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext& context) {
for (auto&& [tensorName, node] : parameterNodes_) {
node.update_src(graphExec_.value(), (context.get_input_tensor(tensorName)->data()));
}
for (auto&& [tensorName, node] : resultNodes_) {
node.update_dst(graphExec_.value(), context.get_output_tensor(tensorName)->data());
}
// Total number of device-to-device transfer nodes across all graphs.
std::size_t CudaGraphPack::get_transfers_count() const {
    std::size_t total = 0;
    for (const auto& graph : graphs_) {
        total += graph->get_transfers_count();
    }
    return total;
}

void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); }

std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); }

std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return resultNodes_.size(); }

bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ &&
lhs.resultNodes_ == rhs.resultNodes_;
// Total number of kernel nodes across all graphs in the pack.
std::size_t CudaGraphPack::get_kernels_count() const {
    std::size_t total = 0;
    for (const auto& graph : graphs_) {
        total += graph->get_kernels_count();
    }
    return total;
}

bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
return !(lhs == rhs);
// Total number of initialized graphs, summed recursively over the children
// (each child reports its own count).
std::size_t CudaGraphPack::get_graphs_count() const {
    std::size_t total = 0;
    for (const auto& graph : graphs_) {
        total += graph->get_graphs_count();
    }
    return total;
}

bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return lhs.graphs_ == rhs.graphs_; }

bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return !(lhs == rhs); }
void CudaGraphPack::launch(const CUDA::Stream& stream) const { graphs_[currentGraphIndex_]->launch(stream); }

} // namespace nvidia_gpu
} // namespace ov
Loading

0 comments on commit 63a55e1

Please sign in to comment.