
Remove memory copy between TensorRT and CUDA #1561

Merged
merged 23 commits
Aug 9, 2019
Changes from 18 commits
Commits
23 commits
35c9e96
remove memory copy between CUDA and TRT
stevenlix Aug 1, 2019
c66fbe8
add info to RegisterExecutionProvider input
stevenlix Aug 1, 2019
bb6e268
use new IDeviceAllocator for trt allocator
stevenlix Aug 1, 2019
3cad0ba
remove SetDefaultInputsMemoryType from TRT EP
stevenlix Aug 1, 2019
a5d1893
Merge branch 'master' into stevenlix/trtoverhead
stevenlix Aug 1, 2019
231cf0b
remove onnx-tensorrt 5.0
stevenlix Aug 1, 2019
a7f534e
add submodule onnx-tensorrt branch 5.1
stevenlix Aug 2, 2019
4d7812f
remove redundancy
stevenlix Aug 5, 2019
7aab699
Update transformer_memcpy.cc
stevenlix Aug 5, 2019
7f306ff
Update tensorrt_execution_provider.cc
stevenlix Aug 5, 2019
5cc3e1c
switch to TensorRT 5.1.5.0
stevenlix Aug 5, 2019
7cbd863
Merge branch 'stevenlix/trtoverhead' of https://github.com/Microsoft/…
stevenlix Aug 5, 2019
196386a
update python binding
stevenlix Aug 6, 2019
5c2d5e5
disable failed test case on TensorRT
stevenlix Aug 6, 2019
bd604e9
Merge branch 'master' into stevenlix/trtoverhead
stevenlix Aug 6, 2019
96c7560
Update activation_op_test.cc
stevenlix Aug 6, 2019
62ec7cc
upgrade to TensorRT container 19.06
stevenlix Aug 6, 2019
40240d0
Merge branch 'stevenlix/trtoverhead' of https://github.com/Microsoft/…
stevenlix Aug 6, 2019
3bee98d
update according to feedback
stevenlix Aug 8, 2019
94923d7
add comments
stevenlix Aug 8, 2019
5d720b0
remove tensorrt allocator and use cuda(gpu) allocator
stevenlix Aug 8, 2019
b9cb8f1
update onnx-tensorrt submodule
stevenlix Aug 8, 2019
f153d91
change ci build cuda directory name
stevenlix Aug 8, 2019
8 changes: 4 additions & 4 deletions .gitmodules
@@ -25,10 +25,6 @@
[submodule "cmake/external/re2"]
path = cmake/external/re2
url = https://github.com/google/re2.git
[submodule "cmake/external/onnx-tensorrt"]
path = cmake/external/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt.git
branch = v5.0
[submodule "cmake/external/eigen"]
path = cmake/external/eigen
url = https://github.com/eigenteam/eigen-git-mirror.git
@@ -41,3 +37,7 @@
[submodule "cmake/external/spdlog"]
path = cmake/external/spdlog
url = https://github.com/gabime/spdlog.git
[submodule "cmake/external/onnx-tensorrt"]
path = cmake/external/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt.git
branch = 5.1
@@ -7,7 +7,7 @@
extern "C" {
#endif

ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtSessionOptions* options);
ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtSessionOptions* options, int device_id);

#ifdef __cplusplus
}
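For reference, the factory function now takes an explicit CUDA device id. A minimal caller-side sketch, assuming the C API of this era (session-options creation and the exact provider-factory header path are not shown in this diff and may differ by version):

// Hypothetical usage sketch; only the appended device_id argument is new in this PR.
void AppendTensorrtEp(OrtSessionOptions* session_options) {
  OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_Tensorrt(session_options, /*device_id*/ 0);
  if (status != nullptr) {
    // Report the failure and release the status object (e.g. via OrtReleaseStatus).
  }
}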
3 changes: 1 addition & 2 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -2,7 +2,6 @@
// Licensed under the MIT License.

#include "core/framework/graph_partitioner.h"

#include "core/framework/kernel_registry_manager.h"
#include "core/graph/function.h"
#include "core/graph/graph_viewer.h"
@@ -176,7 +175,7 @@ Status GraphPartitioner::Partition(Graph& graph, bool export_dll, FuncManager& f
//prepare the func kernel
KernelDefBuilder builder;
BuildFusedKernelDef(builder, *node);
if (node->GetExecutionProviderType() == onnxruntime::kTensorrtExecutionProvider || node->GetExecutionProviderType() == onnxruntime::kNGraphExecutionProvider || node->GetExecutionProviderType() == onnxruntime::kNnapiExecutionProvider) {
if (node->GetExecutionProviderType() == onnxruntime::kNGraphExecutionProvider || node->GetExecutionProviderType() == onnxruntime::kNnapiExecutionProvider) {
builder.SetDefaultInputsMemoryType(OrtMemTypeCPUInput);
builder.SetDefaultOutputMemoryType(OrtMemTypeCPUOutput);
}
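The effect of dropping kTensorrtExecutionProvider from this branch is that fused TensorRT kernels keep the default (device) memory types for their inputs and outputs instead of being pinned to CPU memory. An illustrative before/after contrast of the same builder calls (not part of the diff, shown only to make the change explicit):

KernelDefBuilder builder;
BuildFusedKernelDef(builder, *node);
// Before this PR, TRT fused nodes also fell into the branch above, forcing
// host<->device copies around every fused TensorRT subgraph:
//   builder.SetDefaultInputsMemoryType(OrtMemTypeCPUInput);
//   builder.SetDefaultOutputMemoryType(OrtMemTypeCPUOutput);
// After this PR, TRT fused nodes skip it, so their tensors stay in GPU memory.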
4 changes: 2 additions & 2 deletions onnxruntime/core/framework/utils.cc
@@ -121,8 +121,8 @@ common::Status CopyOneInputAcrossDevices(const SessionState& session_state, cons
ORT_ENFORCE(p_input_provider);
}

//no copy for TRT and nGraph
if (required_provider_type == onnxruntime::kTensorrtExecutionProvider || required_provider_type == onnxruntime::kNGraphExecutionProvider) {
//no copy for nGraph
if (required_provider_type == onnxruntime::kNGraphExecutionProvider) {
new_mlvalue = orig_mlvalue;
break;
}
11 changes: 6 additions & 5 deletions onnxruntime/core/optimizer/transformer_memcpy.cc
@@ -52,7 +52,7 @@ class TransformerMemcpyImpl {
std::string provider_;
};

/** Helper that returns a pointer to the corresponding TensorProto for a name if it is an initializer.
/** Helper that returns a pointer to the corresponding TensorProto for a name if it is an initializer.
@param check_outer_scope If true and the graph is a subgraph, check parent graph/s for 'name' if not found in 'graph'.
*/
static const onnx::TensorProto* GetInitializer(const Graph& graph, const std::string& name, bool check_outer_scope) {
@@ -73,7 +73,6 @@ common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int gr
provider != onnxruntime::kMklDnnExecutionProvider &&
provider != onnxruntime::kNGraphExecutionProvider &&
provider != onnxruntime::kNupharExecutionProvider &&
provider != onnxruntime::kTensorrtExecutionProvider &&
provider != onnxruntime::kOpenVINOExecutionProvider) {
TransformerMemcpyImpl copy_impl(graph, provider);
auto current_modified = copy_impl.ModifyGraph(registry_manager_);
@@ -100,7 +99,7 @@ common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int gr

Overview: The transformer transforms the input graph as follows:

(1) For every initializer W that is referenced by both provider and non-provider nodes,
(1) For every initializer W that is referenced by both provider and non-provider nodes,
we create a duplicate initializer W2 and change all provider nodes to reference this
duplicate copy.

@@ -167,7 +166,9 @@ bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_regi
}

void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries, InitializedTensorSet& initializers_consumed) {
if (node.GetExecutionProviderType() == provider_) {
if (node.GetExecutionProviderType() == provider_
|| (node.GetExecutionProviderType() == kCudaExecutionProvider && provider_ == kTensorrtExecutionProvider)
|| (node.GetExecutionProviderType() == kTensorrtExecutionProvider && provider_ == kCudaExecutionProvider)) {
provider_nodes_.insert(&node);
// note KernelCreateInfo might be nullptr for custom kernel
const KernelCreateInfo* kci = nullptr;
@@ -206,7 +207,7 @@ void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelReg
}
} else {
// TODO: copy between devices? i.e. multiple GPUs
if (node.GetExecutionProviderType() != onnxruntime::kCpuExecutionProvider && node.GetExecutionProviderType() != onnxruntime::kTensorrtExecutionProvider &&
if (node.GetExecutionProviderType() != onnxruntime::kCpuExecutionProvider &&
node.GetExecutionProviderType() != onnxruntime::kNGraphExecutionProvider && !node.GetExecutionProviderType().empty()) {
ORT_THROW("Execution type '", node.GetExecutionProviderType(), "' doesn't support memcpy ");
}
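The core of this change is that TensorRT and CUDA nodes are now treated as one GPU memory domain, so the transformer no longer inserts MemcpyFromHost/MemcpyToHost nodes between them. A hypothetical helper that restates the condition inlined in ProcessDefs above (the helper name and free-standing form are illustrative only):

#include <string>

// kCudaExecutionProvider / kTensorrtExecutionProvider are the provider-name
// constants already used elsewhere in onnxruntime.
static bool InSameGpuMemoryDomain(const std::string& node_ep, const std::string& transformer_ep) {
  if (node_ep == transformer_ep) return true;
  return (node_ep == onnxruntime::kCudaExecutionProvider && transformer_ep == onnxruntime::kTensorrtExecutionProvider) ||
         (node_ep == onnxruntime::kTensorrtExecutionProvider && transformer_ep == onnxruntime::kCudaExecutionProvider);
}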
72 changes: 72 additions & 0 deletions onnxruntime/core/providers/tensorrt/tensorrt_allocator.cc
@@ -0,0 +1,72 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "tensorrt_allocator.h"
#include "core/providers/cuda/cuda_common.h"
#include "core/framework/allocatormgr.h"
#include "core/framework/session_state.h"
#include "core/providers/cuda/cuda_fence.h"
#include "core/providers/cuda/gpu_data_transfer.h"

namespace onnxruntime {

static const GPUDataTransfer* GetGPUDataTransfer(const SessionState* session_state) {
OrtDevice gpu_device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0);
OrtDevice cpu_device;
return dynamic_cast<const GPUDataTransfer*>(session_state->GetDataTransferMgr().GetDataTransfer(gpu_device, cpu_device));
}

void TensorrtAllocator::CheckDevice() const {
#ifndef NDEBUG
// check device to match at debug build
// if it's expected to change, call cudaSetDevice instead of the check
int current_device;
CUDA_CALL_THROW(cudaGetDevice(&current_device));
ORT_ENFORCE(current_device == info_.id);
#endif
}

void* TensorrtAllocator::Alloc(size_t size) {
CheckDevice();
void* p = nullptr;
if (size > 0) {
CUDA_CALL_THROW(cudaMalloc((void**)&p, size));
}
return p;
}

void TensorrtAllocator::Free(void* p) {
CheckDevice();
cudaFree(p); // do not throw error since it's OK for cudaFree to fail during shutdown
}

const OrtAllocatorInfo& TensorrtAllocator::Info() const {
return info_;
}

FencePtr TensorrtAllocator::CreateFence(const SessionState* session_state) {
return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
}

void* TensorrtPinnedAllocator::Alloc(size_t size) {
void* p = nullptr;
if (size > 0) {
CUDA_CALL_THROW(cudaMallocHost((void**)&p, size));
}
return p;
}

void TensorrtPinnedAllocator::Free(void* p) {
CUDA_CALL_THROW(cudaFreeHost(p));
}

const OrtAllocatorInfo& TensorrtPinnedAllocator::Info() const {
static constexpr OrtAllocatorInfo tensorrt_allocator_info(TRT_PINNED, OrtDeviceAllocator, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, 0), 0, OrtMemTypeCPUOutput);
return tensorrt_allocator_info;
}

FencePtr TensorrtPinnedAllocator::CreateFence(const SessionState* session_state) {
return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
}

} // namespace onnxruntime
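A self-contained usage sketch of the two allocators added above (the driver function is hypothetical; in the PR they are registered with the execution provider rather than called directly like this):

#include "core/providers/tensorrt/tensorrt_allocator.h"

void AllocatorSketch() {
  onnxruntime::TensorrtAllocator device_alloc(/*device_id*/ 0);  // GPU memory via cudaMalloc
  onnxruntime::TensorrtPinnedAllocator pinned_alloc;             // page-locked host memory via cudaMallocHost
  void* d = device_alloc.Alloc(1024);   // 1 KB on the GPU
  void* h = pinned_alloc.Alloc(1024);   // 1 KB of pinned host memory, suitable for async H2D/D2H copies
  // ... enqueue cudaMemcpyAsync between h and d, run TensorRT, etc. ...
  device_alloc.Free(d);
  pinned_alloc.Free(h);
}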
38 changes: 21 additions & 17 deletions onnxruntime/core/providers/tensorrt/tensorrt_allocator.h
@@ -6,27 +6,31 @@
#include "core/framework/allocator.h"

namespace onnxruntime {
constexpr const char* TRT = "Trt";

class TensorrtPinnedAllocator : public CPUAllocator {
constexpr const char* TRT = "Tensorrt";
constexpr const char* TRT_PINNED = "TensorrtPinned";

class TensorrtAllocator : public IDeviceAllocator {
public:
virtual const OrtAllocatorInfo& Info() const override {
static OrtAllocatorInfo tensorrt_cpu_allocator_info(TRT,
OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0,
OrtMemType::OrtMemTypeCPU);
return tensorrt_cpu_allocator_info;
}
TensorrtAllocator(int device_id) : info_(TRT, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id), device_id, OrtMemTypeDefault) {}
virtual void* Alloc(size_t size) override;
virtual void Free(void* p) override;
virtual const OrtAllocatorInfo& Info() const override;
virtual FencePtr CreateFence(const SessionState* session_state) override;

private:
void CheckDevice() const;

private:
const OrtAllocatorInfo info_;
};

/*! \brief The default allocator doesn't allocate anything. It's used here to let allocation
planner get allocator information.
*/
class TensorrtAllocator : public CPUAllocator {
class TensorrtPinnedAllocator : public IDeviceAllocator {
public:
virtual const OrtAllocatorInfo& Info() const override {
static OrtAllocatorInfo tensorrt_default_allocator_info(TRT,
OrtAllocatorType::OrtDeviceAllocator);
return tensorrt_default_allocator_info;
}
virtual void* Alloc(size_t size) override;
virtual void Free(void* p) override;
virtual const OrtAllocatorInfo& Info() const override;
virtual FencePtr CreateFence(const SessionState* session_state) override;
};

} // namespace onnxruntime