
Remove memory copy between TensorRT and CUDA #1561

Merged
merged 23 commits
Aug 9, 2019
Changes from 18 commits
Commits
23 commits
35c9e96
remove memory copy between CUDA and TRT
stevenlix Aug 1, 2019
c66fbe8
add info to RegisterExecutionProvider input
stevenlix Aug 1, 2019
bb6e268
use new IDeviceAllocator for trt allocator
stevenlix Aug 1, 2019
3cad0ba
remove SetDefaultInputsMemoryType from TRT EP
stevenlix Aug 1, 2019
a5d1893
Merge branch 'master' into stevenlix/trtoverhead
stevenlix Aug 1, 2019
231cf0b
remove onnx-tensorrt 5.0
stevenlix Aug 1, 2019
a7f534e
add submodule onnx-tensorrt branch 5.1
stevenlix Aug 2, 2019
4d7812f
remove redundancy
stevenlix Aug 5, 2019
7aab699
Update transformer_memcpy.cc
stevenlix Aug 5, 2019
7f306ff
Update tensorrt_execution_provider.cc
stevenlix Aug 5, 2019
5cc3e1c
switch to TensorRT 5.1.5.0
stevenlix Aug 5, 2019
7cbd863
Merge branch 'stevenlix/trtoverhead' of https://github.com/Microsoft/…
stevenlix Aug 5, 2019
196386a
update python binding
stevenlix Aug 6, 2019
5c2d5e5
disable failed test case on TensorRT
stevenlix Aug 6, 2019
bd604e9
Merge branch 'master' into stevenlix/trtoverhead
stevenlix Aug 6, 2019
96c7560
Update activation_op_test.cc
stevenlix Aug 6, 2019
62ec7cc
upgrade to TensorRT container 19.06
stevenlix Aug 6, 2019
40240d0
Merge branch 'stevenlix/trtoverhead' of https://github.com/Microsoft/…
stevenlix Aug 6, 2019
3bee98d
update according to feedback
stevenlix Aug 8, 2019
94923d7
add comments
stevenlix Aug 8, 2019
5d720b0
remove tensorrt allocator and use cuda(gpu) allocator
stevenlix Aug 8, 2019
b9cb8f1
update onnx-tensorrt submodule
stevenlix Aug 8, 2019
f153d91
change ci build cuda directory name
stevenlix Aug 8, 2019
8 changes: 4 additions & 4 deletions .gitmodules
@@ -25,10 +25,6 @@
[submodule "cmake/external/re2"]
path = cmake/external/re2
url = https://github.com/google/re2.git
[submodule "cmake/external/onnx-tensorrt"]
path = cmake/external/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt.git
branch = v5.0
[submodule "cmake/external/eigen"]
path = cmake/external/eigen
url = https://github.com/eigenteam/eigen-git-mirror.git
@@ -41,3 +37,7 @@
[submodule "cmake/external/spdlog"]
path = cmake/external/spdlog
url = https://github.com/gabime/spdlog.git
[submodule "cmake/external/onnx-tensorrt"]
path = cmake/external/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt.git
branch = 5.1
@@ -7,7 +7,7 @@
extern "C" {
#endif

ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtSessionOptions* options);
ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtSessionOptions* options, int device_id);

#ifdef __cplusplus
}
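For reference, the factory function now takes an explicit CUDA device id. A minimal caller-side sketch, assuming the C API of this era (session-options creation and the exact provider-factory header path are not shown in this diff and may differ by version):

// Hypothetical usage sketch; only the appended device_id argument is new in this PR.
void AppendTensorrtEp(OrtSessionOptions* session_options) {
  OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_Tensorrt(session_options, /*device_id*/ 0);
  if (status != nullptr) {
    // Report the failure and release the status object (e.g. via OrtReleaseStatus).
  }
}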
3 changes: 1 addition & 2 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -2,7 +2,6 @@
// Licensed under the MIT License.

#include "core/framework/graph_partitioner.h"

#include "core/framework/kernel_registry_manager.h"
#include "core/graph/function.h"
#include "core/graph/graph_viewer.h"
@@ -176,7 +175,7 @@ Status GraphPartitioner::Partition(Graph& graph, bool export_dll, FuncManager& f
//prepare the func kernel
KernelDefBuilder builder;
BuildFusedKernelDef(builder, *node);
if (node->GetExecutionProviderType() == onnxruntime::kTensorrtExecutionProvider || node->GetExecutionProviderType() == onnxruntime::kNGraphExecutionProvider || node->GetExecutionProviderType() == onnxruntime::kNnapiExecutionProvider) {
if (node->GetExecutionProviderType() == onnxruntime::kNGraphExecutionProvider || node->GetExecutionProviderType() == onnxruntime::kNnapiExecutionProvider) {
builder.SetDefaultInputsMemoryType(OrtMemTypeCPUInput);
builder.SetDefaultOutputMemoryType(OrtMemTypeCPUOutput);
}
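The effect of dropping kTensorrtExecutionProvider from this branch is that fused TensorRT kernels keep the default (device) memory types for their inputs and outputs instead of being pinned to CPU memory. An illustrative before/after contrast of the same builder calls (not part of the diff, shown only to make the change explicit):

KernelDefBuilder builder;
BuildFusedKernelDef(builder, *node);
// Before this PR, TRT fused nodes also fell into the branch above, forcing
// host<->device copies around every fused TensorRT subgraph:
//   builder.SetDefaultInputsMemoryType(OrtMemTypeCPUInput);
//   builder.SetDefaultOutputMemoryType(OrtMemTypeCPUOutput);
// After this PR, TRT fused nodes skip it, so their tensors stay in GPU memory.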
4 changes: 2 additions & 2 deletions onnxruntime/core/framework/utils.cc
@@ -121,8 +121,8 @@ common::Status CopyOneInputAcrossDevices(const SessionState& session_state, cons
ORT_ENFORCE(p_input_provider);
}

//no copy for TRT and nGraph
if (required_provider_type == onnxruntime::kTensorrtExecutionProvider || required_provider_type == onnxruntime::kNGraphExecutionProvider) {
//no copy for nGraph
if (required_provider_type == onnxruntime::kNGraphExecutionProvider) {
new_mlvalue = orig_mlvalue;
break;
}
11 changes: 6 additions & 5 deletions onnxruntime/core/optimizer/transformer_memcpy.cc
@@ -52,7 +52,7 @@ class TransformerMemcpyImpl {
std::string provider_;
};

/** Helper that returns a pointer to the corresponding TensorProto for a name if it is an initializer.
/** Helper that returns a pointer to the corresponding TensorProto for a name if it is an initializer.
@param check_outer_scope If true and the graph is a subgraph, check parent graph/s for 'name' if not found in 'graph'.
*/
static const onnx::TensorProto* GetInitializer(const Graph& graph, const std::string& name, bool check_outer_scope) {
@@ -73,7 +73,6 @@ common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int gr
provider != onnxruntime::kMklDnnExecutionProvider &&
provider != onnxruntime::kNGraphExecutionProvider &&
provider != onnxruntime::kNupharExecutionProvider &&
provider != onnxruntime::kTensorrtExecutionProvider &&
provider != onnxruntime::kOpenVINOExecutionProvider) {
TransformerMemcpyImpl copy_impl(graph, provider);
auto current_modified = copy_impl.ModifyGraph(registry_manager_);
@@ -100,7 +99,7 @@ common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int gr

Overview: The transformer transforms the input graph as follows:

(1) For every initializer W that is referenced by both provider and non-provider nodes,
(1) For every initializer W that is referenced by both provider and non-provider nodes,
we create a duplicate initializer W2 and change all provider nodes to reference this
duplicate copy.

@@ -167,7 +166,9 @@ bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_regi
}

void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries, InitializedTensorSet& initializers_consumed) {
if (node.GetExecutionProviderType() == provider_) {
if (node.GetExecutionProviderType() == provider_
|| (node.GetExecutionProviderType() == kCudaExecutionProvider && provider_ == kTensorrtExecutionProvider)
|| (node.GetExecutionProviderType() == kTensorrtExecutionProvider && provider_ == kCudaExecutionProvider)) {
provider_nodes_.insert(&node);
// note KernelCreateInfo might be nullptr for custom kernel
const KernelCreateInfo* kci = nullptr;
@@ -206,7 +207,7 @@ void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelReg
}
} else {
// TODO: copy between devices? i.e. multiple GPUs
if (node.GetExecutionProviderType() != onnxruntime::kCpuExecutionProvider && node.GetExecutionProviderType() != onnxruntime::kTensorrtExecutionProvider &&
if (node.GetExecutionProviderType() != onnxruntime::kCpuExecutionProvider &&
node.GetExecutionProviderType() != onnxruntime::kNGraphExecutionProvider && !node.GetExecutionProviderType().empty()) {
ORT_THROW("Execution type '", node.GetExecutionProviderType(), "' doesn't support memcpy ");
}
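The core of this change is that TensorRT and CUDA nodes are now treated as one GPU memory domain, so the transformer no longer inserts MemcpyFromHost/MemcpyToHost nodes between them. A hypothetical helper that restates the condition inlined in ProcessDefs above (the helper name and free-standing form are illustrative only):

#include <string>

// kCudaExecutionProvider / kTensorrtExecutionProvider are the provider-name
// constants already used elsewhere in onnxruntime.
static bool InSameGpuMemoryDomain(const std::string& node_ep, const std::string& transformer_ep) {
  if (node_ep == transformer_ep) return true;
  return (node_ep == onnxruntime::kCudaExecutionProvider && transformer_ep == onnxruntime::kTensorrtExecutionProvider) ||
         (node_ep == onnxruntime::kTensorrtExecutionProvider && transformer_ep == onnxruntime::kCudaExecutionProvider);
}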
72 changes: 72 additions & 0 deletions onnxruntime/core/providers/tensorrt/tensorrt_allocator.cc
@@ -0,0 +1,72 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "tensorrt_allocator.h"
#include "core/providers/cuda/cuda_common.h"
#include "core/framework/allocatormgr.h"
#include "core/framework/session_state.h"
#include "core/providers/cuda/cuda_fence.h"
#include "core/providers/cuda/gpu_data_transfer.h"

namespace onnxruntime {

static const GPUDataTransfer* GetGPUDataTransfer(const SessionState* session_state) {
OrtDevice gpu_device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0);
OrtDevice cpu_device;
return dynamic_cast<const GPUDataTransfer*>(session_state->GetDataTransferMgr().GetDataTransfer(gpu_device, cpu_device));
}

void TensorrtAllocator::CheckDevice() const {
#ifndef NDEBUG
// check device to match at debug build
// if it's expected to change, call cudaSetDevice instead of the check
int current_device;
CUDA_CALL_THROW(cudaGetDevice(&current_device));
ORT_ENFORCE(current_device == info_.id);
#endif
}

void* TensorrtAllocator::Alloc(size_t size) {
CheckDevice();
void* p = nullptr;
if (size > 0) {
CUDA_CALL_THROW(cudaMalloc((void**)&p, size));
}
return p;
}

void TensorrtAllocator::Free(void* p) {
CheckDevice();
cudaFree(p); // do not throw error since it's OK for cudaFree to fail during shutdown
}

const OrtAllocatorInfo& TensorrtAllocator::Info() const {
return info_;
}

FencePtr TensorrtAllocator::CreateFence(const SessionState* session_state) {
return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
}

void* TensorrtPinnedAllocator::Alloc(size_t size) {
void* p = nullptr;
if (size > 0) {
CUDA_CALL_THROW(cudaMallocHost((void**)&p, size));
}
return p;
}

void TensorrtPinnedAllocator::Free(void* p) {
CUDA_CALL_THROW(cudaFreeHost(p));
}

const OrtAllocatorInfo& TensorrtPinnedAllocator::Info() const {
static constexpr OrtAllocatorInfo tensorrt_allocator_info(TRT_PINNED, OrtDeviceAllocator, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, 0), 0, OrtMemTypeCPUOutput);
return tensorrt_allocator_info;
}

FencePtr TensorrtPinnedAllocator::CreateFence(const SessionState* session_state) {
return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
}

} // namespace onnxruntime
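A self-contained usage sketch of the two allocators added above (the driver function is hypothetical; in the PR they are registered with the execution provider rather than called directly like this):

#include "core/providers/tensorrt/tensorrt_allocator.h"

void AllocatorSketch() {
  onnxruntime::TensorrtAllocator device_alloc(/*device_id*/ 0);  // GPU memory via cudaMalloc
  onnxruntime::TensorrtPinnedAllocator pinned_alloc;             // page-locked host memory via cudaMallocHost
  void* d = device_alloc.Alloc(1024);   // 1 KB on the GPU
  void* h = pinned_alloc.Alloc(1024);   // 1 KB of pinned host memory, suitable for async H2D/D2H copies
  // ... enqueue cudaMemcpyAsync between h and d, run TensorRT, etc. ...
  device_alloc.Free(d);
  pinned_alloc.Free(h);
}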
38 changes: 21 additions & 17 deletions onnxruntime/core/providers/tensorrt/tensorrt_allocator.h
@@ -6,27 +6,31 @@
#include "core/framework/allocator.h"

namespace onnxruntime {
constexpr const char* TRT = "Trt";

class TensorrtPinnedAllocator : public CPUAllocator {
constexpr const char* TRT = "Tensorrt";
constexpr const char* TRT_PINNED = "TensorrtPinned";

class TensorrtAllocator : public IDeviceAllocator {
public:
virtual const OrtAllocatorInfo& Info() const override {
static OrtAllocatorInfo tensorrt_cpu_allocator_info(TRT,
OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0,
OrtMemType::OrtMemTypeCPU);
return tensorrt_cpu_allocator_info;
}
TensorrtAllocator(int device_id) : info_(TRT, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id), device_id, OrtMemTypeDefault) {}
virtual void* Alloc(size_t size) override;
virtual void Free(void* p) override;
virtual const OrtAllocatorInfo& Info() const override;
virtual FencePtr CreateFence(const SessionState* session_state) override;

private:
void CheckDevice() const;

private:
const OrtAllocatorInfo info_;
};

/*! \brief The default allocator doesn't allocate anything. It's used here to let allocation
planner get allocator information.
*/
class TensorrtAllocator : public CPUAllocator {
class TensorrtPinnedAllocator : public IDeviceAllocator {
public:
virtual const OrtAllocatorInfo& Info() const override {
static OrtAllocatorInfo tensorrt_default_allocator_info(TRT,
OrtAllocatorType::OrtDeviceAllocator);
return tensorrt_default_allocator_info;
}
virtual void* Alloc(size_t size) override;
virtual void Free(void* p) override;
virtual const OrtAllocatorInfo& Info() const override;
virtual FencePtr CreateFence(const SessionState* session_state) override;
};

} // namespace onnxruntime