Extend simple lib example to support GPU inputs / outputs (triton-inference-server#579)

* Extend simple lib example to allow GPU inputs / outputs

* Fix bug and typo

* Extend L0_simple_lib to run with GPU input

* Make "-g" affect both input and output. Add condition on ENABLE_GPU
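
The core of the change is that the simple example can now hand the server input buffers that live in GPU memory and accept result buffers allocated there. A minimal sketch of the input side follows, using only calls that appear in this diff (cudaMalloc, cudaMemcpy, TRTSERVER_InferenceRequestProviderSetInputData); the helper name SetGpuInput, its bool return, and the TRTSERVER_ErrorDelete cleanup are illustrative assumptions, not part of the commit.

#include <cuda_runtime_api.h>

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

#include "src/core/trtserver.h"

// Copy host data into a freshly allocated GPU buffer and register that
// buffer as the named input, tagging it as GPU memory so the server knows
// where the bytes live. On success the caller owns *gpu_buffer and must
// cudaFree it once the inference request has completed.
bool
SetGpuInput(
    TRTSERVER_InferenceRequestProvider* request_provider,
    const char* input_name, const std::vector<int32_t>& host_data,
    void** gpu_buffer)
{
  const size_t byte_size = host_data.size() * sizeof(int32_t);

  if (cudaMalloc(gpu_buffer, byte_size) != cudaSuccess) {
    std::cerr << "failed to allocate GPU memory for " << input_name
              << std::endl;
    return false;
  }
  if (cudaMemcpy(
          *gpu_buffer, host_data.data(), byte_size, cudaMemcpyHostToDevice) !=
      cudaSuccess) {
    std::cerr << "failed to copy " << input_name << " to GPU memory"
              << std::endl;
    return false;
  }

  // Identical to the CPU-memory path except for the memory-type argument.
  TRTSERVER_Error* err = TRTSERVER_InferenceRequestProviderSetInputData(
      request_provider, input_name, *gpu_buffer, byte_size,
      TRTSERVER_MEMORY_GPU);
  if (err != nullptr) {
    std::cerr << "failed to set " << input_name << ": "
              << TRTSERVER_ErrorMessage(err) << std::endl;
    TRTSERVER_ErrorDelete(err);
    return false;
  }
  return true;
}

The output side is the mirror image: the response allocator callback in simple.cc below does the cudaMalloc itself and reports the buffer's memory type back to the server, which is why the allocator and release callbacks now both branch on it.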
GuanLuo authored and deadeyegoodwin committed Aug 26, 2019
1 parent d9b4bda commit 61aef4a
Showing 5 changed files with 187 additions and 37 deletions.
6 changes: 6 additions & 0 deletions qa/L0_simple_lib/test.sh
@@ -42,6 +42,12 @@ if [ $? -ne 0 ]; then
RET=1
fi

# Set input data in GPU memory
$SIMPLE_CLIENT -r $MODELSDIR -g >>$CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
RET=1
fi

set -e

if [ $RET -eq 0 ]; then
16 changes: 8 additions & 8 deletions src/core/backend_context.cc
@@ -69,7 +69,7 @@ BackendContext::CreateCudaStream(const int cuda_stream_priority)
if (cuerr != cudaSuccess) {
return Status(
RequestStatusCode::INTERNAL, "unable to create stream for " + name_ +
": " + cudaGetErrorString(cuerr));
": " + cudaGetErrorString(cuerr));
}
}
#endif // TRTIS_ENABLE_GPU
@@ -188,8 +188,8 @@ BackendContext::SetFixedSizeOutputBuffer(
name, src_memory_type, dst_memory_type, expected_byte_size,
content + content_offset, buffer, &cuda_used);
cuda_copy |= cuda_used;
}
}
}
}
}

payload.status_ = status;
@@ -201,7 +201,8 @@ SetFixedSizeOutputBuffer(
return cuda_copy;
}

Status BackendContext::CopyBuffer(
Status
BackendContext::CopyBuffer(
const std::string& name, const TRTSERVER_Memory_Type src_memory_type,
const TRTSERVER_Memory_Type dst_memory_type, const size_t byte_size,
const void* src, void* dst, bool* cuda_used)
@@ -224,16 +225,15 @@ Status BackendContext::CopyBuffer(
if (err != cudaSuccess) {
return Status(
RequestStatusCode::INTERNAL,
"failed to use CUDA copy for input '" + name +
"failed to use CUDA copy for tensor '" + name +
"': " + std::string(cudaGetErrorString(err)));
} else {
*cuda_used = true;
}
#else
return Status(
RequestStatusCode::INTERNAL,
"try to use CUDA copy for tensor '" + name +
"' while GPU is not supported");
RequestStatusCode::INTERNAL, "try to use CUDA copy for tensor '" +
name + "' while GPU is not supported");
#endif // TRTIS_ENABLE_GPU
}
return Status::Success;
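
The reworked CopyBuffer above takes the source and destination memory types and reports through cuda_used whether a CUDA copy was issued. The sketch below shows the dispatch idea only and is not the server's implementation: the bool return plus error string stand in for the server's Status type, and the synchronous cudaMemcpy with cudaMemcpyDefault (which assumes unified virtual addressing) is a simplification.

#include <cstring>
#include <string>

#ifdef TRTIS_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif  // TRTIS_ENABLE_GPU

#include "src/core/trtserver.h"

// Copy 'byte_size' bytes from 'src' to 'dst', using plain memcpy when both
// sides are CPU memory and a CUDA copy when either side is GPU memory.
// '*cuda_used' records whether CUDA was involved; 'error' receives a message
// on failure.
bool
CopyBufferSketch(
    const std::string& name, const TRTSERVER_Memory_Type src_memory_type,
    const TRTSERVER_Memory_Type dst_memory_type, const size_t byte_size,
    const void* src, void* dst, bool* cuda_used, std::string* error)
{
  *cuda_used = false;

  if ((src_memory_type == TRTSERVER_MEMORY_CPU) &&
      (dst_memory_type == TRTSERVER_MEMORY_CPU)) {
    memcpy(dst, src, byte_size);
    return true;
  }

#ifdef TRTIS_ENABLE_GPU
  // At least one side is GPU memory; let the driver infer the direction.
  cudaError_t err = cudaMemcpy(dst, src, byte_size, cudaMemcpyDefault);
  if (err != cudaSuccess) {
    *error = "failed to use CUDA copy for tensor '" + name +
             "': " + cudaGetErrorString(err);
    return false;
  }
  *cuda_used = true;
  return true;
#else
  *error = "try to use CUDA copy for tensor '" + name +
           "' while GPU is not supported";
  return false;
#endif  // TRTIS_ENABLE_GPU
}

The cuda_used flag mirrors how SetFixedSizeOutputBuffer above ORs each per-copy result into cuda_copy, presumably so the caller can synchronize once after all copies have been issued rather than after every tensor.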
2 changes: 1 addition & 1 deletion src/core/provider.cc
@@ -605,7 +605,7 @@ InferResponseProvider::~InferResponseProvider()
if (output.release_buffer_ != nullptr) {
TRTSERVER_Error* err = release_fn_(
allocator_, output.release_buffer_, output.release_userp_,
output.byte_size_, TRTSERVER_MEMORY_CPU, 0);
output.byte_size_, output.memory_type_, 0);
if (err != nullptr) {
LOG_ERROR << "failed to release result tensor '" << output.name_
<< "': " << TRTSERVER_ErrorMessage(err);
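
This one-line change matters because the response allocator may now return buffers in GPU memory: if the release callback were always told TRTSERVER_MEMORY_CPU, a cudaMalloc'd output could end up handed to free(). A rough sketch of the bookkeeping idea, with the struct, template parameter, and final memory_type_id argument named here for illustration only:

#include <cstddef>

#include "src/core/trtserver.h"

// Record, per output, where the buffer was allocated so the same information
// can be handed back to the user-supplied release callback.
struct OutputBuffer {
  void* release_buffer_;
  void* release_userp_;
  size_t byte_size_;
  TRTSERVER_Memory_Type memory_type_;  // as reported by the allocator
};

// 'release_fn' stands in for the callback registered with the response
// allocator; a template avoids assuming its exact typedef name.
template <typename ReleaseFn>
void
ReleaseOutput(
    TRTSERVER_ResponseAllocator* allocator, const OutputBuffer& output,
    ReleaseFn release_fn)
{
  // Pass the recorded memory type (not a hard-coded CPU value) so the
  // callback can choose between free() and cudaFree().
  release_fn(
      allocator, output.release_buffer_, output.release_userp_,
      output.byte_size_, output.memory_type_, 0 /* memory_type_id, assumed */);
}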
9 changes: 9 additions & 0 deletions src/servers/CMakeLists.txt
@@ -164,6 +164,15 @@ target_link_libraries(
PRIVATE trtserver
PRIVATE protobuf::libprotobuf
)
if(${TRTIS_ENABLE_GPU})
target_include_directories(simple PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(
simple
PUBLIC -L/usr/local/cuda/lib64/stubs
PUBLIC -lnvidia-ml
PRIVATE ${CUDA_LIBRARIES}
)
endif() # TRTIS_ENABLE_GPU
install(
TARGETS simple
RUNTIME DESTINATION bin
191 changes: 163 additions & 28 deletions src/servers/simple.cc
@@ -36,10 +36,38 @@
#include "src/core/trtserver.h"
#include "src/servers/common.h"

#ifdef TRTIS_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRTIS_ENABLE_GPU

namespace ni = nvidia::inferenceserver;

namespace {

bool use_gpu_memory = false;

#ifdef TRTIS_ENABLE_GPU
#define FAIL_IF_CUDA_ERR(X, MSG) \
do { \
cudaError_t err = (X); \
if (err != cudaSuccess) { \
LOG_ERROR << "error: " << (MSG) << ": " << cudaGetErrorString(err); \
exit(1); \
} \
} while (false)


static auto gpu_data_deleter = [](void* data) {
if (data != nullptr) {
auto err = cudaFree(data);
if (err != cudaSuccess) {
LOG_ERROR << "error: failed to cudaFree " << data << ": "
<< cudaGetErrorString(err);
}
}
};
#endif // TRTIS_ENABLE_GPU
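
// Illustrative sketch (not part of this change): the deleter and macro above
// are meant to be used together further down in main() -- cudaMalloc into a
// raw pointer, then hand ownership to a std::unique_ptr so cudaFree runs on
// every exit path. The function name and buffer size are illustrative only.
#ifdef TRTIS_ENABLE_GPU
void
GpuBufferRaiiExample()
{
  std::unique_ptr<void, decltype(gpu_data_deleter)> buffer(
      nullptr, gpu_data_deleter);

  void* raw = nullptr;
  FAIL_IF_CUDA_ERR(cudaMalloc(&raw, 64), "allocating example GPU buffer");
  buffer.reset(raw);  // ownership transferred; cudaFree is now guaranteed

  // ... fill and use buffer.get() ...
}  // gpu_data_deleter releases the buffer when 'buffer' goes out of scope
#endif  // TRTIS_ENABLE_GPU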

void
Usage(char** argv, const std::string& msg = std::string())
{
@@ -48,11 +76,18 @@ Usage(char** argv, const std::string& msg = std::string())
}

LOG_ERROR << "Usage: " << argv[0] << " [options]";
LOG_ERROR << "\t-g Use GPU memory for input and output tensors";
LOG_ERROR << "\t-r [model repository absolute path]";

exit(1);
}

std::string
MemoryTypeString(TRTSERVER_Memory_Type memory_type)
{
return (memory_type == TRTSERVER_MEMORY_CPU) ? "CPU memory" : "GPU memory";
}

TRTSERVER_Error*
ResponseAlloc(
TRTSERVER_ResponseAllocator* allocator, void** buffer, void** buffer_userp,
@@ -63,18 +98,33 @@ ResponseAlloc(
// releasing the buffer.

// If 'byte_size' is zero just return 'buffer'==nullptr, we don't
// need to do any other book-keeping. Only handle allocation in the
// CPU region.
if ((byte_size == 0) || (memory_type == TRTSERVER_MEMORY_CPU)) {
*buffer = (byte_size == 0) ? nullptr : malloc(byte_size);
*buffer_userp = new std::string(tensor_name);
// need to do any other book-keeping.
if (byte_size == 0) {
*buffer = nullptr;
*buffer_userp = nullptr;
LOG_INFO << "allocated " << byte_size << " bytes for result tensor "
<< tensor_name;
} else {
*buffer = nullptr;
*buffer_userp = nullptr;
LOG_INFO << "failed to allocated " << byte_size
<< " bytes for result tensor " << tensor_name;
void* allocated_ptr = nullptr;
if (memory_type == TRTSERVER_MEMORY_CPU) {
allocated_ptr = malloc(byte_size);
#ifdef TRTIS_ENABLE_GPU
} else if (use_gpu_memory) {
auto err = cudaMalloc(&allocated_ptr, byte_size);
if (err != cudaSuccess) {
LOG_INFO << "cudaMalloc failed: " << cudaGetErrorString(err);
allocated_ptr = nullptr;
}
#endif // TRTIS_ENABLE_GPU
}

if (allocated_ptr != nullptr) {
*buffer = allocated_ptr;
*buffer_userp = new std::string(tensor_name);
LOG_INFO << "allocated " << byte_size << " bytes in "
<< MemoryTypeString(memory_type) << " for result tensor "
<< tensor_name;
}
}

return nullptr; // Success
@@ -93,8 +143,21 @@ ResponseRelease(
}

LOG_INFO << "Releasing buffer " << buffer << " of size " << byte_size
<< " for result '" << *name << "'";
free(buffer);
<< " in " << MemoryTypeString(memory_type) << " for result '"
<< *name << "'";
if (memory_type == TRTSERVER_MEMORY_CPU) {
free(buffer);
#ifdef TRTIS_ENABLE_GPU
} else if (use_gpu_memory) {
auto err = cudaFree(buffer);
if (err != cudaSuccess) {
LOG_ERROR << "error: failed to cudaFree " << buffer << ": "
<< cudaGetErrorString(err);
}
#endif // TRTIS_ENABLE_GPU
} else {
LOG_ERROR << "error: unexpected buffer allocated in GPU memory";
}

delete name;

@@ -121,8 +184,11 @@ main(int argc, char** argv)

// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "r:")) != -1) {
while ((opt = getopt(argc, argv, "gr:")) != -1) {
switch (opt) {
case 'g':
use_gpu_memory = true;
break;
case 'r':
model_repository_path = optarg;
break;
@@ -135,6 +201,11 @@
if (model_repository_path.empty()) {
Usage(argv, "-r must be used to specify model repository path");
}
#ifndef TRTIS_ENABLE_GPU
if (use_gpu_memory) {
Usage(argv, "-g can not be used without enabling GPU");
}
#endif // TRTIS_ENABLE_GPU

// Create the server...
TRTSERVER_ServerOptions* server_options = nullptr;
@@ -290,15 +361,47 @@
input1_data[i] = 1;
}

size_t input0_size = input0_data.size() * sizeof(int32_t);
size_t input1_size = input1_data.size() * sizeof(int32_t);

const void* input0_base = &input0_data[0];
const void* input1_base = &input1_data[0];
auto memory_type = TRTSERVER_MEMORY_CPU;
#ifdef TRTIS_ENABLE_GPU
std::unique_ptr<void, decltype(gpu_data_deleter)> input0_gpu(
nullptr, gpu_data_deleter);
std::unique_ptr<void, decltype(gpu_data_deleter)> input1_gpu(
nullptr, gpu_data_deleter);
if (use_gpu_memory) {
void* dst;
FAIL_IF_CUDA_ERR(
cudaMalloc(&dst, input0_size), "allocating GPU memory for INPUT0 data");
input0_gpu.reset(dst);
FAIL_IF_CUDA_ERR(
cudaMemcpy(dst, &input0_data[0], input0_size, cudaMemcpyHostToDevice),
"setting INPUT0 data in GPU memory");
FAIL_IF_CUDA_ERR(
cudaMalloc(&dst, input1_size), "allocating GPU memory for INPUT1 data");
input1_gpu.reset(dst);
FAIL_IF_CUDA_ERR(
cudaMemcpy(dst, &input1_data[0], input1_size, cudaMemcpyHostToDevice),
"setting INPUT1 data in GPU memory");
}

input0_base = use_gpu_memory ? input0_gpu.get() : &input0_data[0];
input1_base = use_gpu_memory ? input1_gpu.get() : &input1_data[0];
memory_type = use_gpu_memory ? TRTSERVER_MEMORY_GPU : TRTSERVER_MEMORY_CPU;
#endif // TRTIS_ENABLE_GPU

FAIL_IF_ERR(
TRTSERVER_InferenceRequestProviderSetInputData(
request_provider, input0->name().c_str(), &input0_data[0],
input0_data.size() * sizeof(int32_t), TRTSERVER_MEMORY_CPU),
request_provider, input0->name().c_str(), input0_base, input0_size,
memory_type),
"assigning INPUT0 data");
FAIL_IF_ERR(
TRTSERVER_InferenceRequestProviderSetInputData(
request_provider, input1->name().c_str(), &input1_data[0],
input1_data.size() * sizeof(int32_t), TRTSERVER_MEMORY_CPU),
request_provider, input1->name().c_str(), input1_base, input1_size,
memory_type),
"assigning INPUT1 data");

// Perform inference...
Expand Down Expand Up @@ -348,6 +451,8 @@ main(int argc, char** argv)
}

// Check the output tensor values...
// Note that depending on whether the backend supports outputs in GPU memory,
// the output tensor may be in CPU memory even if the -g flag is set.
const void* output0_content;
size_t output0_byte_size;
TRTSERVER_Memory_Type output0_memory_type;
@@ -361,12 +466,13 @@
"unexpected output0 byte-size, expected " +
std::to_string(16 * sizeof(int32_t)) + ", got " +
std::to_string(output0_byte_size));
} else if (output0_memory_type != TRTSERVER_MEMORY_CPU) {
} else if (
(!use_gpu_memory) && (output0_memory_type == TRTSERVER_MEMORY_GPU)) {
FAIL(
"unexpected output0 memory type, expected to be allocated "
"on CPU memory (" +
std::to_string(TRTSERVER_MEMORY_CPU) + "), got (" +
std::to_string(output0_memory_type) + ")");
"in " +
MemoryTypeString(TRTSERVER_MEMORY_CPU) + ", got " +
MemoryTypeString(output0_memory_type));
}

const void* output1_content;
@@ -382,18 +488,47 @@
"unexpected output1 byte-size, expected " +
std::to_string(16 * sizeof(int32_t)) + ", got " +
std::to_string(output1_byte_size));
} else if (output1_memory_type != TRTSERVER_MEMORY_CPU) {
} else if (
(!use_gpu_memory) && (output1_memory_type == TRTSERVER_MEMORY_GPU)) {
FAIL(
"unexpected output1 memory type, expected to be allocated "
"on CPU memory (" +
std::to_string(TRTSERVER_MEMORY_CPU) + "), got (" +
std::to_string(output1_memory_type) + ")");
"in " +
MemoryTypeString(TRTSERVER_MEMORY_CPU) + ", got " +
MemoryTypeString(output1_memory_type));
}

const int32_t* output0_result =
reinterpret_cast<const int32_t*>(output0_content);
const int32_t* output1_result =
reinterpret_cast<const int32_t*>(output1_content);
const int32_t* output0_result = reinterpret_cast<const int32_t*>(output0_content);
const int32_t* output1_result = reinterpret_cast<const int32_t*>(output1_content);

#ifdef TRTIS_ENABLE_GPU
// Unlike outputs in CPU memory, outputs returned in GPU memory must be
// copied back to CPU memory before they can be read.
std::vector<int32_t> output0_data(16);
std::vector<int32_t> output1_data(16);
if (output0_memory_type == TRTSERVER_MEMORY_CPU) {
LOG_INFO << "OUTPUT0 are stored in CPU memory";
} else {
LOG_INFO << "OUTPUT0 are stored in GPU memory";
FAIL_IF_CUDA_ERR(
cudaMemcpy(
&output0_data[0], output0_content, output0_byte_size,
cudaMemcpyDeviceToHost),
"setting INPUT0 data in GPU memory");
output0_result = reinterpret_cast<const int32_t*>(&output0_data[0]);
}

if (output1_memory_type == TRTSERVER_MEMORY_CPU) {
LOG_INFO << "OUTPUT1 are stored in CPU memory";
} else {
LOG_INFO << "OUTPUT1 are stored in GPU memory";
FAIL_IF_CUDA_ERR(
cudaMemcpy(
&output1_data[0], output1_content, output1_byte_size,
cudaMemcpyDeviceToHost),
"setting INPUT0 data in GPU memory");
output1_result = reinterpret_cast<const int32_t*>(&output1_data[0]);
}
#endif // TRTIS_ENABLE_GPU

for (size_t i = 0; i < 16; ++i) {
LOG_INFO << input0_data[i] << " + " << input1_data[i] << " = "
