From 3e1258d3c940d83b44c534362a5d83979e96c2a1 Mon Sep 17 00:00:00 2001
From: Nikhil Kulkarni
Date: Thu, 8 Dec 2022 01:09:06 +0530
Subject: [PATCH] Add Ensemble support in SageMaker MME, improve 507 on SM
 Invoke OOM and entrypoint changes (#5002)

* Use personal repo

* Correct personal repo

* Fix: Improve 507 on SM Invoke OOM

* Revert custom repo changes

* Personal repo fix

* Add Ensemble support in SageMaker MME

* Update ensemble test

* Update ensemble test to include invoke and fix comment
---
 docker/sagemaker/serve                        |  13 +-
 qa/L0_sagemaker/sagemaker_multi_model_test.py |  63 +++++++-
 qa/L0_sagemaker/test.sh                       |  33 ++++-
 src/sagemaker_server.cc                       | 140 +++++++++++++++---
 src/sagemaker_server.h                        |   4 +-
 5 files changed, 220 insertions(+), 33 deletions(-)

diff --git a/docker/sagemaker/serve b/docker/sagemaker/serve
index eff78493f8..803b697afa 100755
--- a/docker/sagemaker/serve
+++ b/docker/sagemaker/serve
@@ -35,7 +35,7 @@ if [ -n "$SAGEMAKER_MULTI_MODEL" ]; then
   if [ "$SAGEMAKER_MULTI_MODEL" == "true" ]; then
     SAGEMAKER_MODEL_REPO=${SAGEMAKER_MULTI_MODEL_REPO}
     is_mme_mode=true
-    echo "Triton is running in SageMaker MME mode"
+    echo "Triton is running in SageMaker MME mode."
   fi
 fi
 
@@ -66,13 +66,24 @@ if [ -n "$SAGEMAKER_TRITON_LOG_ERROR" ]; then
 fi
 if [ -n "$SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" ]; then
   SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-default-byte-size=${SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE}"
+else
+  SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-default-byte-size=16777216" #16MB
 fi
 if [ -n "$SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE" ]; then
   SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-growth-byte-size=${SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE}"
+else
+  SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-growth-byte-size=1048576" #1MB
 fi
 if [ -n "$SAGEMAKER_TRITON_TENSORFLOW_VERSION" ]; then
   SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=tensorflow,version=${SAGEMAKER_TRITON_TENSORFLOW_VERSION}"
 fi
+if [ -n "$SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT" ]; then
+  num_gpus=$(nvidia-smi -L | wc -l)
+  for ((i=0; i<${num_gpus}; i++)); do
+    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-load-gpu-limit ${i}:${SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT}"
+  done
+fi
+
 if [ "${is_mme_mode}" = false ] && [ -f "${SAGEMAKER_MODEL_REPO}/config.pbtxt" ]; then
   echo "ERROR: Incorrect directory structure."
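
The serve entrypoint changes above give the Python backend's shared-memory pool explicit
defaults (a 16 MB default block and a 1 MB growth increment) when the corresponding
environment variables are unset, and expand SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT into one
--model-load-gpu-limit flag per visible GPU. A rough Python sketch of the same argument
assembly, for illustration only (the helper name and the use of nvidia-smi -L for the GPU
count mirror the shell logic; they are not part of the patch):

    import os
    import subprocess

    def build_sagemaker_args():
        """Illustrative mirror of the serve entrypoint's argument handling."""
        args = []
        shm_default = os.getenv("SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE", "16777216")  # 16 MB fallback
        shm_growth = os.getenv("SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE", "1048576")     # 1 MB fallback
        args.append("--backend-config=python,shm-default-byte-size=" + shm_default)
        args.append("--backend-config=python,shm-growth-byte-size=" + shm_growth)

        gpu_limit = os.getenv("SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT")
        if gpu_limit:
            # One limit per GPU index, like the for-loop in the shell script.
            smi = subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True)
            for i in range(len(smi.stdout.splitlines())):
                args.append("--model-load-gpu-limit {}:{}".format(i, gpu_limit))
        return args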
diff --git a/qa/L0_sagemaker/sagemaker_multi_model_test.py b/qa/L0_sagemaker/sagemaker_multi_model_test.py
index e57c5a544e..18f6d679e9 100644
--- a/qa/L0_sagemaker/sagemaker_multi_model_test.py
+++ b/qa/L0_sagemaker/sagemaker_multi_model_test.py
@@ -97,6 +97,10 @@ def setUp(self):
         # Output is same as input since this is an identity model
         self.model2_input_data_ = [0, 1, 2, 3, 4, 5, 6, 7]
 
+        # ensemble model setup
+        self.model3_name = "123456789ensemble"
+        self.model3_url = "/opt/ml/models/123456789ensemble/model"
+
     def test_sm_0_environment_variables_set(self):
         self.assertEqual(os.getenv("SAGEMAKER_MULTI_MODEL"), "true",
                          "Variable SAGEMAKER_MULTI_MODEL must be set to true")
@@ -142,11 +146,11 @@ def test_sm_2_model_list(self):
             "models": [
                 {
                     "modelName": self.model1_name,
-                    "modelUrl": self.model1_url
+                    "modelUrl": self.model1_url.rstrip("/model")
                 },
                 {
                     "modelName": self.model2_name,
-                    "modelUrl": self.model2_url
+                    "modelUrl": self.model2_url.rstrip("/model")
                 },
             ]
         }
@@ -154,11 +158,11 @@ def test_sm_2_model_list(self):
             "models": [
                 {
                     "modelName": self.model2_name,
-                    "modelUrl": self.model2_url
+                    "modelUrl": self.model2_url.rstrip("/model")
                 },
                 {
                     "modelName": self.model1_name,
-                    "modelUrl": self.model1_url
+                    "modelUrl": self.model1_url.rstrip("/model")
                 },
             ]
         }
@@ -177,7 +181,7 @@ def test_sm_3_model_get(self):
         time.sleep(3)
         expected_response = {
             "modelName": self.model1_name,
-            "modelUrl": self.model1_url
+            "modelUrl": self.model1_url.rstrip("/model")
         }
         self.assertEqual(
             r.json(), expected_response,
@@ -280,6 +284,55 @@ def test_sm_5_model_unload(self):
             r.status_code, 404,
             "Expected status code 404, received {}".format(r.status_code))
 
+    def test_sm_6_ensemble_model(self):
+        # Load ensemble model
+        request_body = {"model_name": self.model3_name, "url": self.model3_url}
+        headers = {
+            "Content-Type": "application/json",
+            "X-Amzn-SageMaker-Target-Model": f"{self.model3_name}"
+        }
+        r = requests.post(self.url_mme_,
+                          data=json.dumps(request_body),
+                          headers=headers)
+        time.sleep(5)  # wait for model to load
+        self.assertEqual(
+            r.status_code, 200,
+            "Expected status code 200, received {}".format(r.status_code))
+
+        # Invoke ensemble model
+        inputs = []
+        outputs = []
+        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32"))
+        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "FP32"))
+
+        # Initialize the data
+        input_data = np.array(self.model1_input_data_, dtype=np.float32)
+        input_data = np.expand_dims(input_data, axis=0)
+        inputs[0].set_data_from_numpy(input_data, binary_data=False)
+        inputs[1].set_data_from_numpy(input_data, binary_data=False)
+
+        outputs.append(
+            httpclient.InferRequestedOutput("OUTPUT0", binary_data=False))
+        outputs.append(
+            httpclient.InferRequestedOutput("OUTPUT1", binary_data=False))
+        request_body, _ = httpclient.InferenceServerClient.generate_request_body(
+            inputs, outputs=outputs)
+
+        headers = {"Content-Type": "application/json"}
+        invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model3_name)
+        r = requests.post(invoke_url, data=request_body, headers=headers)
+        print(f"response: {r.text}")
+        r.raise_for_status()
+        self.assertEqual(
+            r.status_code, 200,
+            "Expected status code 200, received {}".format(r.status_code))
+
+        # Unload ensemble model
+        unload_url = "{}/{}".format(self.url_mme_, self.model3_name)
+        r = requests.delete(unload_url, headers=headers)
+        time.sleep(5)
+        self.assertEqual(
+            r.status_code, 200,
+            "Expected status code 200, received {}".format(r.status_code))
+
 
 if __name__ == "__main__":
     unittest.main()
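
One subtlety in the expected responses above: str.rstrip("/model") removes a trailing set of
characters ('/', 'm', 'o', 'd', 'e', 'l'), not the literal "/model" suffix. That is harmless
for the model names used in this test, but a suffix-aware strip is the safer pattern. A small
illustrative sketch (strip_suffix is hypothetical, not part of the test):

    def strip_suffix(path, suffix="/model"):
        """Remove an exact trailing suffix, unlike str.rstrip, which drops a character set."""
        return path[:-len(suffix)] if path.endswith(suffix) else path

    assert strip_suffix("/opt/ml/models/123456789abcdefg/model") == "/opt/ml/models/123456789abcdefg"
    # rstrip over-strips when the name itself ends in characters from the set:
    assert "/opt/ml/models/mymodel/model".rstrip("/model") == "/opt/ml/models/my"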
diff --git a/qa/L0_sagemaker/test.sh b/qa/L0_sagemaker/test.sh
index e701e8dd71..c98787d056 100755
--- a/qa/L0_sagemaker/test.sh
+++ b/qa/L0_sagemaker/test.sh
@@ -56,11 +56,12 @@ rm -f *.out
 
 SAGEMAKER_TEST=sagemaker_test.py
 SAGEMAKER_MULTI_MODEL_TEST=sagemaker_multi_model_test.py
-MULTI_MODEL_UNIT_TEST_COUNT=6
+MULTI_MODEL_UNIT_TEST_COUNT=7
 UNIT_TEST_COUNT=9
 CLIENT_LOG="./client.log"
 
 DATADIR=/data/inferenceserver/${REPO_VERSION}
+ENSEMBLEDIR=/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository
 SERVER=/opt/tritonserver/bin/tritonserver
 SERVER_LOG="./server.log"
 # Link model repository to "/opt/ml/model"
@@ -382,6 +383,34 @@ cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32/* ${MODEL1_PATH} && \
 cp -r $DATADIR/qa_identity_model_repository/onnx_zero_1_float32/* ${MODEL2_PATH} && \
     sed -i "s/onnx_zero_1_float32/sm_mme_model_2/" ${MODEL2_PATH}/config.pbtxt
 
+# Ensemble model
+ENSEMBLE_MODEL_PATH="models/123456789ensemble/model"
+mkdir -p "${ENSEMBLE_MODEL_PATH}"
+
+model_name=python_float32_float32_float32
+
+mkdir -p ${ENSEMBLE_MODEL_PATH}/${model_name}/1 && \
+cp ../python_models/add_sub/model.py ${ENSEMBLE_MODEL_PATH}/${model_name}/1/. && \
+cp ../python_models/add_sub/config.pbtxt ${ENSEMBLE_MODEL_PATH}/${model_name}/.
+(cd ${ENSEMBLE_MODEL_PATH}/${model_name} && \
+    sed -i "s/label_filename:.*//" config.pbtxt && \
+    sed -i "0,/name:.*/{s/name:.*/name: \"${model_name}\"/}" config.pbtxt && \
+    echo "max_batch_size: 64" >> config.pbtxt)
+
+# Ensemble part
+mkdir -p ${ENSEMBLE_MODEL_PATH}/fan_${model_name}/1 && \
+    cp ../python_models/add_sub/model.py ${ENSEMBLE_MODEL_PATH}/fan_${model_name}/1/. && \
+    cp ../python_models/fan_add_sub/config.pbtxt ${ENSEMBLE_MODEL_PATH}/fan_${model_name}/. && \
+    (cd ${ENSEMBLE_MODEL_PATH}/fan_${model_name} && \
+        sed -i "s/label_filename:.*//" config.pbtxt && \
+        sed -i "s/model_name: \"ENSEMBLE_MODEL_NAME\"/model_name: \"${model_name}\"/" config.pbtxt && \
+        sed -i "0,/name:.*/{s/name:.*/name: \"fan_${model_name}\"/}" config.pbtxt && \
+        echo "max_batch_size: 64" >> config.pbtxt)
+
+# # custom float32 component of ensemble
+cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 ${ENSEMBLE_MODEL_PATH}/. && \
+    mkdir -p ${ENSEMBLE_MODEL_PATH}/nop_TYPE_FP32_-1/1
+
 # Start server with 'serve' script
 export SAGEMAKER_MULTI_MODEL=true
 export SAGEMAKER_TRITON_LOG_VERBOSE=true
@@ -423,10 +452,8 @@ rm -rf /opt/ml/models
 
 kill $SERVER_PID
 wait $SERVE_PID
-
 # MME end
-
 unlink /opt/ml/model
 rm -rf /opt/ml/model
 
diff --git a/src/sagemaker_server.cc b/src/sagemaker_server.cc
index dd65671d1c..7c6203d5a5 100644
--- a/src/sagemaker_server.cc
+++ b/src/sagemaker_server.cc
@@ -226,7 +226,8 @@ SagemakerAPIServer::Handle(evhtp_request_t* req)
       evhtp_send_reply(req, EVHTP_RES_NOTFOUND); /* 404*/
       return;
     }
-    LOG_VERBOSE(1) << "SM MME Custom Invoke Model" << std::endl;
+    LOG_VERBOSE(1) << "SageMaker MME Custom Invoke Model Path"
+                   << std::endl;
     SageMakerMMEHandleInfer(req, multi_model_name, model_version_str_);
     return;
   }
@@ -331,6 +332,16 @@ SagemakerAPIServer::ParseSageMakerRequest(
   }
 
   (*parse_map)["model_name"] = model_name_string.c_str();
 
+  /* Extract targetModel to log the associated archive */
+
+  /* Read headers*/
+  (*parse_map)["TargetModel"] = "targetModel.tar.gz";
+
+  const char* targetModel =
+      evhtp_kv_find(req->headers_in, "X-Amzn-SageMaker-Target-Model");
+
+  LOG_INFO << "Loading SageMaker TargetModel: " << targetModel << std::endl;
+
   return;
 }
 
@@ -383,12 +394,15 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete(
     evthr_defer(infer_request->thread_, OKReplyCallback, infer_request);
   } else {
     EVBufferAddErrorJson(infer_request->req_->buffer_out, err);
-    TRITONSERVER_ErrorDelete(err);
     if (SageMakerMMECheckOOMError(err) == true) {
+      LOG_VERBOSE(1)
+          << "Received an OOM error during INVOKE MODEL. Returning a 507."
+          << std::endl;
       evthr_defer(infer_request->thread_, BADReplyCallback507, infer_request);
     } else {
       evthr_defer(infer_request->thread_, BADReplyCallback, infer_request);
     }
+    TRITONSERVER_ErrorDelete(err);
   }
 
   LOG_TRITONSERVER_ERROR(
@@ -429,6 +443,11 @@ SagemakerAPIServer::SageMakerMMEHandleInfer(
     return;
   }
 
+  /* Extract targetModel to log the associated archive */
+  const char* targetModel =
+      evhtp_kv_find(req->headers_in, "X-Amzn-SageMaker-Target-Model");
+  LOG_INFO << "Invoking SageMaker TargetModel: " << targetModel << std::endl;
+
   bool connection_paused = false;
 
   int64_t requested_model_version;
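
With the InferResponseComplete change above, an out-of-memory failure during an invocation
surfaces as HTTP 507 rather than a generic error, and TRITONSERVER_ErrorDelete now runs only
after the OOM check, so the error message is still available when it is classified. A caller
can treat that status as a capacity signal; the following Python sketch is illustrative only
(the URL layout follows the MME test above, and the recovery step is an assumption, not part
of the patch):

    import requests

    def invoke_once(base_url, model_name, request_body, headers):
        """Return the response, or None when the server reports 507 (out of memory)."""
        r = requests.post("{}/{}/invoke".format(base_url, model_name),
                          data=request_body, headers=headers)
        if r.status_code == 507:
            # Memory was exhausted while running the model; free capacity
            # (for example, unload an idle model) before retrying.
            return None
        r.raise_for_status()
        return r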
@@ -616,19 +635,42 @@ SagemakerAPIServer::SageMakerMMEUnloadModel(
   std::lock_guard<std::mutex> lock(mutex_);
 
   if (sagemaker_models_list_.find(model_name) == sagemaker_models_list_.end()) {
-    LOG_VERBOSE(1) << "Model " << model_name << "is not loaded." << std::endl;
+    LOG_VERBOSE(1) << "Model " << model_name << " is not loaded."
+                   << std::endl;
     evhtp_send_reply(req, EVHTP_RES_NOTFOUND); /* 404*/
     return;
   }
 
-  HandleRepositoryControl(req, "", model_name, "unload");
+  /* Always unload dependents as well - this is required to unload dependents in
+   * ensemble */
+  triton::common::TritonJson::Value request_parameters(
+      triton::common::TritonJson::ValueType::OBJECT);
+  triton::common::TritonJson::Value unload_parameter(
+      request_parameters, triton::common::TritonJson::ValueType::OBJECT);
 
-  std::string repo_path = sagemaker_models_list_.at(model_name);
+  unload_parameter.AddBool("unload_dependents", true);
+  request_parameters.Add("parameters", std::move(unload_parameter));
 
-  std::string repo_parent_path, subdir, customer_subdir;
-  RE2::FullMatch(
-      repo_path, model_path_regex_, &repo_parent_path, &subdir,
-      &customer_subdir);
+  const char* buffer;
+  size_t byte_size;
+
+  triton::common::TritonJson::WriteBuffer json_buffer_;
+  json_buffer_.Clear();
+  request_parameters.Write(&json_buffer_);
+
+  byte_size = json_buffer_.Size();
+  buffer = json_buffer_.Base();
+
+  evbuffer_add(req->buffer_in, buffer, byte_size);
+
+  /* Extract targetModel to log the associated archive */
+  const char* targetModel =
+      evhtp_kv_find(req->headers_in, "X-Amzn-SageMaker-Target-Model");
+
+  LOG_INFO << "Unloading SageMaker TargetModel: " << targetModel << std::endl;
+
+  HandleRepositoryControl(req, "", model_name, "unload");
+
+  std::string repo_parent_path = sagemaker_models_list_.at(model_name);
 
   TRITONSERVER_Error* unload_err =
       TRITONSERVER_ServerUnregisterModelRepository(
           server_.get(), repo_parent_path.c_str());
@@ -636,6 +678,8 @@ SagemakerAPIServer::SageMakerMMEUnloadModel(
   if (unload_err != nullptr) {
     EVBufferAddErrorJson(req->buffer_out, unload_err);
     evhtp_send_reply(req, EVHTP_RES_BADREQ);
+    LOG_ERROR << "Unable to unregister model repository for path: "
+              << repo_parent_path << std::endl;
     TRITONSERVER_ErrorDelete(unload_err);
   }
 
@@ -646,6 +690,8 @@ void
 SagemakerAPIServer::SageMakerMMEGetModel(
     evhtp_request_t* req, const char* model_name)
 {
+  std::lock_guard<std::mutex> lock(mutex_);
+
   if (sagemaker_models_list_.find(model_name) == sagemaker_models_list_.end()) {
     evhtp_send_reply(req, EVHTP_RES_NOTFOUND); /* 404*/
     return;
   }
@@ -675,6 +721,8 @@ SagemakerAPIServer::SageMakerMMEGetModel(
 void
 SagemakerAPIServer::SageMakerMMEListModel(evhtp_request_t* req)
 {
+  std::lock_guard<std::mutex> lock(mutex_);
+
   triton::common::TritonJson::Value sagemaker_list_json(
       triton::common::TritonJson::ValueType::OBJECT);
 
@@ -721,20 +769,31 @@ SagemakerAPIServer::SageMakerMMECheckOOMError(TRITONSERVER_Error* err)
   const char* message = TRITONSERVER_ErrorMessage(err);
   std::string error_string(message);
 
+  LOG_VERBOSE(1) << "Logging Verbose Error: " << std::endl
+                 << error_string.c_str() << std::endl;
+
   const std::vector<std::string> error_messages{
       "CUDA out of memory",  /* pytorch */
       "CUDA_OUT_OF_MEMORY",  /* tensorflow */
       "Out of memory",       /* generic */
+      "Out Of Memory",
       "out of memory",
       "MemoryError",
+      "OutOfMemory",
+      "OOM",
       "Dst tensor is not initialized",
       "Src tensor is not initialized",
       "CNMEM_STATUS_OUT_OF_MEMORY",
-      "CUDNN_STATUS_NOT_INITIALIZED"};
+      "CUDNN_STATUS_NOT_INITIALIZED",
+      "CUBLAS_STATUS_ALLOC_FAILED"};
 
+  /*
+  TODO: Improve the search to do pattern match on whole words only
+  */
   for (long unsigned int i = 0; i < error_messages.size(); i++) {
     if (error_string.find(error_messages[i]) != std::string::npos) {
-      LOG_VERBOSE(1) << "OOM strings detected in logs.";
+      LOG_VERBOSE(1) << "OOM string '" << error_messages[i].c_str()
+                     << "' detected in logs.";
       return true;
     }
   }
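
The OOM classifier above does a plain substring scan, and the new TODO notes that it should
eventually match whole words only (a short token such as "OOM" can occur inside an unrelated
word). One way to tighten it, sketched here in Python with a few of the tokens from the list
above rather than the server's C++/RE2 code, is to put word boundaries around the short
patterns:

    import re

    OOM_PATTERNS = [
        r"CUDA out of memory", r"CUDA_OUT_OF_MEMORY", r"\bOut of memory\b",
        r"\bout of memory\b", r"MemoryError", r"OutOfMemory", r"\bOOM\b",
        r"CNMEM_STATUS_OUT_OF_MEMORY", r"CUDNN_STATUS_NOT_INITIALIZED",
        r"CUBLAS_STATUS_ALLOC_FAILED",
    ]

    def looks_like_oom(error_message):
        """Word-boundary variant of the substring check (illustrative sketch)."""
        return any(re.search(p, error_message) for p in OOM_PATTERNS)

    assert looks_like_oom("CUDA error: OOM while allocating tensor")
    # A bare substring search would flag this message, because "ROOM" contains "OOM":
    assert not looks_like_oom("failed to open file ROOM_layout.onnx")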
@@ -776,7 +835,7 @@ SagemakerAPIServer::SageMakerMMELoadModel(
   DIR* dir;
   struct dirent* ent;
   int dir_count = 0;
-  std::string model_subdir;
+  std::string model_subdir, ensemble_model_subdir;
 
   if ((dir = opendir(repo_path.c_str())) != NULL) {
     while ((ent = readdir(dir)) != NULL) {
@@ -785,21 +844,56 @@ SagemakerAPIServer::SageMakerMMELoadModel(
         dir_count += 1;
         model_subdir = std::string(ent->d_name);
       }
-      if (dir_count > 1) {
-        HTTP_RESPOND_IF_ERR(
-            req,
-            TRITONSERVER_ErrorNew(
-                TRITONSERVER_ERROR_INTERNAL,
-                "More than one version or model directories found. Note that "
-                "hidden folders are not allowed and "
-                "Ensemble models are not supported in SageMaker MME mode."));
-        closedir(dir);
-        return;
+
+      if (dir_count >= 2) {
+        LOG_VERBOSE(1) << "More than one model detected in archive. "
+                          "Checking if it is an ensemble."
+                       << std::endl;
+      }
+
+      LOG_VERBOSE(1) << "Reading model sub-directory: " << model_subdir.c_str()
+                     << std::endl;
+
+      // Read the config.pbtxt file at each path, if available
+      std::string ensemble_config_path =
+          repo_path + "/" + model_subdir + "/" + "config.pbtxt";
+      std::ifstream config_fstream(ensemble_config_path);
+      std::stringstream ensemble_config_content;
+
+      if (config_fstream.is_open()) {
+        ensemble_config_content << config_fstream.rdbuf();
+      } else {
+        continue;  // A valid config.pbtxt does not exist at this path, or
+                   // cannot be read
+      }
+
+      /* Compare matched string with `platform: "ensemble"` or
+       * `platform:"ensemble"`. If present, we break, and use the model_subdir
+       * to load the ensemble model
+       */
+      std::string detected_ensemble_regex;
+      if (RE2::PartialMatch(
+              ensemble_config_content.str(), platform_ensemble_regex_,
+              &detected_ensemble_regex)) {
+        LOG_INFO << "SageMaker front-end detected an Ensemble config at path: "
+                 << ensemble_config_path << std::endl;
+        ensemble_model_subdir = model_subdir;
+      }
+
+      if (dir_count > 5) {
+        LOG_WARNING
+            << "Several model directories found. If using ensemble, smaller "
+               "ensembles are recommended for better memory management."
+            << std::endl;
+      }
     }
     closedir(dir);
   }
 
+  if (!strcmp(ensemble_model_subdir.c_str(), "") == 0) {
+    model_subdir = ensemble_model_subdir;
+  }
+
   std::vector subdir_modelname_map;
 
   /* Split repo path into three parts:
@@ -877,7 +971,7 @@ SagemakerAPIServer::SageMakerMMELoadModel(
   } else {
     std::lock_guard<std::mutex> lock(mutex_);
 
-    sagemaker_models_list_.emplace(model_name, repo_path);
+    sagemaker_models_list_.emplace(model_name, repo_parent_path);
     evhtp_send_reply(req, EVHTP_RES_OK);
   }
 
diff --git a/src/sagemaker_server.h b/src/sagemaker_server.h
index 59d6008129..0fe6e13a13 100644
--- a/src/sagemaker_server.h
+++ b/src/sagemaker_server.h
@@ -26,7 +26,7 @@
 #pragma once
 
 #include
-
+#include
 #include
 
 #include "common.h"
@@ -76,6 +76,7 @@ class SagemakerAPIServer : public HTTPAPIServer {
         models_regex_(R"(/models(?:/)?([^/]+)?(/invoke)?)"),
         model_path_regex_(
             R"((\/opt\/ml\/models\/[0-9A-Za-z._]+)\/(model)\/?([0-9A-Za-z._]+)?)"),
+        platform_ensemble_regex_(R"(platform:(\s)*\"ensemble\")"),
         ping_mode_("ready"),
         model_name_(GetEnvironmentVariableOrDefault(
             "SAGEMAKER_TRITON_DEFAULT_MODEL_NAME",
@@ -139,6 +140,7 @@ class SagemakerAPIServer : public HTTPAPIServer {
   re2::RE2 invocations_regex_;
   re2::RE2 models_regex_;
   re2::RE2 model_path_regex_;
+  re2::RE2 platform_ensemble_regex_;
 
   const std::string ping_mode_;
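
Taken together, the SageMakerMMELoadModel changes scan the archive's immediate
sub-directories, read each config.pbtxt, and prefer a directory whose config declares
platform: "ensemble" as the model to load; the matching unload path then passes
unload_dependents so the composing models are removed along with the ensemble. A standalone
Python sketch of that selection walk, for illustration only (the function name and the
simplified regex are assumptions; this is not the server code):

    import os
    import re

    PLATFORM_ENSEMBLE = re.compile(r'platform:\s*"ensemble"')

    def pick_model_subdir(repo_path):
        """Return the sub-directory to load; an ensemble config wins if one is present."""
        model_subdir = None
        ensemble_subdir = None
        for entry in sorted(os.listdir(repo_path)):
            subdir = os.path.join(repo_path, entry)
            if not os.path.isdir(subdir) or entry.startswith("."):
                continue
            model_subdir = entry
            config = os.path.join(subdir, "config.pbtxt")
            if os.path.isfile(config):
                with open(config) as f:
                    if PLATFORM_ENSEMBLE.search(f.read()):
                        ensemble_subdir = entry
        return ensemble_subdir or model_subdir

    # For the QA layout assembled in test.sh above, this should pick the fan_* ensemble
    # directory rather than one of its composing models.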