Add two flags (num_load_threads and num_unload_threads) to the model server; setting these flags addresses the problem of serving requests being delayed when the model is switched.

handong0123 · handong0123 · commit 238d8f4463e8 · 2021-01-15T16:10:40.000+08:00
diff --git a/tensorflow_serving/core/aspired_versions_manager.h b/tensorflow_serving/core/aspired_versions_manager.h
@@ -110,13 +110,13 @@ class AspiredVersionsManager : public Manager,
     ///
     /// If set as 0, we don't use a thread-pool, and servable loads are
     /// performed serially in the manager's main work loop.
-    uint32 num_load_threads = 1;
+    uint32 num_load_threads = 0;
 
     /// The number of threads in the thread-pool used to unload servables.
     ///
     /// If set as 0, we don't use a thread-pool, and servable unloads are
     /// performed serially in the manager's main work loop.
-    uint32 num_unload_threads = 1;
+    uint32 num_unload_threads = 0;
 
     /// Maximum number of times we retry loading a servable, after the first
     /// failure, before we give up.
diff --git a/tensorflow_serving/core/basic_manager.h b/tensorflow_serving/core/basic_manager.h
@@ -118,12 +118,12 @@ class BasicManager : public Manager {
     // The number of threads in the thread-pool used to load servables.
     //
     // If set as 0, we don't use a thread-pool, and LoadServable() blocks.
-    uint32 num_load_threads = 1;
+    uint32 num_load_threads = 0;
 
     // The number of threads in the thread-pool used to unload servables.
     //
     // If set as 0, we don't use a thread-pool, and UnloadServable() blocks.
-    uint32 num_unload_threads = 1;
+    uint32 num_unload_threads = 0;
 
     // EventBus to publish servable state changes. This is optional, if unset,
     // we don't publish.
diff --git a/tensorflow_serving/core/caching_manager.h b/tensorflow_serving/core/caching_manager.h
@@ -54,12 +54,12 @@ class CachingManager : public Manager {
     // The number of threads in the thread-pool used to load servables.
     //
     // If set as 0, we don't use a thread-pool, and LoadServable() blocks.
-    uint32 num_load_threads = 1;
+    uint32 num_load_threads = 0;
 
     // The number of threads in the thread-pool used to unload servables.
     //
     // If set as 0, we don't use a thread-pool.
-    uint32 num_unload_threads = 1;
+    uint32 num_unload_threads = 0;
 
     // EventBus to publish servable state changes. This is optional, if unset,
     // we don't publish.
diff --git a/tensorflow_serving/model_servers/main.cc b/tensorflow_serving/model_servers/main.cc
@@ -105,6 +105,18 @@ int main(int argc, char** argv) {
       tensorflow::Flag("model_base_path", &options.model_base_path,
                        "path to export (ignored if --model_config_file flag "
                        "is set, otherwise required)"),
+      tensorflow::Flag("num_load_threads", &options.num_load_threads,
+                       "The number of threads in the thread-pool used to "
+                       "load servables. If set as 0, we don't use a "
+                       "thread-pool, and servable loads are performed "
+                       "serially in the manager's main work loop, which may "
+                       "cause serving requests to be delayed. Default: 0"),
+      tensorflow::Flag("num_unload_threads", &options.num_unload_threads,
+                       "The number of threads in the thread-pool used to "
+                       "unload servables. If set as 0, we don't use a "
+                       "thread-pool, and servable unloads are performed "
+                       "serially in the manager's main work loop, which may "
+                       "cause serving requests to be delayed. Default: 0"),
       tensorflow::Flag("max_num_load_retries", &options.max_num_load_retries,
                        "maximum number of times it retries loading a model "
                        "after the first failure, before giving up. "
diff --git a/tensorflow_serving/model_servers/server.cc b/tensorflow_serving/model_servers/server.cc
@@ -289,6 +289,8 @@ Status Server::BuildAndStart(const Options& server_options) {
   options.custom_model_config_loader = &LoadCustomModelConfig;
   options.aspired_version_policy =
       std::unique_ptr<AspiredVersionPolicy>(new AvailabilityPreservingPolicy);
+  options.num_load_threads = server_options.num_load_threads;
+  options.num_unload_threads = server_options.num_unload_threads;
   options.max_num_load_retries = server_options.max_num_load_retries;
   options.load_retry_interval_micros =
       server_options.load_retry_interval_micros;
diff --git a/tensorflow_serving/model_servers/server.h b/tensorflow_serving/model_servers/server.h
@@ -60,6 +60,8 @@ class Server {
     float per_process_gpu_memory_fraction = 0;
     tensorflow::string batching_parameters_file;
     tensorflow::string model_name;
+    tensorflow::int32 num_load_threads = 0;
+    tensorflow::int32 num_unload_threads = 0;
     tensorflow::int32 max_num_load_retries = 5;
     tensorflow::int64 load_retry_interval_micros = 1LL * 60 * 1000 * 1000;
     tensorflow::int32 file_system_poll_wait_seconds = 1;