Add CPU only to compose script (#3365)
- Enabled the compose script to compose CPU-only containers.
- Fixed upstream-version confusion so that the compose script works on all branches.
- Added documentation.
jbkyang-nvi authored Sep 15, 2021
1 parent aa05498 commit 0801098
Showing 3 changed files with 153 additions and 56 deletions.
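In practice, this change makes the `--enable-gpu` flag optional: dropping it composes a CPU-only image. A representative invocation (the backend choice here is illustrative; see the documentation changes below):

```
python3 compose.py --backend onnxruntime --repoagent checksum
```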
4 changes: 3 additions & 1 deletion build.py
@@ -742,6 +742,7 @@ def create_dockerfile_linux(ddir, dockerfile_name, argmap, backends, repoagents,


def dockerfile_prepare_container_linux(argmap, backends, enable_gpu):
gpu_enabled = 1 if enable_gpu else 0
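# gpu_enabled is baked into the image via the ENV line added below so that
# compose.py can later distinguish GPU builds from CPU-only builds.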
# Common steps to produce docker images shared by build.py and compose.py.
# Sets environment variables, installs dependencies and adds entrypoint
df = '''
@@ -760,6 +761,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu):
ENV TF_ADJUST_SATURATION_FUSED 1
ENV TF_ENABLE_WINOGRAD_NONFUSED 1
ENV TF_AUTOTUNE_THRESHOLD 2
ENV TRITON_SERVER_GPU_ENABLED {gpu_enabled}
# Create a user that can be used to run triton as
# non-root. Make sure that this user is given ID 1000. All server
@@ -788,7 +790,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu):
curl \
{ort_dependencies} && \
rm -rf /var/lib/apt/lists/*
'''.format(ort_dependencies=ort_dependencies)
'''.format(gpu_enabled=gpu_enabled, ort_dependencies=ort_dependencies)

if enable_gpu:
df += install_dcgm_libraries(argmap['DCGM_VERSION'])
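The `TRITON_SERVER_GPU_ENABLED` variable introduced above is what lets `compose.py` tell GPU and CPU-only full images apart. A minimal standalone sketch of that readback, assuming Docker is installed locally and `image` names an already-pulled image:

```
import subprocess

def image_gpu_enabled(image):
    # Print every ENV entry of the image, one per line.
    out = subprocess.run(
        ['docker', 'inspect', '-f',
         '{{range .Config.Env}}{{println .}}{{end}}', image],
        capture_output=True, text=True, check=True).stdout
    for line in out.splitlines():
        if line.startswith('TRITON_SERVER_GPU_ENABLED='):
            return line.split('=', 1)[1] == '1'
    # Images built before this commit predate the flag; fall back to the
    # CUDA_VERSION heuristic that compose.py also applies.
    return any(l.startswith('CUDA_VERSION=') for l in out.splitlines())
```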
148 changes: 97 additions & 51 deletions compose.py
@@ -56,7 +56,7 @@ def fail_if(p, msg):
fail(msg)


def start_gpu_dockerfile(ddir, images, argmap, dockerfile_name, backends):
def start_dockerfile(ddir, images, argmap, dockerfile_name, backends):
# Set environment variables, set default user and install dependencies
df = '''
#
@@ -106,15 +106,16 @@ def add_requested_repoagents(ddir, dockerfile_name, repoagents):
for ra in repoagents:
df += '''COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/{} /opt/tritonserver/repoagents/{}
'''.format(ra, ra)
df += '''
if len(repoagents) > 0:
df += '''
# Top-level /opt/tritonserver/repoagents not copied so need to explicitly set permissions here
RUN chown triton-server:triton-server /opt/tritonserver/repoagents
'''
with open(os.path.join(ddir, dockerfile_name), "a") as dfile:
dfile.write(df)


def end_gpu_dockerfile(ddir, dockerfile_name, argmap):
def end_dockerfile(ddir, dockerfile_name, argmap):
# Install additional dependencies
df = ""
if argmap['SAGEMAKER_ENDPOINT']:
@@ -140,51 +141,90 @@ def get_container_version_if_not_specified():
with open('TRITON_VERSION', "r") as vfile:
version = vfile.readline().strip()
import build
FLAGS.container_version, upstream_container_version = build.get_container_versions(
version, FLAGS.container_version, "")
current_container_version, FLAGS.container_version = build.get_container_versions(
version, None, FLAGS.container_version)
log('version {}'.format(version))
log('using container version {}'.format(FLAGS.container_version))


def create_argmap(images):
# Extract information from the upstream build and create a map other
# functions can use
upstreamDockerImage = images["full"]
full_docker_image = images["full"]
min_docker_image = images["min"]
enable_gpu = FLAGS.enable_gpu
# Docker inspect environment variables
base_run_args = ['docker', 'inspect', '-f']
import re # parse all PATH environment variables

# first pull docker image
log("pulling container:{}".format(upstreamDockerImage))
p = subprocess.run(['docker', 'pull', upstreamDockerImage])
# first pull docker images
log("pulling container:{}".format(full_docker_image))
p = subprocess.run(['docker', 'pull', full_docker_image])
fail_if(
p.returncode != 0,
'docker pull container {} failed, {}'.format(upstreamDockerImage,
'docker pull container {} failed, {}'.format(full_docker_image,
p.stderr))

baseRunArgs = ['docker', 'inspect', '-f']
p_version = subprocess.run(baseRunArgs + [
if enable_gpu:
pm = subprocess.run(['docker', 'pull', min_docker_image])
fail_if(
pm.returncode != 0, 'docker pull container {} failed, {}'.format(
min_docker_image, pm.stderr))
pm_path = subprocess.run(base_run_args + [
'{{range $index, $value := .Config.Env}}{{$value}} {{end}}',
min_docker_image
],
capture_output=True,
text=True)
fail_if(
pm_path.returncode != 0,
'docker inspect to find triton environment variables for min container failed, {}'
.format(pm_path.stderr))
# min container needs to be GPU enabled if the build is a GPU build
vars = pm_path.stdout
e = re.search("CUDA_VERSION", vars)
gpu_enabled = False if e == None else True
fail_if(
not gpu_enabled,
'\'enable-gpu\' flag specified but min container provided does not have CUDA installed'
)

# Check full container environment variables
p_path = subprocess.run(base_run_args + [
'{{range $index, $value := .Config.Env}}{{$value}} {{end}}',
upstreamDockerImage
full_docker_image
],
capture_output=True,
text=True)
vars = p_version.stdout
capture_output=True,
text=True)
fail_if(
p_path.returncode != 0,
'docker inspect to find environment variables for full container failed, {}'
.format(p_path.stderr))
vars = p_path.stdout
log_verbose("inspect args: {}".format(vars))
import re # parse all PATH environment variables

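# Determine whether the full image was built with GPU support: newer images
# carry TRITON_SERVER_GPU_ENABLED explicitly, while older GPU images are
# recognized by the presence of CUDA_VERSION.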
e0 = re.search("TRITON_SERVER_GPU_ENABLED=([\S]{1,}) ", vars)
e1 = re.search("CUDA_VERSION", vars)
gpu_enabled = False
if (e0 != None):
gpu_enabled = e0.group(1) == "1"
elif (e1 != None):
gpu_enabled = True
fail_if(
gpu_enabled != enable_gpu,
'Error: full container provided was built with \'enable_gpu\' as {} and you are composing a container with \'enable_gpu\' as {}'
.format(gpu_enabled, enable_gpu))
e = re.search("TRITON_SERVER_VERSION=([\S]{6,}) ", vars)
version = "" if e == None else e.group(1)
fail_if(
p_version.returncode != 0 or len(version) == 0,
'docker inspect to find triton version failed, {}'.format(
p_version.stderr))

vars = p_version.stdout
len(version) == 0,
'docker inspect to find triton server version failed, {}'.format(
p_path.stderr))
e = re.search("NVIDIA_TRITON_SERVER_VERSION=([\S]{5,}) ", vars)
container_version = "" if e == None else e.group(1)
fail_if(
len(container_version) == 0,
'docker inspect to find triton container version failed, {}'.format(
vars))

vars = p_version.stdout
dcgm_ver = re.search("DCGM_VERSION=([\S]{4,}) ", vars)
dcgm_version = ""
if dcgm_ver == None:
@@ -197,27 +237,27 @@ def create_argmap(images):
len(dcgm_version) == 0,
'docker inspect to find DCGM version failed, {}'.format(vars))

p_sha = subprocess.run(baseRunArgs + [
'{{ index .Config.Labels "com.nvidia.build.ref"}}', upstreamDockerImage
],
capture_output=True,
text=True)
p_sha = subprocess.run(
base_run_args +
['{{ index .Config.Labels "com.nvidia.build.ref"}}', full_docker_image],
capture_output=True,
text=True)
fail_if(
p_sha.returncode != 0,
'docker inspect of upstream docker image build sha failed, {}'.format(
p_sha.stderr))
p_build = subprocess.run(baseRunArgs + [
'{{ index .Config.Labels "com.nvidia.build.id"}}', upstreamDockerImage
],
capture_output=True,
text=True)
p_build = subprocess.run(
base_run_args +
['{{ index .Config.Labels "com.nvidia.build.id"}}', full_docker_image],
capture_output=True,
text=True)
fail_if(
p_build.returncode != 0,
'docker inspect of upstream docker image build id failed, {}'.format(
p_build.stderr))

p_find = subprocess.run(
['docker', 'run', upstreamDockerImage, 'bash', '-c', 'ls /usr/bin/'],
['docker', 'run', full_docker_image, 'bash', '-c', 'ls /usr/bin/'],
capture_output=True,
text=True)
f = re.search("serve", p_find.stdout)
@@ -298,10 +338,6 @@ def create_argmap(images):
)

FLAGS = parser.parse_args()
fail_if(
not FLAGS.enable_gpu,
"Only GPU versions are supported right now. Add --enable-gpu to compose.py command."
)

if FLAGS.work_dir is None:
FLAGS.work_dir = "."
@@ -329,24 +365,34 @@ def create_argmap(images):
images[parts[0]] = parts[1]
else:
get_container_version_if_not_specified()
images = {
"full":
"nvcr.io/nvidia/tritonserver:{}-py3".format(
FLAGS.container_version),
"min":
"nvcr.io/nvidia/tritonserver:{}-py3-min".format(
FLAGS.container_version)
}
if (FLAGS.enable_gpu):
images = {
"full":
"nvcr.io/nvidia/tritonserver:{}-py3".format(
FLAGS.container_version),
"min":
"nvcr.io/nvidia/tritonserver:{}-py3-min".format(
FLAGS.container_version)
}
else:
images = {
"full":
"nvcr.io/nvidia/tritonserver:{}-cpu-only-py3".format(
FLAGS.container_version),
"min":
"ubuntu:20.04"
}
fail_if(
len(images) != 2,
"Need to both specify 'full' and 'min' images if at all")

argmap = create_argmap(images)

start_gpu_dockerfile(FLAGS.work_dir, images, argmap, dockerfile_name,
FLAGS.backend)
start_dockerfile(FLAGS.work_dir, images, argmap, dockerfile_name,
FLAGS.backend)
add_requested_backends(FLAGS.work_dir, dockerfile_name, FLAGS.backend)
add_requested_repoagents(FLAGS.work_dir, dockerfile_name, FLAGS.repoagent)
end_gpu_dockerfile(FLAGS.work_dir, dockerfile_name, argmap)
end_dockerfile(FLAGS.work_dir, dockerfile_name, argmap)

if (not FLAGS.dry_run):
build_docker_image(FLAGS.work_dir, dockerfile_name, FLAGS.output_name)
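Since the Dockerfile is written out before the final `build_docker_image` step, the compose flow can be previewed without building anything; the flag name below is inferred from `FLAGS.dry_run` above and should be treated as an assumption:

```
python3 compose.py --backend onnxruntime --dry-run
```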
57 changes: 53 additions & 4 deletions docs/compose.md
@@ -41,9 +41,18 @@ from source to get more exact customization.

## Use the compose.py script

The `compose.py` script can be found in the [server repository](https://github.com/triton-inference-server/server). Simply clone the repository and run `compose.py` to create a custom container. Note that the created container version will depend on the branch that was cloned. For example, branch [r21.06](https://github.com/triton-inference-server/server/tree/r21.06) should be used to create an image based on the NGC 21.06 Triton release.

`compose.py` provides the `--backend` and `--repoagent` options that allow you to specify which backends and repository agents to include in the custom image. The `--enable-gpu` flag indicates that you want to create an image that supports NVIDIA GPUs. For example, the following creates a new docker image that contains only the TensorFlow 1 and TensorFlow 2 backends and the checksum repository agent.
The `compose.py` script can be found in the [server repository](https://github.com/triton-inference-server/server).
Simply clone the repository and run `compose.py` to create a custom container.
Note: the created container version will depend on the branch that was cloned.
For example, branch [r21.08](https://github.com/triton-inference-server/server/tree/r21.08)
should be used to create an image based on the NGC 21.08 Triton release.

`compose.py` provides the `--backend` and `--repoagent` options that allow you to
specify which backends and repository agents to include in the custom image.
The `--enable-gpu` flag indicates that you want to create an image that supports
NVIDIA GPUs. For example, the following creates a new docker image that
contains only the TensorFlow 1 and TensorFlow 2 backends and the checksum
repository agent.

Example:
```
@@ -54,7 +63,47 @@ will provide a container `tritonserver` locally. You can access the container with
$ docker run -it tritonserver:latest
```

Note: If `compose.py` is run on release versions `r21.08` and older, the resulting container will have DCGM version 2.2.3 installed. This may result in different GPU statistic reporting behavior.
Note: If `compose.py` is run on release versions `r21.08` and earlier,
the resulting container will have DCGM version 2.2.3 installed.
This may result in different GPU statistic reporting behavior.
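To confirm which DCGM version a composed image carries, one option (a hypothetical check, assuming the image was tagged `tritonserver:latest` and that the `DCGM_VERSION` environment variable read by `compose.py` is carried over from the full container) is to grep the image environment:

```
$ docker inspect -f '{{range .Config.Env}}{{println .}}{{end}}' tritonserver:latest | grep DCGM_VERSION
```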

### Compose a specific version of Triton

`compose.py` requires two containers: a `min` container, which is the
base the composed container is built from, and a `full` container, from which
the script extracts components. The versions of the `min` and `full` containers
are determined by the branch of Triton that `compose.py` is on.
For example, running
```
python3 compose.py --backend tensorflow1 --repoagent checksum --enable-gpu
```
on branch [r21.08](https://github.com/triton-inference-server/server/tree/r21.08) pulls:
- `min` container `nvcr.io/nvidia/tritonserver:21.08-py3-min`
- `full` container `nvcr.io/nvidia/tritonserver:21.08-py3`

Alternatively, users can specify the version of the Triton container to pull from any branch by either:
1. Adding the flag `--container-version <container version>`
```
python3 compose.py --backend tensorflow1 --repoagent checksum --container-version 21.08 --enable-gpu
```
2. Specifying `--image min,<min container image name> --image full,<full container image name>`.
The user is responsible for specifying compatible `min` and `full` containers.
```
python3 compose.py --backend tensorflow1 --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:21.08-py3-min --image full,nvcr.io/nvidia/tritonserver:21.08-py3 --enable-gpu
```
Methods 1 and 2 result in the same composed container. Furthermore, the `--image` flag overrides the `--container-version` flag when both are specified.

### CPU-only container composition

To compose a container built for CPU-only usage, simply remove the
`--enable-gpu` flag when running `compose.py` (see the example after the notes below).

This will build a container using the `ubuntu:20.04` Docker image as the `min` container
and `nvcr.io/nvidia/tritonserver:<upstream-container-version>-cpu-only-py3` as the `full` container.
Note:
1. When composing a CPU-only container, both the `min` and `full` containers should be built for CPU only and must not have CUDA installed.
2. CPU-only containers are only available for Triton versions > `21.09`.
3. CPU-only `full` containers are built with fewer backends than the GPU-enabled containers. The currently supported backends are `onnxruntime`, `openvino` and `python`.
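
For example, on a branch newer than r21.09 (the backend choice here is illustrative):
```
python3 compose.py --backend onnxruntime --repoagent checksum
```
This pulls `ubuntu:20.04` as the `min` container and the matching `-cpu-only-py3` image as the `full` container, then composes a CPU-only `tritonserver` image.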

## Build it yourself

