
Commit 3e86d7f

Merge 967a5f7 into 98ca286 (2 parents: 98ca286 + 967a5f7)

File tree: 3 files changed, +133 -6 lines changed
New test file — Lines changed: 113 additions & 0 deletions

@@ -0,0 +1,113 @@
import os
import pathlib

import pytest
import requests
import test_utils
import torch

CURR_FILE_PATH = os.path.dirname(os.path.realpath(__file__))
REPO_ROOT = os.path.normpath(os.path.join(CURR_FILE_PATH, "..", ".."))
MODELSTORE_DIR = os.path.join(REPO_ROOT, "model_store")
data_file_kitten = os.path.join(REPO_ROOT, "examples/image_classifier/kitten.jpg")
HF_TRANSFORMERS_EXAMPLE_DIR = os.path.join(
    REPO_ROOT, "examples/Huggingface_Transformers/"
)


def test_no_model_loaded():
    """
    Validates that TorchServe returns response code 404 if no model is loaded.
    """
    os.makedirs(MODELSTORE_DIR, exist_ok=True)  # Create model store directory
    test_utils.start_torchserve(model_store=MODELSTORE_DIR)

    response = requests.post(
        url="http://localhost:8080/models/alexnet/invoke",
        data=open(data_file_kitten, "rb"),
    )
    assert response.status_code == 404, "Model not loaded error expected"


@pytest.mark.skipif(
    not ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
    reason="Test to be run on GPU only",
)
def test_oom_on_model_load():
    """
    Validates that TorchServe returns response code 507 if there is OOM on model loading.
    """
    # Create model store directory
    pathlib.Path(test_utils.MODEL_STORE).mkdir(parents=True, exist_ok=True)

    # Start TorchServe
    test_utils.start_torchserve(no_config_snapshots=True)

    # Register model
    params = {
        "model_name": "BERTSeqClassification",
        "url": "https://torchserve.pytorch.org/mar_files/BERTSeqClassification.mar",
        "batch_size": 1,
        "initial_workers": 16,
    }
    response = test_utils.register_model_with_params(params)

    assert response.status_code == 507, "OOM Error expected"

    test_utils.stop_torchserve()


@pytest.mark.skipif(
    not ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
    reason="Test to be run on GPU only",
)
def test_oom_on_invoke():
    # Create model store directory
    pathlib.Path(test_utils.MODEL_STORE).mkdir(parents=True, exist_ok=True)

    # Start TorchServe
    test_utils.start_torchserve(no_config_snapshots=True)

    # Register model
    params = {
        "model_name": "BERTSeqClassification",
        "url": "https://torchserve.pytorch.org/mar_files/BERTSeqClassification.mar",
        "batch_size": 8,
        "initial_workers": 12,
    }
    response = test_utils.register_model_with_params(params)

    input_text = os.path.join(
        REPO_ROOT,
        "examples",
        "Huggingface_Transformers",
        "Seq_classification_artifacts",
        "sample_text_captum_input.txt",
    )

    # Chain 8 curl requests with && per iteration;
    # send multiple requests to make sure we hit OOM
    for i in range(10):
        response = os.popen(
            f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && "
            f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && "
            f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && "
            f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && "
            f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && "
            f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && "
            f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && "
            f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} "
        )
        response = response.read()

    # If OOM is hit, we expect code 507 to be present in the response string
    lines = response.split("\n")
    output = ""
    for line in lines:
        if "code" in line:
            output = line.strip()
            break
    assert output == '"code": 507,', "OOM Error expected"
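
For comparison, here is a minimal sketch (not part of the commit) of the same 507 check done with requests instead of shelling out to curl. The helper name invoke_returns_oom is hypothetical, and the sketch assumes TorchServe is already running on 127.0.0.1:8080 with BERTSeqClassification registered:

import requests

def invoke_returns_oom(input_path: str) -> bool:
    # TorchServe signals out-of-memory with HTTP status 507, so the status
    # code can be checked directly instead of grepping the response body
    # for '"code": 507,'.
    with open(input_path, "rb") as f:
        response = requests.post(
            url="http://127.0.0.1:8080/models/BERTSeqClassification/invoke",
            data=f,
        )
    return response.status_code == 507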

ts/model_service_worker.py

Lines changed: 8 additions & 0 deletions

@@ -144,6 +144,14 @@ def load_model(self, load_model_request):
             return service, "loaded model {}".format(model_name), 200
         except MemoryError:
             return None, "System out of memory", 507
+        except RuntimeError as ex:  # pylint: disable=broad-except
+            if "CUDA" in str(ex):
+                # Handles Case A: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED (close to OOM) and
+                # Case B: CUDA out of memory (OOM)
+                return None, "System out of memory", 507
+            else:
+                # Sanity test cases fail without this
+                return None, "Unknown exception", 500

     def handle_connection(self, cl_socket):
         """

ts/service.py

Lines changed: 12 additions & 6 deletions

@@ -132,15 +132,21 @@ def predict(self, batch):
         # noinspection PyBroadException
         try:
             ret = self._entry_point(input_batch, self.context)
-        except PredictionException as e:
-            logger.error("Prediction error", exc_info=True)
-            return create_predict_response(None, req_id_map, e.message, e.error_code)
         except MemoryError:
             logger.error("System out of memory", exc_info=True)
             return create_predict_response(None, req_id_map, "Out of resources", 507)
-        except Exception:  # pylint: disable=broad-except
-            logger.warning("Invoking custom service failed.", exc_info=True)
-            return create_predict_response(None, req_id_map, "Prediction failed", 503)
+        except PredictionException as e:
+            logger.error("Prediction error", exc_info=True)
+            return create_predict_response(None, req_id_map, e.message, e.error_code)
+        except Exception as ex:  # pylint: disable=broad-except
+            if "CUDA" in str(ex):
+                # Handles Case A: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED (close to OOM) and
+                # Case B: CUDA out of memory (OOM)
+                logger.error("CUDA out of memory", exc_info=True)
+                return create_predict_response(None, req_id_map, "Out of resources", 507)
+            else:
+                logger.warning("Invoking custom service failed.", exc_info=True)
+                return create_predict_response(None, req_id_map, "Prediction failed", 503)

         if not isinstance(ret, list):
             logger.warning(
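
For reference, a minimal sketch (assumes a CUDA device; the allocation size is illustrative) of the failure mode the broad handler targets. PyTorch raises a RuntimeError whose message contains "CUDA out of memory" when an allocation exceeds device memory, which is what the '"CUDA" in str(ex)' check keys on:

import torch

try:
    # Deliberately oversized allocation (2**40 float32 values, ~4 TiB).
    torch.empty(1 << 40, device="cuda")
except RuntimeError as ex:
    # Message looks like: "CUDA out of memory. Tried to allocate ..."
    assert "CUDA" in str(ex)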
