
Commit 2f1f52f

HamidShojanazeri authored
Pippy deferred init (#2310)
* remove HF auth
* update steps
* add model checkpoint path
* add deferred init
* fix keys
* clean up
* adding torchpippy
* add comment for checkpoint path
* add checks for configs
* fixing thread numbers
* fixing max_new_tokens
* adding max_new_tokens
* fix padding
* revert tokenizer changes
* fixing the response size
* making index file optional
* fixing new tokens
* fixing the output issue
* add check for torch version
* fixing the index file path
* extend the word list
* moving the script to parent directory
* change the path to download script
* moving to utils
* adding utils
* allowing only related patterns
* setting default chunks to 1

Co-authored-by: Ubuntu <ubuntu@ip-172-31-9-21.us-west-2.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-5-255.us-west-2.compute.internal>
1 parent 614bfc0 commit 2f1f52f

File tree

* examples/large_models/Huggingface_pippy/Readme.md
* examples/large_models/Huggingface_pippy/model-config.yaml
* examples/large_models/Huggingface_pippy/pippy_handler.py
* examples/large_models/Huggingface_pippy/requirements.txt
* examples/large_models/utils/Download_model.py (renamed from examples/large_models/Huggingface_pippy/Download_model.py)
* ts/handler_utils/distributed/pt_pippy.py
* ts_scripts/spellcheck_conf/wordlist.txt

7 files changed: +121 −38 lines
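The core of this change is deferred initialization: the handler now builds the model under `torch.device("meta")`, so parameters exist only as shape/dtype metadata and no weight memory is allocated up front; PiPPy later materializes each pipeline stage from the sharded checkpoint referenced by `model_path` and `index_filename`. A condensed sketch of that pattern, mirroring the handler diff below (the snapshot path is the one printed in Step 1 of the Readme and is purely illustrative):

```python
# Sketch of deferred (meta-device) initialization as used in pippy_handler.py below.
# The snapshot path is illustrative; substitute your own checkpoint directory.
import torch
from transformers import AutoModelForCausalLM

model_path = "model/models--facebook--opt-30b/snapshots/ceea0a90ac0f6fae7c2c34bcb40477438c152546"

with torch.device("meta"):
    # Parameters are created as meta tensors: shapes and dtypes only, no weight storage.
    model = AutoModelForCausalLM.from_pretrained(
        model_path, use_cache=False, torch_dtype=torch.float16
    )

# Real weights are loaded later, stage by stage, when get_pipeline_driver() builds the
# PiPPy pipeline from the checkpoint index file (pytorch_model.bin.index.json).
```

This is also why the handler now logs a warning on PyTorch versions below 2.0.0, where the meta-device context manager is not available.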

examples/large_models/Huggingface_pippy/Readme.md

Lines changed: 19 additions & 12 deletions
@@ -6,23 +6,25 @@ PiPPy provides pipeline parallelism for serving large models that would not fit
 
 ## How to serve your large HuggingFace models with PiPPy in Torchserve?
 
-We use a Torchserve custom handler that inherits from base_pippy_handler to load the model and define our logic for preprocess, inference and post processing. This is basically very similar to your evaluation process.
+We use a Torchserve custom handler that inherits from base_pippy_handler to load the model and define our logic for preprocess, inference and post processing. This is basically very similar to your evaluation process. The following settings have been tested on a g5.12xlarge EC2 instance, which has 4x A10 GPUs.
 
-### Step 1: Download model
+To run this example we need torchpippy installed. It has been added to requirements.txt, which can be bundled during model packaging.
 
-Login into huggingface hub with token by running the below command
+Generally, to install torchpippy you can run the following:
 
 ```bash
-huggingface-cli login
+pip install torchpippy
+
 ```
-paste the token generated from huggingface hub.
+
+### Step 1: Download model
 
 ```bash
-python Download_model.py --model_name facebook/opt-6.7b
+python ../utils/Download_model.py --model_name facebook/opt-30b
 ```
 The script prints the path where the model is downloaded as below. This is an example and in your workload you want to use your actual trained model checkpoints.
 
-`model/models--bigscience-bloom-7b1/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/`
+`model/models--facebook--opt-30b/snapshots/ceea0a90ac0f6fae7c2c34bcb40477438c152546/`
 
 The downloaded model is around 14GB.
@@ -46,37 +48,42 @@ pippy:
     input_names: ['input_ids'] # input arg names to the model, this is required for FX tracing
     model_type: "HF" # set the model type to HF if you are using Huggingface model other wise leave it blank or any other model you use.
     rpc_timeout: 1800
+    num_worker_threads: 512 # number of threads for the RPC worker; 512 is usually a good number
 
 handler:
     max_length: 80 # max length of tokens for tokenizer in the handler
+    model_name: "/home/ubuntu/serve/examples/large_models/Huggingface_pippy/model/models--facebook--opt-30b/snapshots/ceea0a90ac0f6fae7c2c34bcb40477438c152546" # path to the checkpoints, here the downloaded files. Please change it to your model path.
+    index_file_name: 'pytorch_model.bin.index.json' # index json file name in the model checkpoint folder that keeps information about the distributed checkpoints
+    manual_seed: 40
+    dtype: fp16 # data type to load your model checkpoint; supported: fp32, fp16, bf16
 ```
 
 ### Step 3: Generate Tar/ MAR file
 
-Navigate up to `Huggingface_Largemodels` directory.
+Navigate up to the `largemodels` directory. Because bundling the large model checkpoints is very time consuming, we pass the model checkpoint path in model-config.yaml as shown above. This makes packaging very fast; for production settings, the large models can be kept in a shared location and referenced from there in the model-config.
 
 ```bash
-torch-model-archiver --model-name bloom --version 1.0 --handler pippy_handler.py --extra-files model/models--facebook--opt-iml-max-1.3b/snapshots/d60fa58f50def19751da2075791da359ca19d273 -r requirements.txt --config-file model-config.yaml --archive-format tgz
+torch-model-archiver --model-name opt --version 1.0 --handler pippy_handler.py -r requirements.txt --config-file model-config.yaml --archive-format tgz
 
 ```
 
 ### Step 4: Add the mar file to model store
 
 ```bash
 mkdir model_store
-mv bloom.mar model_store
+mv opt.tar.gz model_store
 ```
 
 ### Step 5: Start torchserve
 
 Update config.properties and start torchserve
 
 ```bash
-torchserve --ncs --start --model-store model_store --models bloom.mar
+torchserve --ncs --start --model-store model_store --models opt.tar.gz
 ```
 
 ### Step 6: Run inference
 
 ```bash
-curl -v "http://localhost:8080/predictions/bloom" -T sample_text.txt
+curl -v "http://localhost:8080/predictions/opt" -T sample_text.txt
 ```
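The Step 6 curl call can also be issued from Python; a minimal sketch, not part of this commit (the `opt` endpoint name and the `sample_text.txt` input file come from the steps above):

```python
# Rough Python equivalent of the Step 6 curl request (sketch only).
import requests

with open("sample_text.txt", "rb") as f:
    response = requests.post("http://localhost:8080/predictions/opt", data=f)

print(response.status_code)
print(response.text)  # text generated by the pippy handler
```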

examples/large_models/Huggingface_pippy/model-config.yaml

Lines changed: 7 additions & 3 deletions
@@ -1,8 +1,8 @@
 #frontend settings
 minWorkers: 1
 maxWorkers: 1
-maxBatchDelay: 100
-responseTimeout: 120
+maxBatchDelay: 200
+responseTimeout: 300
 parallelType: "pp"
 deviceType: "gpu"
 torchrun:
@@ -14,8 +14,12 @@ pippy:
     model_type: "HF"
     chunks: 1
     input_names: ["input_ids"]
-    num_worker_threads: 512
+    num_worker_threads: 128
 
 handler:
+    model_path: "/home/ubuntu/serve/examples/large_models/Huggingface_pippy/model/models--facebook--opt-30b/snapshots/ceea0a90ac0f6fae7c2c34bcb40477438c152546"
+    index_filename: 'pytorch_model.bin.index.json'
     max_length: 50
+    max_new_tokens: 60
     manual_seed: 40
+    dtype: fp16

examples/large_models/Huggingface_pippy/pippy_handler.py

Lines changed: 46 additions & 20 deletions
@@ -2,6 +2,7 @@
 import time
 from abc import ABC
 
+import packaging.version
 import requests
 import torch
 import transformers
@@ -12,6 +13,12 @@
 
 logger = logging.getLogger(__name__)
 logger.info("Transformers version %s", transformers.__version__)
+if packaging.version.parse(torch.__version__) >= packaging.version.parse("2.0.0"):
+    logger.info("PyTorch version is 2.0.0 or greater")
+else:
+    logger.info(
+        "PyTorch version is less than 2.0.0, initializing with meta device needs PyTorch 2.0.0 and greater"
+    )
 
 
 class TransformersSeqClassifierHandler(BasePippyHandler, ABC):
@@ -36,18 +43,43 @@ def initialize(self, ctx):
         model_dir = properties.get("model_dir")
         self.device = self.local_rank
 
+        model_path = ctx.model_yaml_config["handler"]["model_path"]
         seed = ctx.model_yaml_config["handler"]["manual_seed"]
+        dtype_str = ctx.model_yaml_config["handler"]["dtype"]
         torch.manual_seed(seed)
 
-        self.model = AutoModelForCausalLM.from_pretrained(model_dir, use_cache=False)
+        dtypes = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
 
-        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, return_tensors="pt")
+        dtype = dtypes.get(dtype_str, torch.float32)
+        if dtype_str not in dtypes:
+            # Unknown dtype string: warn and fall back to fp32.
+            logger.info(
+                f"Unsupported data type {dtype_str}, "
+                "please submit a PR to support it. Falling back to fp32 now."
+            )
+
+        skip_init_start = time.perf_counter()
+        with torch.device("meta"):
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_path, use_cache=False, torch_dtype=dtype
+            )
+        skip_init_end = time.perf_counter()
+        logger.info(
+            f"init model time on meta device took {skip_init_end - skip_init_start} seconds"
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, return_tensors="pt")
+        self.tokenizer.pad_token = self.tokenizer.eos_token
 
         self.max_length = ctx.model_yaml_config["handler"]["max_length"]
+        self.max_new_tokens = ctx.model_yaml_config["handler"]["max_new_tokens"]
 
         logger.info("Instantiating model Pipeline")
-        model_init_start = time.time()
+        pippy_compile_time_start = time.perf_counter()
         self.model = get_pipeline_driver(self.model, self.world_size, ctx)
+        pippy_compile_time_end = time.perf_counter()
+
+        logger.info(
+            f"pippy compile time took {pippy_compile_time_end - pippy_compile_time_start} seconds on rank {self.local_rank}"
+        )
 
         logger.info("Transformer model from path %s loaded successfully", model_dir)
 
@@ -64,14 +96,12 @@ def preprocess(self, requests):
         attention masks.
         """
         input_texts = [data.get("data") or data.get("body") for data in requests]
-        input_ids_batch, attention_mask_batch = [], []
+        input_ids_batch = []
         for input_text in input_texts:
-            input_ids, attention_mask = self.encode_input_text(input_text)
+            input_ids = self.encode_input_text(input_text)
             input_ids_batch.append(input_ids)
-            attention_mask_batch.append(attention_mask)
         input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.device)
-        attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(self.device)
-        return input_ids_batch, attention_mask_batch
+        return input_ids_batch
 
     def encode_input_text(self, input_text):
         """
@@ -92,8 +122,7 @@ def encode_input_text(self, input_text):
             return_tensors="pt",
         )
         input_ids = inputs["input_ids"]
-        attention_mask = inputs["attention_mask"]
-        return input_ids, attention_mask
+        return input_ids
 
     def inference(self, input_batch):
         """
@@ -105,21 +134,18 @@ def inference(self, input_batch):
         Returns:
             list: A list of strings with the predicted values for each input text in the batch.
         """
-        input_ids_batch, attention_mask_batch = input_batch
+        input_ids_batch = input_batch
         input_ids_batch = input_ids_batch.to(self.device)
         outputs = self.model.generate(
             input_ids_batch,
-            attention_mask=attention_mask_batch,
-            max_length=30,
+            max_length=self.max_new_tokens,
+        )
+        generated_text = self.tokenizer.batch_decode(
+            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
 
-        inferences = [
-            self.tokenizer.batch_decode(
-                outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
-            )
-        ]
-        logger.info("Generated text: %s", inferences)
-        return inferences
+        logger.info("Generated text: %s", generated_text)
+        return generated_text
 
     def postprocess(self, inference_output):
         """Post Process Function converts the predicted response into Torchserve readable format.
examples/large_models/Huggingface_pippy/requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 transformers
-
+torchpippy

examples/large_models/Huggingface_pippy/Download_model.py renamed to examples/large_models/utils/Download_model.py

Lines changed: 4 additions & 1 deletion
@@ -41,11 +41,14 @@ def hf_model(model_str):
 )
 parser.add_argument("--revision", "-r", type=str, default="main", help="Revision")
 args = parser.parse_args()
+# Only download pytorch checkpoint files
+allow_patterns = ["*.json", "*.pt", "*.bin", "*.txt", "*.model"]
 
 snapshot_path = snapshot_download(
     repo_id=args.model_name,
     revision=args.revision,
+    allow_patterns=allow_patterns,
     cache_dir=args.model_path,
-    use_auth_token=True,
+    use_auth_token=False,
 )
 print(f"Files for '{args.model_name}' is downloaded to '{snapshot_path}'")

ts/handler_utils/distributed/pt_pippy.py

Lines changed: 42 additions & 1 deletion
@@ -51,10 +51,49 @@ def get_pipeline_driver(model, world_size, ctx):
         torch.nn.Sequential: The pipeline driver for the model.
     """
     # Extract configuration parameters from the context
-    chunks = ctx.model_yaml_config["pippy"]["chunks"]
+
+    # Check that the "pippy" and "handler" keys are present in the YAML config
+    assert "pippy" in ctx.model_yaml_config, "Missing 'pippy' key in YAML config"
+    assert "handler" in ctx.model_yaml_config, "Missing 'handler' key in YAML config"
+
+    # Check that the required keys are present in the "pippy" section
+    assert (
+        "input_names" in ctx.model_yaml_config["pippy"]
+    ), "Missing 'input_names' key in YAML config"
+    assert (
+        "model_type" in ctx.model_yaml_config["pippy"]
+    ), "Missing 'model_type' key in YAML config"
+
+    # Check that the required keys are present in the "handler" section
+    assert (
+        "model_path" in ctx.model_yaml_config["handler"]
+    ), "Missing 'model_path' key in YAML config"
+
+    # Set variables from the config
     input_names = ctx.model_yaml_config["pippy"]["input_names"]
     model_type = ctx.model_yaml_config["pippy"]["model_type"]
+    model_path = ctx.model_yaml_config["handler"]["model_path"]
+    try:
+        chunks = ctx.model_yaml_config["pippy"]["chunks"]
+    except KeyError:
+        chunks = 1
+    try:
+        index_filename = ctx.model_yaml_config["handler"]["index_filename"]
+    except KeyError:
+        index_filename = None
+
+    # Check that the index file exists
+    if index_filename is not None:
+        index_file_path = os.path.join(model_path, index_filename)
+        assert os.path.exists(
+            index_file_path
+        ), f"Index file '{index_file_path}' not found"
+    else:
+        index_file_path = None
 
+    checkpoint_prefix = None
     # Set the model to evaluation mode
     model.eval()
 
@@ -83,6 +122,8 @@ def get_pipeline_driver(model, world_size, ctx):
         split_policy=split_policy,
         tracer=tracer,
         concrete_args=concrete_args,
+        index_filename=index_file_path,
+        checkpoint_prefix=checkpoint_prefix,
     )
 
     # Inject the pipeline forward method if necessary
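The index file passed through to PiPPy (pytorch_model.bin.index.json by default) is the standard HuggingFace sharded-checkpoint index: its weight_map entry maps each parameter name to the .bin shard that stores it, which is what allows deferred initialization to materialize weights without loading the whole model at once. A quick way to inspect it (the snapshot path is illustrative):

```python
# Sketch: inspect the sharded-checkpoint index that index_filename points to.
import json
import os

model_path = "model/models--facebook--opt-30b/snapshots/ceea0a90ac0f6fae7c2c34bcb40477438c152546"  # illustrative
index_file_path = os.path.join(model_path, "pytorch_model.bin.index.json")

with open(index_file_path) as f:
    index = json.load(f)

# "weight_map" maps parameter names to the checkpoint shard that contains them.
for name, shard in list(index["weight_map"].items())[:5]:
    print(name, "->", shard)
```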

ts_scripts/spellcheck_conf/wordlist.txt

Lines changed: 2 additions & 0 deletions
@@ -1043,3 +1043,5 @@ QueueTime
 WorkerLoadTime
 WorkerName
 WorkerThreadTime
+largemodels
+torchpippy
