This repository was archived by the owner on Aug 7, 2025. It is now read-only.

Commit 25f3700

Authored by Naman Nandan (namannandan)

BERT nightly benchmark on Inferentia2 (#2283)

* Inf2 nightly benchmark
* fix linter spellcheck error

Co-authored-by: Naman Nandan <namannan@amazon.com>

Parent: f01868f

File tree: 7 files changed, +140 −4 lines

.github/workflows/benchmark_nightly.yml

Lines changed: 6 additions & 1 deletion
@@ -10,7 +10,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        hardware: [cpu, gpu, inf1]
+        hardware: [cpu, gpu, inf1, inf2]
     runs-on:
       - self-hosted
       - ${{ matrix.hardware }}
@@ -52,6 +52,11 @@ jobs:
         env:
           NEURON_RT_NUM_CORES: 4
         run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuron.yaml --skip false
+      - name: Benchmark inf2 nightly
+        if: ${{ matrix.hardware == 'inf2' }}
+        env:
+          NEURON_RT_NUM_CORES: 1
+        run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuronx.yaml --skip false
       - name: Save benchmark artifacts
         uses: actions/upload-artifact@v2
         with:
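The new inf2 step boils down to one environment variable and one driver invocation. A minimal Python sketch of the same step, assuming a TorchServe repo checkout (the actual run is left commented out because it needs an inf2 host with the Neuron SDK):

```python
import os
import sys

# Sketch of the new workflow step: run the benchmark driver with the Neuron
# runtime restricted to a single core on inf2.
env = dict(os.environ, NEURON_RT_NUM_CORES="1")
cmd = [
    sys.executable,
    "benchmarks/auto_benchmark.py",
    "--input", "benchmarks/benchmark_config_neuronx.yaml",
    "--skip", "false",
]
# import subprocess; subprocess.run(cmd, env=env, check=True)  # needs inf2 hardware
print(" ".join(cmd[1:]))
```

Note that inf1 uses `NEURON_RT_NUM_CORES: 4` while inf2 uses `1`, matching the different core counts exposed by the two instance families in this setup.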

benchmarks/auto_benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ def load_config(self):

         self.bm_config["model_config_path"] = (
             "{}/{}".format(MODEL_JSON_CONFIG_PATH, self.bm_config["hardware"])
-            if self.bm_config["hardware"] in ["cpu", "gpu", "neuron"]
+            if self.bm_config["hardware"] in ["cpu", "gpu", "neuron", "neuronx"]
             else "{}/cpu".format(MODEL_JSON_CONFIG_PATH)
         )
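The hardware dispatch above can be exercised in isolation. Here `MODEL_JSON_CONFIG_PATH` is a stand-in value and `model_config_path` is a hypothetical helper extracted from `load_config`, not TorchServe's actual API:

```python
# Stand-in for the module constant in benchmarks/auto_benchmark.py
# (the real value lives in that file; this path is illustrative).
MODEL_JSON_CONFIG_PATH = "benchmarks/models_json"

def model_config_path(hardware):
    # Known targets get their own model-JSON directory;
    # anything else falls back to the cpu configs.
    if hardware in ["cpu", "gpu", "neuron", "neuronx"]:
        return "{}/{}".format(MODEL_JSON_CONFIG_PATH, hardware)
    return "{}/cpu".format(MODEL_JSON_CONFIG_PATH)

print(model_config_path("neuronx"))   # benchmarks/models_json/neuronx
print(model_config_path("trainium"))  # benchmarks/models_json/cpu
```

Without the added `"neuronx"` entry, the inf2 run would silently fall through to the cpu model configs.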

benchmarks/benchmark_config_neuronx.yaml

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+# Torchserve version to be installed. It can be one of these options:
+# - branch : "master"
+# - nightly: "2022.3.16"
+# - release: "0.5.3"
+# The nightly build is installed if "ts_version" is not specified.
+#ts_version:
+#  branch: &ts_version "master"
+
+# A list of model config yaml files defined in benchmarks/models_config,
+# or a list of model config yaml files with full paths.
+models:
+  - "bert_neuronx.yaml"
+
+# Benchmark on "cpu", "gpu", "neuron" or "neuronx".
+# "cpu" is used if "hardware" is not specified.
+hardware: &hardware "neuronx"
+
+# Upload the prometheus metrics report to remote storage, or to a different local path, if "metrics_cmd" is set.
+# This is the command line used to upload the prometheus metrics report to a remote system.
+# Here is an example using the AWS CloudWatch command.
+# Note:
+#   - keep the values in the same order as the command definition.
+#   - set up the command before enabling `metrics_cmd`.
+#     For example, the aws client and AWS credentials need to be set up before trying this example.
+metrics_cmd:
+  - "cmd": "aws cloudwatch put-metric-data"
+  - "--namespace": ["torchserve_benchmark_nightly_", *hardware]
+  - "--region": "us-east-2"
+  - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'
+
+# Upload the report to remote storage, or to a different local path, if "report_cmd" is set.
+# This is the command line used to upload the report to remote storage.
+# Here is an example using the AWS S3 command.
+# Note:
+#   - keep the values in the same order as the command.
+#   - set up the command before enabling `report_cmd`.
+#     For example, the aws client, AWS credentials and the S3 bucket
+#     need to be set up before trying this example.
+#   - "today()" is a keyword that applies the current date in the path.
+#     For example, the dest path in the following example is
+#     s3://torchserve-model-serving/benchmark/2022-03-18/gpu
+report_cmd:
+  - "cmd": "aws s3 cp --recursive"
+  - "source": '/tmp/ts_benchmark/'
+  - "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware]
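The `&hardware` anchor is defined once and expanded wherever `*hardware` appears. A small PyYAML check of how the alias resolves on load (the snippet is a trimmed excerpt of the config above):

```python
import yaml  # PyYAML

# Trimmed excerpt: the &hardware anchor is defined once and the *hardware
# alias expands to the same scalar wherever it appears.
snippet = """
hardware: &hardware "neuronx"
metrics_cmd:
  - "cmd": "aws cloudwatch put-metric-data"
  - "--namespace": ["torchserve_benchmark_nightly_", *hardware]
"""
cfg = yaml.safe_load(snippet)
print(cfg["metrics_cmd"][1]["--namespace"])  # ['torchserve_benchmark_nightly_', 'neuronx']
```

This is why switching the single `hardware` value is enough to retarget both the CloudWatch namespace and the S3 destination path.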
benchmarks/models_config/bert_neuronx.yaml

Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
+---
+bert_neuronx_batch_1:
+  scripted_mode:
+    benchmark_engine: "ab"
+    url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_1.mar
+    workers:
+      - 2
+    batch_delay: 100
+    batch_size:
+      - 1
+    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
+    requests: 10000
+    concurrency: 100
+    backend_profiling: False
+    exec_env: "local"
+    processors:
+      - "neuronx"
+
+bert_neuronx_batch_2:
+  scripted_mode:
+    benchmark_engine: "ab"
+    url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_2.mar
+    workers:
+      - 2
+    batch_delay: 100
+    batch_size:
+      - 2
+    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
+    requests: 10000
+    concurrency: 100
+    backend_profiling: False
+    exec_env: "local"
+    processors:
+      - "neuronx"
+
+bert_neuronx_batch_4:
+  scripted_mode:
+    benchmark_engine: "ab"
+    url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_4.mar
+    workers:
+      - 2
+    batch_delay: 100
+    batch_size:
+      - 4
+    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
+    requests: 10000
+    concurrency: 100
+    backend_profiling: False
+    exec_env: "local"
+    processors:
+      - "neuronx"
+
+bert_neuronx_batch_8:
+  scripted_mode:
+    benchmark_engine: "ab"
+    url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_8.mar
+    workers:
+      - 2
+    batch_delay: 100
+    batch_size:
+      - 8
+    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
+    requests: 10000
+    concurrency: 100
+    backend_profiling: False
+    exec_env: "local"
+    processors:
+      - "neuronx"
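The four entries differ only in the batch size embedded in the key, the `.mar` URL, and the `batch_size` value. As a sketch (not part of the commit), such a file could be generated from a template instead of hand-maintained:

```python
# Hypothetical generator for the repetitive config above; only the batch
# size varies between entries.
TEMPLATE = """\
bert_neuronx_batch_{bs}:
  scripted_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_{bs}.mar
    workers:
      - 2
    batch_delay: 100
    batch_size:
      - {bs}
    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
    requests: 10000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "neuronx"
"""

doc = "---\n" + "\n".join(TEMPLATE.format(bs=bs) for bs in (1, 2, 4, 8))
print(doc.count("scripted_mode:"))  # 4
```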

examples/Huggingface_Transformers/Download_Transformer_models.py

Lines changed: 17 additions & 0 deletions
@@ -121,6 +121,23 @@ def transformers_model_dowloader(
                 "traced_{}_model_neuron_batch_{}.pt".format(model_name, batch_size),
             ),
         )
+    elif hardware == "neuronx":
+        import torch_neuronx
+
+        input_ids = torch.cat([inputs["input_ids"]] * batch_size, 0).to(device)
+        attention_mask = torch.cat([inputs["attention_mask"]] * batch_size, 0).to(
+            device
+        )
+        traced_model = torch_neuronx.trace(model, (input_ids, attention_mask))
+        torch.jit.save(
+            traced_model,
+            os.path.join(
+                NEW_DIR,
+                "traced_{}_model_neuronx_batch_{}.pt".format(
+                    model_name, batch_size
+                ),
+            ),
+        )
     else:
         input_ids = inputs["input_ids"].to(device)
         attention_mask = inputs["attention_mask"].to(device)
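The traced inputs are built by repeating a single tokenized example along dim 0, because Neuron tracing specializes the compiled graph to a fixed batch shape. A minimal sketch of that batch construction with stand-in tensors (no tokenizer or Neuron SDK needed):

```python
import torch

# Stand-ins for a single tokenized example of sequence length 128;
# the real values come from the Hugging Face tokenizer.
batch_size = 4
input_ids = torch.ones(1, 128, dtype=torch.long)
attention_mask = torch.ones(1, 128, dtype=torch.long)

# Repeat the example along dim 0 to form the fixed-size batch that
# torch_neuronx.trace is given in the diff above.
batched_ids = torch.cat([input_ids] * batch_size, 0)
batched_mask = torch.cat([attention_mask] * batch_size, 0)
print(tuple(batched_ids.shape))  # (4, 128)
```

Because the shape is baked in at trace time, a separate `.mar` is published per batch size, which is why the benchmark config lists `batch_1` through `batch_8` artifacts.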

examples/Huggingface_Transformers/README.md

Lines changed: 2 additions & 2 deletions
@@ -51,9 +51,9 @@ In the setup_config.json :

 *embedding_name* : The name of embedding layer in the chosen model, this could be `bert` for `bert-base-uncased`, `roberta` for `roberta-base` or `roberta` for `xlm-roberta-large`, or `gpt2` for `gpt2` model

-*hardware* : The target platform to trace the model for. Specify as `neuron` for [Inferentia1](https://aws.amazon.com/ec2/instance-types/inf1/).
+*hardware* : The target platform to trace the model for. Specify as `neuron` for [Inferentia1](https://aws.amazon.com/ec2/instance-types/inf1/) and `neuronx` for [Inferentia2](https://aws.amazon.com/ec2/instance-types/inf2/).

-*batch_size* : Input batch size when tracing the model for `neuron` as target hardware.
+*batch_size* : Input batch size when tracing the model for `neuron` or `neuronx` as target hardware.

 Once, `setup_config.json` has been set properly, the next step is to run

ts_scripts/spellcheck_conf/wordlist.txt

Lines changed: 1 addition & 0 deletions
@@ -1051,3 +1051,4 @@ largemodels
 torchpippy
 InferenceSession
 maxRetryTimeoutInSec
+neuronx
