From 1139bdeb8a6fc18489c0acb5e781da4830312d0b Mon Sep 17 00:00:00 2001
From: Tyler Osterberg
Date: Sun, 28 Apr 2024 05:07:01 -0700
Subject: [PATCH] [tnx] version bump Neuron SDK and Optimum (#1826)

---
 .../main/java/ai/djl/python/engine/Connection.java | 11 +++++++++++
 serving/docker/pytorch-inf2.Dockerfile             |  8 ++++----
 serving/docs/lmi/user_guides/tnx_user_guide.md     |  8 +++++++-
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/engines/python/src/main/java/ai/djl/python/engine/Connection.java b/engines/python/src/main/java/ai/djl/python/engine/Connection.java
index 6954996bf..e95879ae7 100644
--- a/engines/python/src/main/java/ai/djl/python/engine/Connection.java
+++ b/engines/python/src/main/java/ai/djl/python/engine/Connection.java
@@ -186,6 +186,10 @@ static String[] getPythonStartCmd(PyEnv pyEnv, Model model, int workerId, int po
             // TODO: re-map logic device once neuron fixed bug
             pyEnv.addEnv("NEURON_RT_VISIBLE_CORES", visibleCores);
             logger.info("Set NEURON_RT_VISIBLE_CORES={}", visibleCores);
+
+            String neuronThreads = getNeuronThreads(tensorParallelDegree);
+            pyEnv.addEnv("OMP_NUM_THREADS", neuronThreads);
+            logger.info("Set OMP_NUM_THREADS={}", neuronThreads);
         }
         boolean uds = Epoll.isAvailable() || KQueue.isAvailable();
         String[] args = new String[12];
@@ -231,6 +235,13 @@ private static String getNeuronVisibleCores(int deviceId, int tensorParallelDegr
         return String.valueOf(deviceId);
     }
 
+    private static String getNeuronThreads(int tensorParallelDegree) {
+        if (tensorParallelDegree > 0) {
+            return String.valueOf(tensorParallelDegree * 2);
+        }
+        return String.valueOf(1);
+    }
+
     void connect() throws InterruptedException {
         EventLoopGroup group = PyEnv.getEventLoopGroup();
 
diff --git a/serving/docker/pytorch-inf2.Dockerfile b/serving/docker/pytorch-inf2.Dockerfile
index 3e2f519d6..5e1679ec2 100644
--- a/serving/docker/pytorch-inf2.Dockerfile
+++ b/serving/docker/pytorch-inf2.Dockerfile
@@ -14,17 +14,17 @@ ARG djl_version=0.28.0~SNAPSHOT
 ARG torch_version=2.1.2
 ARG torchvision_version=0.16.2
 ARG python_version=3.9
-ARG neuronsdk_version=2.18.1
+ARG neuronsdk_version=2.18.2
 ARG torch_neuronx_version=2.1.2.2.1.0
 ARG transformers_neuronx_version=0.10.0.360
 ARG neuronx_distributed_version=0.7.0
-ARG neuronx_cc_version=2.13.68.0
+ARG neuronx_cc_version=2.13.72.0
 ARG protobuf_version=3.19.6
 ARG transformers_version=4.36.2
 ARG accelerate_version=0.23.0
 ARG diffusers_version=0.26.1
 ARG pydantic_version=2.6.1
-ARG optimum_neuron_version=0.0.20
+ARG optimum_neuron_version=0.0.21
 ARG vllm_wheel="https://publish.djl.ai/neuron_vllm/vllm-nightly-py3-none-any.whl"
 
 EXPOSE 8080
@@ -75,7 +75,7 @@ RUN mkdir -p /opt/djl/bin && cp scripts/telemetry.sh /opt/djl/bin && \
     neuronx-cc==${neuronx_cc_version} torch-neuronx==${torch_neuronx_version} transformers-neuronx==${transformers_neuronx_version} \
     neuronx_distributed==${neuronx_distributed_version} protobuf==${protobuf_version} sentencepiece jinja2 \
     diffusers==${diffusers_version} opencv-contrib-python-headless Pillow --extra-index-url=https://pip.repos.neuron.amazonaws.com \
-    pydantic==${pydantic_version} optimum optimum-neuron==${optimum_neuron_version} tiktoken blobfile && \
+    pydantic==${pydantic_version} optimum optimum-neuron==${optimum_neuron_version} tiktoken blobfile \
     torchvision==${torchvision_version} && \
     scripts/install_s5cmd.sh x64 && \
     scripts/patch_oss_dlc.sh python && \
diff --git a/serving/docs/lmi/user_guides/tnx_user_guide.md b/serving/docs/lmi/user_guides/tnx_user_guide.md
index c102ad84e..926d8cb58 100644
--- a/serving/docs/lmi/user_guides/tnx_user_guide.md
+++ b/serving/docs/lmi/user_guides/tnx_user_guide.md
@@ -19,6 +19,7 @@ The model architectures that are tested daily for LMI Transformers-NeuronX (in C
 
 - LLAMA
 - Mistral
+- Mixtral
 - GPT-NeoX
 - GPT-J
 - Bloom
@@ -32,8 +33,9 @@ The model architectures that are tested daily for LMI Transformers-NeuronX (in C
 - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
 - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
 - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
-- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- LLaMA, LLaMA-2, LLaMA-3 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `meta-llama/Meta-Llama-3-70B`, `openlm-research/open_llama_13b`, etc.)
 - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
+- Mixtral (`mistralai/Mixtral-8x7B-Instruct-v0.1`)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
 
 We will add more model support for the future versions to have them tested. Please feel free to [file us an issue](https://github.com/deepjavalibrary/djl-serving/issues/new/choose) for more model coverage in CI.
@@ -99,3 +101,7 @@ In that situation, there is nothing LMI can do until the issue is fixed in the b
 | option.group_query_attention | >= 0.26.0 | Pass Through | Enable K/V cache sharding for llama and mistral models types based on various [strategies](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/transformers-neuronx-developer-guide.html#grouped-query-attention-gqa-support-beta) | `shard-over-heads` Default: `None` |
 | option.enable_mixed_precision_accumulation | >= 0.26.0 | Pass Through | Turn this on for LLAMA 70B model to achieve better accuracy. | `true` Default: `None` |
 
+## Advanced Multi-Model Inference Considerations
+
+When using LMI Transformers-NeuronX for multi-model inference endpoints, you may need to limit the number of threads available to each model.
+Follow this [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/transformers-neuronx-developer-guide.html?highlight=omp_num#running-inference-with-multiple-models) to set the correct number of threads and avoid race conditions. By default, LMI Transformers-NeuronX sets `OMP_NUM_THREADS` to two times the tensor parallel degree.
\ No newline at end of file
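
Note on the new default: the thread count exported by the `Connection.java` change above can be sanity-checked in isolation. The following is a minimal standalone sketch that mirrors the patch's `getNeuronThreads` logic; the `NeuronThreadsSketch` wrapper class, the `main` driver, and the example tensor parallel degrees are illustrative additions, not part of the patch.

```java
/** Standalone sketch of the OMP_NUM_THREADS default introduced by this patch. */
public final class NeuronThreadsSketch {

    /** Mirrors Connection.getNeuronThreads: two threads per tensor-parallel rank, otherwise one. */
    static String getNeuronThreads(int tensorParallelDegree) {
        if (tensorParallelDegree > 0) {
            return String.valueOf(tensorParallelDegree * 2);
        }
        return String.valueOf(1);
    }

    public static void main(String[] args) {
        // Example: a model sharded across 8 Neuron cores gets OMP_NUM_THREADS=16,
        // while a model with no tensor parallelism configured falls back to 1.
        for (int tp : new int[] {0, 2, 8}) {
            System.out.printf("tensor_parallel_degree=%d -> OMP_NUM_THREADS=%s%n", tp, getNeuronThreads(tp));
        }
    }
}
```

As the documentation change notes, this default of two threads per tensor-parallel rank is what the engine now exports when nothing else is configured; hosts serving several models may need to cap the per-model thread count by following the linked Neuron guide.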