From 1139bdeb8a6fc18489c0acb5e781da4830312d0b Mon Sep 17 00:00:00 2001
From: Tyler Osterberg
Date: Sun, 28 Apr 2024 05:07:01 -0700
Subject: [PATCH] [tnx] version bump Neuron SDK and Optimum (#1826)

---
 .../main/java/ai/djl/python/engine/Connection.java | 11 +++++++++++
 serving/docker/pytorch-inf2.Dockerfile             |  8 ++++----
 serving/docs/lmi/user_guides/tnx_user_guide.md     |  8 +++++++-
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/engines/python/src/main/java/ai/djl/python/engine/Connection.java b/engines/python/src/main/java/ai/djl/python/engine/Connection.java
index 6954996bf..e95879ae7 100644
--- a/engines/python/src/main/java/ai/djl/python/engine/Connection.java
+++ b/engines/python/src/main/java/ai/djl/python/engine/Connection.java
@@ -186,6 +186,10 @@ static String[] getPythonStartCmd(PyEnv pyEnv, Model model, int workerId, int po
             // TODO: re-map logic device once neuron fixed bug
             pyEnv.addEnv("NEURON_RT_VISIBLE_CORES", visibleCores);
             logger.info("Set NEURON_RT_VISIBLE_CORES={}", visibleCores);
+
+            String neuronThreads = getNeuronThreads(tensorParallelDegree);
+            pyEnv.addEnv("OMP_NUM_THREADS", neuronThreads);
+            logger.info("Set OMP_NUM_THREADS={}", neuronThreads);
         }
         boolean uds = Epoll.isAvailable() || KQueue.isAvailable();
         String[] args = new String[12];
@@ -231,6 +235,13 @@ private static String getNeuronVisibleCores(int deviceId, int tensorParallelDegr
         return String.valueOf(deviceId);
     }
 
+    private static String getNeuronThreads(int tensorParallelDegree) {
+        if (tensorParallelDegree > 0) {
+            return String.valueOf(tensorParallelDegree * 2);
+        }
+        return String.valueOf(1);
+    }
+
     void connect() throws InterruptedException {
         EventLoopGroup group = PyEnv.getEventLoopGroup();
 
diff --git a/serving/docker/pytorch-inf2.Dockerfile b/serving/docker/pytorch-inf2.Dockerfile
index 3e2f519d6..5e1679ec2 100644
--- a/serving/docker/pytorch-inf2.Dockerfile
+++ b/serving/docker/pytorch-inf2.Dockerfile
@@ -14,17 +14,17 @@ ARG djl_version=0.28.0~SNAPSHOT
 ARG torch_version=2.1.2
 ARG torchvision_version=0.16.2
 ARG python_version=3.9
-ARG neuronsdk_version=2.18.1
+ARG neuronsdk_version=2.18.2
 ARG torch_neuronx_version=2.1.2.2.1.0
 ARG transformers_neuronx_version=0.10.0.360
 ARG neuronx_distributed_version=0.7.0
-ARG neuronx_cc_version=2.13.68.0
+ARG neuronx_cc_version=2.13.72.0
 ARG protobuf_version=3.19.6
 ARG transformers_version=4.36.2
 ARG accelerate_version=0.23.0
 ARG diffusers_version=0.26.1
 ARG pydantic_version=2.6.1
-ARG optimum_neuron_version=0.0.20
+ARG optimum_neuron_version=0.0.21
 ARG vllm_wheel="https://publish.djl.ai/neuron_vllm/vllm-nightly-py3-none-any.whl"
 
 EXPOSE 8080
@@ -75,7 +75,7 @@ RUN mkdir -p /opt/djl/bin && cp scripts/telemetry.sh /opt/djl/bin && \
     neuronx-cc==${neuronx_cc_version} torch-neuronx==${torch_neuronx_version} transformers-neuronx==${transformers_neuronx_version} \
     neuronx_distributed==${neuronx_distributed_version} protobuf==${protobuf_version} sentencepiece jinja2 \
     diffusers==${diffusers_version} opencv-contrib-python-headless Pillow --extra-index-url=https://pip.repos.neuron.amazonaws.com \
-    pydantic==${pydantic_version} optimum optimum-neuron==${optimum_neuron_version} tiktoken blobfile && \
+    pydantic==${pydantic_version} optimum optimum-neuron==${optimum_neuron_version} tiktoken blobfile \
     torchvision==${torchvision_version} && \
     scripts/install_s5cmd.sh x64 && \
     scripts/patch_oss_dlc.sh python && \
diff --git a/serving/docs/lmi/user_guides/tnx_user_guide.md b/serving/docs/lmi/user_guides/tnx_user_guide.md
index c102ad84e..926d8cb58 100644
--- a/serving/docs/lmi/user_guides/tnx_user_guide.md
+++ b/serving/docs/lmi/user_guides/tnx_user_guide.md
@@ -19,6 +19,7 @@ The model architectures that are tested daily for LMI Transformers-NeuronX (in C
 
 - LLAMA
 - Mistral
+- Mixtral
 - GPT-NeoX
 - GPT-J
 - Bloom
@@ -32,8 +33,9 @@ The model architectures that are tested daily for LMI Transformers-NeuronX (in C
 - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
 - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
 - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
-- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- LLaMA, LLaMA-2, LLaMA-3 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `meta-llama/Meta-Llama-3-70B`, `openlm-research/open_llama_13b`, etc.)
 - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
+- Mixtral (`mistralai/Mixtral-8x7B-Instruct-v0.1`)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
 
 We will add more model support for the future versions to have them tested. Please feel free to [file us an issue](https://github.com/deepjavalibrary/djl-serving/issues/new/choose) for more model coverage in CI.
@@ -99,3 +101,7 @@ In that situation, there is nothing LMI can do until the issue is fixed in the b
 | option.group_query_attention | >= 0.26.0 | Pass Through | Enable K/V cache sharding for llama and mistral models types based on various [strategies](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/transformers-neuronx-developer-guide.html#grouped-query-attention-gqa-support-beta) | `shard-over-heads` Default: `None` |
 | option.enable_mixed_precision_accumulation | >= 0.26.0 | Pass Through | Turn this on for LLAMA 70B model to achieve better accuracy. | `true` Default: `None` |
 
+## Advanced Multi-Model Inference Considerations
+
+When using LMI Transformers-NeuronX for multi-model inference endpoints, you may need to limit the number of threads available to each model.
+Follow this [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/transformers-neuronx-developer-guide.html?highlight=omp_num#running-inference-with-multiple-models) to set the correct number of threads and avoid race conditions. By default, LMI Transformers-NeuronX sets `OMP_NUM_THREADS` to two times the tensor parallel degree.
\ No newline at end of file
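
Note on the new default: the thread count exported by the `Connection.java` change above can be sanity-checked in isolation. The following is a minimal standalone sketch that mirrors the patch's `getNeuronThreads` logic; the `NeuronThreadsSketch` wrapper class, the `main` driver, and the example tensor parallel degrees are illustrative additions, not part of the patch.

```java
/** Standalone sketch of the OMP_NUM_THREADS default introduced by this patch. */
public final class NeuronThreadsSketch {

    /** Mirrors Connection.getNeuronThreads: two threads per tensor-parallel rank, otherwise one. */
    static String getNeuronThreads(int tensorParallelDegree) {
        if (tensorParallelDegree > 0) {
            return String.valueOf(tensorParallelDegree * 2);
        }
        return String.valueOf(1);
    }

    public static void main(String[] args) {
        // Example: a model sharded across 8 Neuron cores gets OMP_NUM_THREADS=16,
        // while a model with no tensor parallelism configured falls back to 1.
        for (int tp : new int[] {0, 2, 8}) {
            System.out.printf("tensor_parallel_degree=%d -> OMP_NUM_THREADS=%s%n", tp, getNeuronThreads(tp));
        }
    }
}
```

As the documentation change notes, this default of two threads per tensor-parallel rank is what the engine now exports when nothing else is configured; hosts serving several models may need to cap the per-model thread count by following the linked Neuron guide.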