diff --git a/Dockerfile_intel b/Dockerfile_intel
index 3aa43acfc4c..08a29dc0d80 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -1,5 +1,4 @@
-# Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
 WORKDIR /usr/src

 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@@ -35,6 +34,7 @@ COPY router router
 COPY launcher launcher
 RUN cargo build --release

+
 # Text Generation Inference base image for Intel
 FROM intel/intel-extension-for-pytorch:2.1.10-xpu as base

@@ -47,7 +47,7 @@ RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
 | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

-RUN apt-get update && apt install -y intel-basekit xpu-smi
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev

 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
@@ -55,6 +55,11 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
     PORT=80

+WORKDIR /usr/src
+# Build pytorch and ipex
+RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b xpu_main origin/xpu-main
+RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && git checkout 209f2fa8ff86652f67d75c2f19bf9cb9942fd018 && git apply /usr/src/intel-extension-for-pytorch/torch_patches/00*.patch
+
 # Install server
 COPY proto proto
 COPY server server
@@ -62,7 +67,29 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_common.txt && \
-    pip install ".[accelerate, peft]" --no-cache-dir
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+
+ENV CCL_ROOT=/opt/intel/oneapi/ccl/2021.11
+ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/2021.11
+ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
+ENV DIAGUTIL_PATH=/opt/intel/oneapi/compiler/2024.0/etc/compiler/sys_check/sys_check.sh
+ENV CCL_CONFIGURATION=cpu_gpu_dpcpp
+ENV MANPATH=/opt/intel/oneapi/mpi/2021.11/share/man:/opt/intel/oneapi/mpi/2021.11/share/man:/opt/intel/oneapi/compiler/2024.0/documentation/en/man/common:
+ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/2024.0/lib/cmake:/opt/intel/oneapi/compiler/2024.0
+ENV CMPLR_ROOT=/opt/intel/oneapi/compiler/2024.0
+ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/2021.11/lib:/opt/intel/oneapi/ccl/2021.11/lib/:/opt/intel/oneapi/mkl/2024.0/lib/:/opt/intel/oneapi/compiler/2024.0/lib
+ENV OCL_ICD_FILENAMES=libintelocl_emu.so:libalteracl.so:/opt/intel/oneapi/compiler/2024.0/lib/libintelocl.so
+ENV CLASSPATH=/opt/intel/oneapi/mpi/2021.11/share/java/mpi.jar:/opt/intel/oneapi/mpi/2021.11/share/java/mpi.jar
+ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/2021.11/lib/:/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.11/lib:/opt/intel/oneapi/mkl/2024.0/lib:/opt/intel/oneapi/compiler/2024.0/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.0/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:
+ENV MKLROOT=/opt/intel/oneapi/mkl/2024.0
+ENV NLSPATH=/opt/intel/oneapi/mkl/2024.0/share/locale/%l_%t/%N:/opt/intel/oneapi/compiler/2024.0/lib/locale/%l_%t/%N
+ENV PATH=/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/2021.11/bin:/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/2024.0/bin/:/opt/intel/oneapi/compiler/2024.0/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV CPATH=/opt/intel/oneapi/mpi/2021.11/include:/opt/intel/oneapi/ccl/2021.11/include:/opt/intel/oneapi/mkl/2024.0/include
+ENV CCL_ZE_IPC_EXCHANGE=sockets
+
+
+RUN pip uninstall -y torch && cd pytorch && git submodule update --init --recursive && python setup.py install
+RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=ON BUILD_WITH_CPU=ON USE_XETLA=ON python setup.py install

 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py
index ec12f06f1d5..b7d20138b6a 100644
--- a/server/text_generation_server/models/flash_mistral.py
+++ b/server/text_generation_server/models/flash_mistral.py
@@ -36,7 +36,7 @@ SLIDING_WINDOW_BLOCKS: Optional[int] = None

 from text_generation_server.utils.import_utils import IS_XPU_SYSTEM

-MEM_POOL = torch.cuda.graph_pool_handle()
+MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None


 def set_sliding_window(sliding_window: int, sliding_window_blocks: int):
diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py
index 3b8a70bca21..13c90f38566 100644
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -1,6 +1,6 @@
 import torch
 import os

-MEM_POOL = torch.cuda.graph_pool_handle()
+MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
 # This is overridden by the cli
 ENABLE_CUDA_GRAPHS = os.getenv("ENABLE_CUDA_GRAPHS", "false").lower() in {"1", "true"}
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 73ece5ae7ff..ac126148c51 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -713,11 +713,10 @@ def forward(self, hidden_states, residual=None):
                 residual = hidden_states
             out = ipex.llm.modules.RMSNorm.apply(
                 hidden_states,
-                [hidden_states.size(-1)],
                 self.weight,
                 self.variance_epsilon,
             )
-            return out[0], residual
+            return out, residual
         elif hidden_states.shape[-1] > 8192:
             if residual is not None:
                 hidden_states += residual
diff --git a/server/text_generation_server/utils/paged_attention.py b/server/text_generation_server/utils/paged_attention.py
index b9c42ce77dc..ea0f734f73b 100644
--- a/server/text_generation_server/utils/paged_attention.py
+++ b/server/text_generation_server/utils/paged_attention.py
@@ -26,7 +26,7 @@ def reshape_and_cache(
         cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots)
     elif IS_XPU_SYSTEM:
         ipex.llm.modules.PagedAttention.reshape_and_cache(
-            key, value, key_cache, value_cache, slots
+            key, value, key_cache, value_cache, slots.to(torch.int64)
        )
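A note on the `MEM_POOL` change repeated in `flash_mistral.py` and `globals.py`: `torch.cuda.graph_pool_handle()` is a CUDA-only API, so calling it unconditionally at module import breaks XPU-only installs; the guard defers to `None` when no CUDA runtime is present. Below is a minimal sketch of how such a guarded pool is consumed, assuming only stock PyTorch; `capture_graph` is a hypothetical helper for illustration, not code from this PR, and real graph capture also requires warm-up iterations that are omitted here for brevity.

```python
import torch

# Only touch the CUDA graph-pool API when a CUDA runtime actually exists;
# on CPU- or XPU-only systems the bare call fails at import time.
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None


def capture_graph(fn):
    # Hypothetical helper: capture `fn` into a CUDA graph that shares
    # MEM_POOL, falling back to eager execution when no pool is available.
    if MEM_POOL is None:
        return None, fn()
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph, pool=MEM_POOL):  # capture region
        out = fn()
    return graph, out  # replay later with graph.replay()
```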