Upgrading exl2. (huggingface#2415)

* Upgrading exl2.
* Fixing the other pathways.
* Fix idefics.
yuanwu2017 · Sep 25, 2024 · 4baa6ff · 4baa6ff
1 parent bae161a
commit 4baa6ff
Show file tree

Hide file tree

Showing 10 changed files with 23 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,7 +9,7 @@ backends/client/src/v3/pb
 
 # ROCm auto-generated files
 *.hip
-server/exllamav2_kernels/exllamav2_kernels/hip/
+server/exllamav2
 server/exllama_kernels/exllama_kernels/hip/
 server/exllama_kernels/exllama_kernels/hip_func/
 *_hip.cuh

diff --git a/flake.nix b/flake.nix
@@ -93,6 +93,7 @@
                 causal-conv1d
                 click
                 einops
+                exllamav2
                 fbgemm-gpu
                 flashinfer
                 flash-attn

diff --git a/server/Makefile b/server/Makefile
@@ -6,6 +6,7 @@ include Makefile-eetq
 include Makefile-selective-scan
 include Makefile-lorax-punica
 include Makefile-fbgemm
+include Makefile-exllamav2
 
 unit-tests:
 	pytest -s -vv -m "not private" tests

diff --git a/server/Makefile-exllamav2 b/server/Makefile-exllamav2
@@ -0,0 +1,24 @@
+# Build and install exllamav2 from source at a pinned upstream revision.
+exllamav2_commit := v0.1.8
+
+# Architecture flags shared by the build and install steps so they cannot drift apart.
+exllamav2_build_env := CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a"
+
+# Neither target names a file it produces, so always run the recipes.
+.PHONY: build-exllamav2 install-exllamav2
+
+# Clone only when the checkout is missing: install-exllamav2 depends on this
+# target, and an unconditional `git clone` fails once the directory exists.
+build-exllamav2:
+	if [ ! -d exllamav2 ]; then \
+		git clone https://github.com/turboderp/exllamav2.git exllamav2; \
+	fi
+	cd exllamav2 && git fetch && git checkout $(exllamav2_commit) && \
+	git submodule update --init --recursive && \
+	pip install -r requirements.txt && \
+	$(exllamav2_build_env) python setup.py build
+
+# Install the package from the checkout prepared by build-exllamav2.
+install-exllamav2: build-exllamav2
+	cd exllamav2/ && \
+	$(exllamav2_build_env) python setup.py install
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
@@ -511,6 +511,7 @@ def __init__(
         config_class=AutoConfig,
         batch_class=CausalLMBatch,
     ):
+        self.quantize = quantize
         self.batch_class = batch_class
         self.process_group, rank, world_size = initialize_torch_distributed()
         if torch.cuda.is_available():

diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
@@ -872,6 +872,7 @@ def __init__(
         head_size: Optional[int] = None,
         skip_special_tokens: bool = True,
     ):
+        self.quantize = quantize
         self.process_group, rank, world_size = initialize_torch_distributed()
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")

diff --git a/server/text_generation_server/models/idefics.py b/server/text_generation_server/models/idefics.py
@@ -33,6 +33,7 @@ def __init__(
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
+        self.quantize = quantize
         self.process_group, rank, world_size = initialize_torch_distributed()
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")

diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
@@ -580,6 +580,7 @@ def __init__(
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
+        self.quantize = quantize
         from text_generation_server.models.custom_modeling.idefics_modeling import (
             IdeficsForVisionText2Text,
         )

diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py
@@ -553,6 +553,7 @@ def __init__(
         tokenizer_class=AutoTokenizer,
         aliases=None,
     ):
+        self.quantize = quantize
         self.process_group, rank, world_size = initialize_torch_distributed()
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")

diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
@@ -50,12 +50,12 @@ def __init__(
         self,
         model: Model,
         cache: Cache,
-        quantize: Optional[str],
         server_urls: List[str],
     ):
         self.cache = cache
         self.model = model
-        self.quantize = quantize
+        # Quantize is resolved during model loading
+        self.quantize = model.quantize
         self.server_urls = server_urls
         # For some reason, inference_mode does not work well with GLOO which we use on CPU
         if model.device.type == "cuda":
@@ -255,7 +255,7 @@ async def serve_inner(
             ],
         )
         generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
-            TextGenerationService(model, Cache(), quantize, server_urls), server
+            TextGenerationService(model, Cache(), server_urls), server
         )
         SERVICE_NAMES = (
             generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,