Upgrading exl2. #2415

Merged
3 commits merged on Aug 14, 2024
2 changes: 1 addition & 1 deletion .gitignore
@@ -9,7 +9,7 @@ backends/client/src/v3/pb

# ROCm auto-generated files
*.hip
-server/exllamav2_kernels/exllamav2_kernels/hip/
+server/exllamav2
server/exllama_kernels/exllama_kernels/hip/
server/exllama_kernels/exllama_kernels/hip_func/
*_hip.cuh
6 changes: 3 additions & 3 deletions Dockerfile
@@ -123,10 +123,10 @@ RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
# Build Transformers exllama kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
-COPY server/exllamav2_kernels/ .
+COPY server/Makefile-exllamav2/ Makefile

# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-exllamav2

# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
@@ -221,7 +221,7 @@ COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from eetq kernels builder
1 change: 1 addition & 0 deletions flake.nix
@@ -93,6 +93,7 @@
causal-conv1d
click
einops
+exllamav2
fbgemm-gpu
flashinfer
flash-attn
1 change: 1 addition & 0 deletions server/Makefile
@@ -6,6 +6,7 @@ include Makefile-eetq
include Makefile-selective-scan
include Makefile-lorax-punica
include Makefile-fbgemm
+include Makefile-exllamav2

unit-tests:
pytest -s -vv -m "not private" tests
12 changes: 12 additions & 0 deletions server/Makefile-exllamav2
@@ -0,0 +1,12 @@
exllamav2_commit := v0.1.8

build-exllamav2:
	git clone https://github.com/turboderp/exllamav2.git exllamav2 && \
	cd exllamav2 && git fetch && git checkout $(exllamav2_commit) && \
	git submodule update --init --recursive && \
	pip install -r requirements.txt && \
	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py build

install-exllamav2: build-exllamav2
	cd exllamav2/ && \
	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py install
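
A quick way to confirm the install target picked up the kernels this PR needs (a minimal sketch; the import path mirrors the one used in the updated exllamav2.py further down):

# Minimal post-install check: the upstream exllamav2 package (v0.1.8) should
# expose its compiled extension under exllamav2.ext, which is where the
# updated layers/gptq/exllamav2.py pulls the kernels from.
from exllamav2.ext import exllamav2_ext

assert hasattr(exllamav2_ext, "make_q_matrix")
assert hasattr(exllamav2_ext, "gemm_half_q_half")
print("exllamav2 kernels available")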
43 changes: 28 additions & 15 deletions server/text_generation_server/layers/gptq/exllamav2.py
@@ -12,7 +12,10 @@
from text_generation_server.utils.log import log_master

try:
-from exllamav2_kernels import make_q_matrix, gemm_half_q_half
+from exllamav2.ext import exllamav2_ext
+
+make_q_matrix = exllamav2_ext.make_q_matrix
+gemm_half_q_half = exllamav2_ext.gemm_half_q_half
except ImportError:
log_master(logger.warning, "exllamav2_kernels not installed.")
raise
Expand Down Expand Up @@ -70,6 +73,10 @@ def ext_make_q_matrix(
"""
Create Q matrix
"""
+# max_dq_size = 512*(1024**2)
+# max_dq_rows = max_dq_size // out_features[0]
+max_dq_rows = 0

# EXL2
if isinstance(w, Exl2Weight):
extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0])
@@ -83,10 +90,12 @@ def ext_make_q_matrix(
w.q_scale_max,
w.q_groups,
extra.q_group_map,
-none_tensor,
-none_tensor,
-none_tensor,
+none_tensor, # zeros
+none_tensor, # scales
+none_tensor, # g_idx
+none_tensor, # bias
temp_dq,
+max_dq_rows,
)
# GPTQ
elif isinstance(w, GPTQWeight):
@@ -106,29 +115,33 @@ def ext_make_q_matrix(
w.qweight,
extra.q_perm,
extra.q_invperm,
-none_tensor,
-none_tensor,
-none_tensor,
-none_tensor,
+none_tensor, # q_scale
+none_tensor, # q_scale_max
+none_tensor, # q_groups
+none_tensor, # q_group_map
w.qzeros,
w.scales,
w.g_idx.cpu(),
+none_tensor, # bias
temp_dq,
+max_dq_rows,
)
# GPTQ without g_idx
else:
return make_q_matrix(
w.qweight,
-none_tensor,
-none_tensor,
-none_tensor,
-none_tensor,
-none_tensor,
-none_tensor,
+none_tensor, # q_perm
+none_tensor, # q_invperm
+none_tensor, # q_scale
+none_tensor, # q_scale_max
+none_tensor, # q_groups
+none_tensor, # q_group_map
w.qzeros,
w.scales,
-none_tensor,
+none_tensor, # g_idx
+none_tensor, # bias
temp_dq,
+max_dq_rows,
)
else:
RuntimeError("Cannot create handle")
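
The commented-out lines near the top of ext_make_q_matrix hint at where a dequantization-row cap could come from; this PR passes max_dq_rows = 0, which leaves the cap disabled. A minimal sketch of that arithmetic, with a hypothetical out_features value:

# Sketch of the cap suggested by the commented-out formula in ext_make_q_matrix
# (not enabled in this PR, which passes max_dq_rows = 0 to make_q_matrix).
max_dq_size = 512 * (1024**2)              # budget used by the upstream formula
out_features = 4096                        # hypothetical value; upstream indexes out_features[0]
max_dq_rows = max_dq_size // out_features  # rows that fit within that budget
print(max_dq_rows)                         # 131072 for this example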
1 change: 1 addition & 0 deletions server/text_generation_server/models/causal_lm.py
@@ -511,6 +511,7 @@ def __init__(
config_class=AutoConfig,
batch_class=CausalLMBatch,
):
+self.quantize = quantize
self.batch_class = batch_class
self.process_group, rank, world_size = initialize_torch_distributed()
if torch.cuda.is_available():
1 change: 1 addition & 0 deletions server/text_generation_server/models/flash_causal_lm.py
@@ -872,6 +872,7 @@ def __init__(
head_size: Optional[int] = None,
skip_special_tokens: bool = True,
):
+self.quantize = quantize
self.process_group, rank, world_size = initialize_torch_distributed()
if torch.cuda.is_available():
device = torch.device(f"cuda:{rank}")
1 change: 1 addition & 0 deletions server/text_generation_server/models/idefics.py
@@ -33,6 +33,7 @@ def __init__(
dtype: Optional[torch.dtype] = None,
trust_remote_code: bool = False,
):
+self.quantize = quantize
self.process_group, rank, world_size = initialize_torch_distributed()
if torch.cuda.is_available():
device = torch.device(f"cuda:{rank}")
1 change: 1 addition & 0 deletions server/text_generation_server/models/idefics_causal_lm.py
@@ -580,6 +580,7 @@ def __init__(
dtype: Optional[torch.dtype] = None,
trust_remote_code: bool = False,
):
+self.quantize = quantize
from text_generation_server.models.custom_modeling.idefics_modeling import (
IdeficsForVisionText2Text,
)
1 change: 1 addition & 0 deletions server/text_generation_server/models/seq2seq_lm.py
@@ -553,6 +553,7 @@ def __init__(
tokenizer_class=AutoTokenizer,
aliases=None,
):
+self.quantize = quantize
self.process_group, rank, world_size = initialize_torch_distributed()
if torch.cuda.is_available():
device = torch.device(f"cuda:{rank}")
6 changes: 3 additions & 3 deletions server/text_generation_server/server.py
@@ -50,12 +50,12 @@ def __init__(
self,
model: Model,
cache: Cache,
-quantize: Optional[str],
server_urls: List[str],
):
self.cache = cache
self.model = model
-self.quantize = quantize
+# Quantize is resolved during model loading
+self.quantize = model.quantize
self.server_urls = server_urls
# For some reason, inference_mode does not work well with GLOO which we use on CPU
if model.device.type == "cuda":
@@ -255,7 +255,7 @@ async def serve_inner(
],
)
generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
-TextGenerationService(model, Cache(), quantize, server_urls), server
+TextGenerationService(model, Cache(), server_urls), server
)
SERVICE_NAMES = (
generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
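
Taken together, the changes above move quantize resolution into model loading: each model __init__ now stores the mode it was loaded with, and the gRPC service reads it back from the model instead of receiving it as a constructor argument. A condensed sketch of that flow, with everything except the relevant attributes elided:

# Condensed view of the new quantize wiring (illustrative only; the real
# classes take many more arguments).
class Model:
    def __init__(self, quantize=None):
        # Each model __init__ touched by this PR records the quantize mode.
        self.quantize = quantize


class TextGenerationService:
    def __init__(self, model, cache, server_urls):
        self.cache = cache
        self.model = model
        # Quantize is resolved during model loading, so the service reads it
        # from the model rather than taking a separate argument.
        self.quantize = model.quantize
        self.server_urls = server_urls


service = TextGenerationService(Model(quantize="exl2"), cache=None, server_urls=[])
assert service.quantize == "exl2"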