Upgrading exl2. #2415

Merged
3 commits merged on Aug 14, 2024
2 changes: 1 addition & 1 deletion .gitignore
@@ -9,7 +9,7 @@ backends/client/src/v3/pb

# ROCm auto-generated files
*.hip
-server/exllamav2_kernels/exllamav2_kernels/hip/
+server/exllamav2
server/exllama_kernels/exllama_kernels/hip/
server/exllama_kernels/exllama_kernels/hip_func/
*_hip.cuh
6 changes: 3 additions & 3 deletions Dockerfile
@@ -123,10 +123,10 @@ RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
# Build Transformers exllama kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
-COPY server/exllamav2_kernels/ .
+COPY server/Makefile-exllamav2/ Makefile

# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-exllamav2

# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
@@ -221,7 +221,7 @@ COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from eetq kernels builder
1 change: 1 addition & 0 deletions flake.nix
@@ -93,6 +93,7 @@
causal-conv1d
click
einops
+exllamav2
fbgemm-gpu
flashinfer
flash-attn
1 change: 1 addition & 0 deletions server/Makefile
@@ -6,6 +6,7 @@ include Makefile-eetq
include Makefile-selective-scan
include Makefile-lorax-punica
include Makefile-fbgemm
+include Makefile-exllamav2

unit-tests:
pytest -s -vv -m "not private" tests
12 changes: 12 additions & 0 deletions server/Makefile-exllamav2
@@ -0,0 +1,12 @@
exllamav2_commit := v0.1.8

build-exllamav2:
	git clone https://github.com/turboderp/exllamav2.git exllamav2 && \
	cd exllamav2 && git fetch && git checkout $(exllamav2_commit) && \
	git submodule update --init --recursive && \
	pip install -r requirements.txt && \
	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py build

install-exllamav2: build-exllamav2
	cd exllamav2/ && \
	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py install
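
A quick way to confirm the install target picked up the kernels this PR needs (a minimal sketch; the import path mirrors the one used in the updated exllamav2.py further down):

# Minimal post-install check: the upstream exllamav2 package (v0.1.8) should
# expose its compiled extension under exllamav2.ext, which is where the
# updated layers/gptq/exllamav2.py pulls the kernels from.
from exllamav2.ext import exllamav2_ext

assert hasattr(exllamav2_ext, "make_q_matrix")
assert hasattr(exllamav2_ext, "gemm_half_q_half")
print("exllamav2 kernels available")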
43 changes: 28 additions & 15 deletions server/text_generation_server/layers/gptq/exllamav2.py
@@ -12,7 +12,10 @@
from text_generation_server.utils.log import log_master

try:
-from exllamav2_kernels import make_q_matrix, gemm_half_q_half
+from exllamav2.ext import exllamav2_ext
+
+make_q_matrix = exllamav2_ext.make_q_matrix
+gemm_half_q_half = exllamav2_ext.gemm_half_q_half
except ImportError:
log_master(logger.warning, "exllamav2_kernels not installed.")
raise
Expand Down Expand Up @@ -70,6 +73,10 @@ def ext_make_q_matrix(
"""
Create Q matrix
"""
+# max_dq_size = 512*(1024**2)
+# max_dq_rows = max_dq_size // out_features[0]
+max_dq_rows = 0

# EXL2
if isinstance(w, Exl2Weight):
extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0])
@@ -83,10 +90,12 @@ def ext_make_q_matrix(
w.q_scale_max,
w.q_groups,
extra.q_group_map,
-none_tensor,
-none_tensor,
-none_tensor,
+none_tensor, # zeros
+none_tensor, # scales
+none_tensor, # g_idx
+none_tensor, # bias
temp_dq,
+max_dq_rows,
)
# GPTQ
elif isinstance(w, GPTQWeight):
@@ -106,29 +115,33 @@ def ext_make_q_matrix(
w.qweight,
extra.q_perm,
extra.q_invperm,
-none_tensor,
-none_tensor,
-none_tensor,
-none_tensor,
+none_tensor, # q_scale
+none_tensor, # q_scale_max
+none_tensor, # q_groups
+none_tensor, # q_group_map
w.qzeros,
w.scales,
w.g_idx.cpu(),
+none_tensor, # bias
temp_dq,
+max_dq_rows,
)
# GPTQ without g_idx
else:
return make_q_matrix(
w.qweight,
-none_tensor,
-none_tensor,
-none_tensor,
-none_tensor,
-none_tensor,
-none_tensor,
+none_tensor, # q_perm
+none_tensor, # q_invperm
+none_tensor, # q_scale
+none_tensor, # q_scale_max
+none_tensor, # q_groups
+none_tensor, # q_group_map
w.qzeros,
w.scales,
-none_tensor,
+none_tensor, # g_idx
+none_tensor, # bias
temp_dq,
+max_dq_rows,
)
else:
RuntimeError("Cannot create handle")
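
The commented-out lines near the top of ext_make_q_matrix hint at where a dequantization-row cap could come from; this PR passes max_dq_rows = 0, which leaves the cap disabled. A minimal sketch of that arithmetic, with a hypothetical out_features value:

# Sketch of the cap suggested by the commented-out formula in ext_make_q_matrix
# (not enabled in this PR, which passes max_dq_rows = 0 to make_q_matrix).
max_dq_size = 512 * (1024**2)              # budget used by the upstream formula
out_features = 4096                        # hypothetical value; upstream indexes out_features[0]
max_dq_rows = max_dq_size // out_features  # rows that fit within that budget
print(max_dq_rows)                         # 131072 for this example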
1 change: 1 addition & 0 deletions server/text_generation_server/models/causal_lm.py
@@ -511,6 +511,7 @@ def __init__(
config_class=AutoConfig,
batch_class=CausalLMBatch,
):
+self.quantize = quantize
self.batch_class = batch_class
self.process_group, rank, world_size = initialize_torch_distributed()
if torch.cuda.is_available():
1 change: 1 addition & 0 deletions server/text_generation_server/models/flash_causal_lm.py
@@ -872,6 +872,7 @@ def __init__(
head_size: Optional[int] = None,
skip_special_tokens: bool = True,
):
+self.quantize = quantize
self.process_group, rank, world_size = initialize_torch_distributed()
if torch.cuda.is_available():
device = torch.device(f"cuda:{rank}")
1 change: 1 addition & 0 deletions server/text_generation_server/models/idefics.py
@@ -33,6 +33,7 @@ def __init__(
dtype: Optional[torch.dtype] = None,
trust_remote_code: bool = False,
):
+self.quantize = quantize
self.process_group, rank, world_size = initialize_torch_distributed()
if torch.cuda.is_available():
device = torch.device(f"cuda:{rank}")
1 change: 1 addition & 0 deletions server/text_generation_server/models/idefics_causal_lm.py
@@ -580,6 +580,7 @@ def __init__(
dtype: Optional[torch.dtype] = None,
trust_remote_code: bool = False,
):
+self.quantize = quantize
from text_generation_server.models.custom_modeling.idefics_modeling import (
IdeficsForVisionText2Text,
)
1 change: 1 addition & 0 deletions server/text_generation_server/models/seq2seq_lm.py
@@ -553,6 +553,7 @@ def __init__(
tokenizer_class=AutoTokenizer,
aliases=None,
):
+self.quantize = quantize
self.process_group, rank, world_size = initialize_torch_distributed()
if torch.cuda.is_available():
device = torch.device(f"cuda:{rank}")
6 changes: 3 additions & 3 deletions server/text_generation_server/server.py
@@ -50,12 +50,12 @@ def __init__(
self,
model: Model,
cache: Cache,
-quantize: Optional[str],
server_urls: List[str],
):
self.cache = cache
self.model = model
-self.quantize = quantize
+# Quantize is resolved during model loading
+self.quantize = model.quantize
self.server_urls = server_urls
# For some reason, inference_mode does not work well with GLOO which we use on CPU
if model.device.type == "cuda":
@@ -255,7 +255,7 @@ async def serve_inner(
],
)
generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
-TextGenerationService(model, Cache(), quantize, server_urls), server
+TextGenerationService(model, Cache(), server_urls), server
)
SERVICE_NAMES = (
generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
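
Taken together, the changes above move quantize resolution into model loading: each model __init__ now stores the mode it was loaded with, and the gRPC service reads it back from the model instead of receiving it as a constructor argument. A condensed sketch of that flow, with everything except the relevant attributes elided:

# Condensed view of the new quantize wiring (illustrative only; the real
# classes take many more arguments).
class Model:
    def __init__(self, quantize=None):
        # Each model __init__ touched by this PR records the quantize mode.
        self.quantize = quantize


class TextGenerationService:
    def __init__(self, model, cache, server_urls):
        self.cache = cache
        self.model = model
        # Quantize is resolved during model loading, so the service reads it
        # from the model rather than taking a separate argument.
        self.quantize = model.quantize
        self.server_urls = server_urls


service = TextGenerationService(Model(quantize="exl2"), cache=None, server_urls=[])
assert service.quantize == "exl2"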