Skip to content

Commit fd32b58

Browse files
committed
Add support for GPTQ Marlin kernels
GPTQ Marlin extends the Marlin kernels to support common GPTQ configurations:
- bits: 4 or 8
- groupsize: -1, 32, 64, or 128
- desc_act: true/false

Using the GPTQ Marlin kernels requires repacking the parameters into the Marlin quantizer format. The kernels were contributed by Neural Magic to vLLM. We vendor them here for convenience.
1 parent 376a0b7 commit fd32b58

40 files changed

+4652
-138
lines changed

Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,9 @@ RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
140140
# Build marlin kernels
141141
FROM kernel-builder as marlin-kernels-builder
142142
WORKDIR /usr/src
143-
COPY server/Makefile-marlin Makefile
143+
COPY server/marlin/ .
144144
# Build the vendored marlin kernels
145-
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-marlin
145+
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
146146

147147
# Build Transformers CUDA kernels
148148
FROM kernel-builder as custom-kernels-builder
@@ -213,7 +213,7 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
213213
# Copy build artifacts from eetq kernels builder
214214
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
215215
# Copy build artifacts from marlin kernels builder
216-
COPY --from=marlin-kernels-builder /usr/src/marlin/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
216+
COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
217217

218218
# Copy build artifacts from vllm builder
219219
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
{
2+
"details": {
3+
"best_of_sequences": null,
4+
"finish_reason": "length",
5+
"generated_tokens": 10,
6+
"prefill": [
7+
{
8+
"id": 2323,
9+
"logprob": null,
10+
"text": "Test"
11+
},
12+
{
13+
"id": 1715,
14+
"logprob": -11.34375,
15+
"text": " request"
16+
}
17+
],
18+
"seed": null,
19+
"tokens": [
20+
{
21+
"id": 198,
22+
"logprob": -2.5742188,
23+
"special": false,
24+
"text": "\n"
25+
},
26+
{
27+
"id": 262,
28+
"logprob": -1.6230469,
29+
"special": false,
30+
"text": " "
31+
},
32+
{
33+
"id": 3270,
34+
"logprob": -2.046875,
35+
"special": false,
36+
"text": " \"\"\"\n"
37+
},
38+
{
39+
"id": 262,
40+
"logprob": -0.015281677,
41+
"special": false,
42+
"text": " "
43+
},
44+
{
45+
"id": 422,
46+
"logprob": -2.1425781,
47+
"special": false,
48+
"text": " if"
49+
},
50+
{
51+
"id": 1715,
52+
"logprob": -0.9238281,
53+
"special": false,
54+
"text": " request"
55+
},
56+
{
57+
"id": 13204,
58+
"logprob": -0.076660156,
59+
"special": false,
60+
"text": ".method"
61+
},
62+
{
63+
"id": 624,
64+
"logprob": -0.021987915,
65+
"special": false,
66+
"text": " =="
67+
},
68+
{
69+
"id": 364,
70+
"logprob": -0.39208984,
71+
"special": false,
72+
"text": " '"
73+
},
74+
{
75+
"id": 3019,
76+
"logprob": -0.10821533,
77+
"special": false,
78+
"text": "POST"
79+
}
80+
],
81+
"top_tokens": null
82+
},
83+
"generated_text": "\n \"\"\"\n if request.method == 'POST"
84+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
{
2+
"details": {
3+
"best_of_sequences": null,
4+
"finish_reason": "length",
5+
"generated_tokens": 10,
6+
"prefill": [
7+
{
8+
"id": 2323,
9+
"logprob": null,
10+
"text": "Test"
11+
},
12+
{
13+
"id": 1715,
14+
"logprob": -11.34375,
15+
"text": " request"
16+
}
17+
],
18+
"seed": 0,
19+
"tokens": [
20+
{
21+
"id": 13,
22+
"logprob": -2.2539062,
23+
"special": false,
24+
"text": "."
25+
},
26+
{
27+
"id": 578,
28+
"logprob": -0.15563965,
29+
"special": false,
30+
"text": " The"
31+
},
32+
{
33+
"id": 3622,
34+
"logprob": -0.8203125,
35+
"special": false,
36+
"text": " server"
37+
},
38+
{
39+
"id": 706,
40+
"logprob": 0.0,
41+
"special": false,
42+
"text": " has"
43+
},
44+
{
45+
"id": 539,
46+
"logprob": 0.0,
47+
"special": false,
48+
"text": " not"
49+
},
50+
{
51+
"id": 3686,
52+
"logprob": 0.0,
53+
"special": false,
54+
"text": " yet"
55+
},
56+
{
57+
"id": 3288,
58+
"logprob": 0.0,
59+
"special": false,
60+
"text": " sent"
61+
},
62+
{
63+
"id": 904,
64+
"logprob": 0.0,
65+
"special": false,
66+
"text": " any"
67+
},
68+
{
69+
"id": 828,
70+
"logprob": 0.0,
71+
"special": false,
72+
"text": " data"
73+
},
74+
{
75+
"id": 382,
76+
"logprob": -1.5517578,
77+
"special": false,
78+
"text": ".\n\n"
79+
}
80+
],
81+
"top_tokens": null
82+
},
83+
"generated_text": "Test request. The server has not yet sent any data.\n\n"
84+
}

0 commit comments

Comments
 (0)