Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for exl2-quantized models #1965

Merged
merged 2 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/basic_tutorials/launcher.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Options:
Possible values:
- awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
- exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
- gptq: 4 bit quantization. Requires a specific GTPQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
- bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
Expand Down
1 change: 0 additions & 1 deletion docs/source/conceptual/guidance.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

## What is Guidance?


Guidance is a feature that allows users to constrain the generation of a large language model with a specified grammar. This feature is particularly useful when you want to generate text that follows a specific structure or uses a specific set of words or produce output in a specific format. A prominent example is JSON grammar, where the model is forced to output valid JSON.

## How is it used?
Expand Down
22 changes: 19 additions & 3 deletions integration-tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

class ResponseComparator(JSONSnapshotExtension):
rtol = 0.2
ignore_logprob = False

def serialize(
self,
Expand Down Expand Up @@ -95,7 +96,10 @@ def eq_token(token: Token, other: Token) -> bool:
return (
token.id == other.id
and token.text == other.text
and math.isclose(token.logprob, other.logprob, rel_tol=self.rtol)
and (
self.ignore_logprob
or math.isclose(token.logprob, other.logprob, rel_tol=self.rtol)
)
and token.special == other.special
)

Expand All @@ -105,8 +109,11 @@ def eq_prefill_token(prefill_token: InputToken, other: InputToken) -> bool:
prefill_token.id == other.id
and prefill_token.text == other.text
and (
math.isclose(
prefill_token.logprob, other.logprob, rel_tol=self.rtol
self.ignore_logprob
or math.isclose(
prefill_token.logprob,
other.logprob,
rel_tol=self.rtol,
)
if prefill_token.logprob is not None
else prefill_token.logprob == other.logprob
Expand Down Expand Up @@ -223,6 +230,10 @@ class GenerousResponseComparator(ResponseComparator):
rtol = 0.75


class IgnoreLogProbResponseComparator(ResponseComparator):
ignore_logprob = True


class LauncherHandle:
def __init__(self, port: int):
self.client = AsyncClient(f"http://localhost:{port}")
Expand Down Expand Up @@ -274,6 +285,11 @@ def generous_response_snapshot(snapshot):
return snapshot.use_extension(GenerousResponseComparator)


@pytest.fixture
def ignore_logprob_response_snapshot(snapshot):
return snapshot.use_extension(IgnoreLogProbResponseComparator)


@pytest.fixture(scope="module")
def event_loop():
loop = asyncio.get_event_loop()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2323,
"logprob": null,
"text": "Test"
},
{
"id": 1715,
"logprob": -11.4375,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 25,
"logprob": -2.9316406,
"special": false,
"text": ":"
},
{
"id": 330,
"logprob": -3.5136719,
"special": false,
"text": " \""
},
{
"id": 489,
"logprob": -0.7783203,
"special": false,
"text": " +"
},
{
"id": 1715,
"logprob": -1.2314453,
"special": false,
"text": " request"
},
{
"id": 489,
"logprob": -2.0019531,
"special": false,
"text": " +"
},
{
"id": 2990,
"logprob": -1.5009766,
"special": false,
"text": " \"\\"
},
{
"id": 77,
"logprob": -0.057434082,
"special": false,
"text": "n"
},
{
"id": 702,
"logprob": -1.4912109,
"special": false,
"text": "\"\n"
},
{
"id": 262,
"logprob": -1.2636719,
"special": false,
"text": " "
},
{
"id": 557,
"logprob": -2.4042969,
"special": false,
"text": " }\n\n"
}
],
"top_tokens": null
},
"generated_text": ": \" + request + \"\\n\"\n }\n\n"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2323,
"logprob": null,
"text": "Test"
},
{
"id": 1715,
"logprob": -11.453125,
"text": " request"
}
],
"seed": 0,
"tokens": [
{
"id": 13,
"logprob": -1.9980469,
"special": false,
"text": "."
},
{
"id": 578,
"logprob": -0.15795898,
"special": false,
"text": " The"
},
{
"id": 3622,
"logprob": -1.0458984,
"special": false,
"text": " server"
},
{
"id": 31680,
"logprob": -1.3623047,
"special": false,
"text": " responds"
},
{
"id": 449,
"logprob": 0.0,
"special": false,
"text": " with"
},
{
"id": 264,
"logprob": 0.0,
"special": false,
"text": " a"
},
{
"id": 330,
"logprob": -0.5678711,
"special": false,
"text": " \""
},
{
"id": 1049,
"logprob": -0.12322998,
"special": false,
"text": "200"
},
{
"id": 10619,
"logprob": 0.0,
"special": false,
"text": " OK"
},
{
"id": 1,
"logprob": 0.0,
"special": false,
"text": "\""
}
],
"top_tokens": null
},
"generated_text": "Test request. The server responds with a \"200 OK\""
}
Loading
Loading