[ CI/Build ] Added E2E Test For Compressed Tensors #5839


Merged
2 changes: 2 additions & 0 deletions requirements-test.txt
@@ -14,6 +14,8 @@ peft
requests
ray
sentence-transformers # required for embedding
sparseml==1.8.0 # required for compressed-tensors
compressed-tensors==0.4.0 # required for compressed-tensors

# Benchmarking
aiohttp
4 changes: 4 additions & 0 deletions tests/conftest.py
@@ -176,6 +176,7 @@ def __init__(
model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False,
is_vision_model: bool = False,
is_sparseml_model: bool = False,
) -> None:
assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
@@ -193,6 +194,9 @@ def __init__(
else:
if is_vision_model:
auto_cls = AutoModelForVision2Seq
elif is_sparseml_model:
from sparseml.transformers import SparseAutoModelForCausalLM
auto_cls = SparseAutoModelForCausalLM
else:
auto_cls = AutoModelForCausalLM

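For context, a minimal sketch of what the new flag does: with is_sparseml_model=True, HfRunner loads the checkpoint through sparseml's auto class instead of AutoModelForCausalLM, so the compressed-tensors checkpoint is handled on load. Assuming the sparseml==1.8.0 pin above, the equivalent standalone call would be roughly:

    # Minimal sketch: load the test checkpoint the way HfRunner does when
    # is_sparseml_model=True (assumes sparseml==1.8.0 from requirements-test.txt).
    from sparseml.transformers import SparseAutoModelForCausalLM

    model = SparseAutoModelForCausalLM.from_pretrained(
        "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test",
        torch_dtype="auto",
    )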
49 changes: 49 additions & 0 deletions tests/models/test_compressed_tensors.py
@@ -0,0 +1,49 @@
"""Compares vllm vs sparseml for compressed-tensors

Note: vllm and sparseml do not produce bitwise-identical results,
so in this test, we just confirm that the top selected token of
each model is among the top-5 selections of the other.
"""

import pytest

from tests.quantization.utils import is_quant_method_supported

from .utils import check_logprobs_close

MODELS = [
"nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test",
]

MAX_TOKENS = 32
NUM_LOGPROBS = 5


@pytest.mark.skipif(
not is_quant_method_supported("compressed-tensors"),
reason="compressed-tensors is not supported on this machine type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(
vllm_runner,
hf_runner,
example_prompts,
model_name,
) -> None:
# Run sparseml.
with hf_runner(model_name=model_name,
is_sparseml_model=True) as sparseml_model:

sparseml_outputs = sparseml_model.generate_greedy_logprobs_limit(
example_prompts, MAX_TOKENS, NUM_LOGPROBS)

# Run vllm.
with vllm_runner(model_name=model_name) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, MAX_TOKENS, NUM_LOGPROBS)

check_logprobs_close(
outputs_0_lst=sparseml_outputs,
outputs_1_lst=vllm_outputs,
name_0="sparseml",
name_1="vllm",
)
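
check_logprobs_close lives in tests/models/utils.py. A rough sketch of the cross-check it performs (a simplified outline, not the exact implementation):

    # Simplified outline of check_logprobs_close: greedy tokens may diverge,
    # but each model's pick must appear in the other's top-NUM_LOGPROBS set.
    def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
        for prompt_idx, (outputs_0, outputs_1) in enumerate(
                zip(outputs_0_lst, outputs_1_lst)):
            output_ids_0, _, logprobs_0 = outputs_0
            output_ids_1, _, logprobs_1 = outputs_1
            for idx, (id_0, id_1) in enumerate(zip(output_ids_0, output_ids_1)):
                if id_0 == id_1:
                    continue
                assert id_0 in logprobs_1[idx], (
                    f"{name_0} token not in {name_1} top logprobs "
                    f"(prompt {prompt_idx}, position {idx})")
                assert id_1 in logprobs_0[idx], (
                    f"{name_1} token not in {name_0} top logprobs "
                    f"(prompt {prompt_idx}, position {idx})")
                # Once the sequences diverge, later tokens are conditioned on
                # different prefixes and are no longer comparable.
                break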
@@ -34,7 +34,8 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]:
return [torch.float16, torch.bfloat16]

# Need to figure it out
def get_min_capability(self) -> int:
@classmethod
def get_min_capability(cls) -> int:
return 60

def get_name(self) -> str:
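The @classmethod change matters because the skipif gate above queries the minimum compute capability before any config object exists. A hedged sketch of such a gate (the real helper is is_quant_method_supported in tests/quantization/utils.py; the exact code may differ):

    # Sketch of the capability gate enabled by the @classmethod change.
    # The return value 60 encodes CUDA compute capability 6.0 (major * 10 + minor).
    import torch

    from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS

    def is_quant_method_supported(quant_method: str) -> bool:
        major, minor = torch.cuda.get_device_capability()
        # No instance needed: get_min_capability is callable on the class itself.
        return (major * 10 + minor
                >= QUANTIZATION_METHODS[quant_method].get_min_capability())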