[CI/Build] Added E2E Test For Compressed Tensors #5839

Merged

Changes from 11 commits
2 changes: 2 additions & 0 deletions requirements-test.txt
@@ -14,6 +14,8 @@ peft
requests
ray
sentence-transformers # required for embedding
sparseml==1.8.0 # required for compressed-tensors
compressed-tensors==0.4.0 # required for compressed-tensors

# Benchmarking
aiohttp
4 changes: 4 additions & 0 deletions tests/conftest.py
Expand Up @@ -147,6 +147,7 @@ def __init__(
model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False,
is_vision_model: bool = False,
is_sparseml_model: bool = False,
) -> None:
assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
@@ -164,6 +165,9 @@
else:
if is_vision_model:
auto_cls = AutoModelForVision2Seq
elif is_sparseml_model:
from sparseml.transformers import SparseAutoModelForCausalLM
auto_cls = SparseAutoModelForCausalLM
else:
auto_cls = AutoModelForCausalLM

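For context, the new is_sparseml_model flag only changes which auto-class loads the checkpoint. Below is a minimal sketch of that selection logic, assuming SparseAutoModelForCausalLM mirrors the Hugging Face from_pretrained API; the helper name load_causal_lm is illustrative, not part of the diff.

# Sketch of the loader selection added to __init__ above.
# Assumption: SparseAutoModelForCausalLM is a drop-in replacement for
# AutoModelForCausalLM that can decompress compressed-tensors checkpoints.
from transformers import AutoModelForCausalLM

def load_causal_lm(model_name: str, is_sparseml_model: bool = False):
    if is_sparseml_model:
        # Imported lazily, as in the diff, so sparseml is only required
        # when a compressed-tensors checkpoint is actually loaded.
        from sparseml.transformers import SparseAutoModelForCausalLM
        auto_cls = SparseAutoModelForCausalLM
    else:
        auto_cls = AutoModelForCausalLM
    return auto_cls.from_pretrained(model_name)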
49 changes: 49 additions & 0 deletions tests/models/test_compressed_tensors.py
@@ -0,0 +1,49 @@
"""Compares vllm vs sparseml for compressed-tensors

Note: vllm and sparseml do not have bitwise correctness,
so in this test, we just confirm that the top selected
tokens of the are in the top 5 selections of each other.
"""

import pytest

from tests.quantization.utils import is_quant_method_supported

from .utils import check_logprobs_close

MODELS = [
"nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test",
]

MAX_TOKENS = 32
NUM_LOGPROBS = 5


@pytest.mark.skipif(
not is_quant_method_supported("compressed-tensors"),
reason="compressed-tensors is not supported on this machine type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(
vllm_runner,
hf_runner,
example_prompts,
model_name,
) -> None:
# Run sparseml.
with hf_runner(model_name=model_name,
is_sparseml_model=True) as sparseml_model:

sparseml_outputs = sparseml_model.generate_greedy_logprobs_limit(
example_prompts, MAX_TOKENS, NUM_LOGPROBS)

# Run vllm.
with vllm_runner(model_name=model_name) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, MAX_TOKENS, NUM_LOGPROBS)

check_logprobs_close(
outputs_0_lst=sparseml_outputs,
outputs_1_lst=vllm_outputs,
name_0="sparseml",
name_1="vllm",
)
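Since check_logprobs_close is imported from the local utils module and its body is not part of this diff, here is an illustrative sketch of the cross-top-k comparison the docstring describes; the output format assumed here (per-step token ids plus {token_id: logprob} dicts) is an assumption, not the helper's actual signature.

from typing import Dict, Sequence

def logprobs_close(
    tokens_0: Sequence[int],
    logprobs_0: Sequence[Dict[int, float]],
    tokens_1: Sequence[int],
    logprobs_1: Sequence[Dict[int, float]],
) -> bool:
    # Walk the two greedy generations step by step.
    for step, (t0, t1) in enumerate(zip(tokens_0, tokens_1)):
        if t0 == t1:
            continue  # Same token chosen; nothing to check at this step.
        # On a mismatch, each model's pick must still rank inside the
        # other's top-NUM_LOGPROBS candidates.
        if t0 not in logprobs_1[step] or t1 not in logprobs_0[step]:
            return False
    return True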
@@ -33,7 +33,8 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]:
return [torch.float16, torch.bfloat16]

# TODO: confirm the correct minimum capability for compressed-tensors
def get_min_capability(self) -> int:
@classmethod
def get_min_capability(cls) -> int:
return 60

def get_name(self) -> str:
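The @classmethod change above matters because the minimum capability can then be queried on the class itself, before any config object is constructed. A short sketch of that calling pattern follows; the is_supported helper and device_capability parameter are hypothetical, not vLLM API.

class CompressedTensorsConfig:
    @classmethod
    def get_min_capability(cls) -> int:
        return 60  # Matches the value in the diff above.

def is_supported(device_capability: int) -> bool:
    # No instance needed: the check can run before any model is loaded.
    return device_capability >= CompressedTensorsConfig.get_min_capability()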