6 changes: 3 additions & 3 deletions .azure-pipelines/scripts/ut/run_ut_hpu.sh
@@ -7,7 +7,7 @@ export TQDM_MININTERVAL=60
pip install pytest-cov pytest-html
pip list

cd /auto-round/test/test_cpu || exit 1
cd /auto-round/test/test_hpu || exit 1
find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} +

export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
@@ -19,8 +19,8 @@ LOG_DIR=/auto-round/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut.log

find . -name "test*hpu_only.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh
find . -name "test*hpu_only.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh
find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh
find . -name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh

cat run_lazy.sh
bash run_lazy.sh 2>&1 | tee ${ut_log_name}
14 changes: 7 additions & 7 deletions auto_round/compressors/base.py
@@ -81,9 +81,9 @@
infer_bits_by_data_type,
init_cache,
is_debug_mode,
is_hpex_available,
is_mx_fp,
is_nv_fp,
is_optimum_habana_available,
is_standard_fp,
is_static_wfp8afp8,
is_wfp8afp8,
@@ -380,8 +380,8 @@ def __init__(
self._check_configs()
torch.set_printoptions(precision=3, sci_mode=True)

if is_optimum_habana_available():
logger.info("optimum Habana is available, import htcore explicitly.")
if is_hpex_available():
logger.info("habana_frameworks is available, import htcore explicitly.")
import habana_frameworks.torch.core as htcore # pylint: disable=E0401
import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401]

@@ -3279,7 +3279,7 @@ def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Ten
"""
scale_loss = loss * 1000
scale_loss.backward()
if is_optimum_habana_available():
if is_hpex_available():
htcore.mark_step()
return scale_loss

@@ -3296,7 +3296,7 @@ def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any):
"""
optimizer.step()
# for hpu
if is_optimum_habana_available():
if is_hpex_available():
htcore.mark_step()
optimizer.zero_grad()
lr_schedule.step()
@@ -3478,7 +3478,7 @@ def _scale_loss_and_backward(self, scaler, loss):
loss = scaler.scale(loss)

loss.backward()
if is_optimum_habana_available():
if is_hpex_available():
htcore.mark_step()
return loss

@@ -3492,5 +3492,5 @@ def _step(self, scaler, optimizer, lr_schedule):
optimizer.step()
optimizer.zero_grad()
lr_schedule.step()
if is_optimum_habana_available():
if is_hpex_available():
htcore.mark_step()
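Every call site touched in base.py follows the same shape: run the backward or optimizer step, then issue an HPU graph break only when the Habana stack is present. A condensed illustration of that pattern (illustrative only, not code from the PR; the function name is invented, and htcore here refers to the LazyImport defined in auto_round/utils.py):

from auto_round.utils import htcore, is_hpex_available

def step_with_optional_hpu_sync(optimizer, lr_schedule):
    optimizer.step()
    if is_hpex_available():
        htcore.mark_step()  # flush queued ops so the HPU graph executes
    optimizer.zero_grad()
    lr_schedule.step()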
4 changes: 2 additions & 2 deletions auto_round/data_type/utils.py
@@ -224,9 +224,9 @@ def float8_e4m3fn_hpu_ste(x: torch.Tensor):

@lru_cache(None)
def get_gaudi_fp8_ste_func():
from auto_round.utils import is_hpu_supported
from auto_round.utils import is_hpex_available

if is_hpu_supported():
if is_hpex_available():
fn = float8_e4m3fn_hpu_ste
logger.warning_once("Using HPU STE for FP8")
else:
4 changes: 2 additions & 2 deletions auto_round/inference/auto_quantizer.py
@@ -42,7 +42,7 @@
from transformers.utils.quantization_config import AwqConfig, GPTQConfig, QuantizationConfigMixin, QuantizationMethod

from auto_round.inference.convert_model import convert_hf_model, infer_target_device, post_init
from auto_round.utils import is_hpu_supported
from auto_round.utils import is_hpex_available

logger = getLogger(__name__)
import sys
@@ -126,7 +126,7 @@ def from_config(cls, quantization_config: Union[QuantizationConfigMixin, Dict],
f"Unknown quantization type, got {quant_method} - supported types are:"
f" {list(AUTO_QUANTIZER_MAPPING.keys())}"
)
if "auto-round" in quant_method or is_hpu_supported(): # pragma: no cover
if "auto-round" in quant_method or is_hpex_available(): # pragma: no cover
target_cls = AutoRoundQuantizer
else:
target_cls = AUTO_QUANTIZER_MAPPING[quant_method]
4 changes: 2 additions & 2 deletions auto_round/inference/convert_model.py
@@ -39,7 +39,7 @@
get_block_names,
get_layer_names_in_block,
get_module,
is_hpu_supported,
is_hpex_available,
set_module,
)

@@ -165,7 +165,7 @@ def get_available_devices():
if torch.cuda.is_available():
devices.append("cuda")

if is_hpu_supported():
if is_hpex_available():
devices.append("hpu")

if hasattr(torch, "xpu") and torch.xpu.is_available():
42 changes: 26 additions & 16 deletions auto_round/utils.py
@@ -116,7 +116,7 @@ def __init__(self, module_name):
"""Init LazyImport object.

Args:
module_name (string): The name of module imported later
module_name (string): The name of module imported later
"""
self.module_name = module_name
self.module = None
@@ -145,12 +145,31 @@ def __call__(self, *args, **kwargs):
htcore = LazyImport("habana_frameworks.torch.core")


################ Check available sys.module to decide behavior #################
def is_package_available(package_name: str) -> bool:
"""Check if the package exists in the environment without importing.

Args:
package_name (str): package name
"""
from importlib.util import find_spec

package_spec = find_spec(package_name)
return package_spec is not None


## check hpex
if is_package_available("habana_frameworks"):
_hpex_available = True
import habana_frameworks.torch.hpex # pylint: disable=E0401
else:
_hpex_available = False


@torch._dynamo.disable()
@lru_cache(None)
def is_optimum_habana_available():
from transformers.utils.import_utils import is_optimum_available

return is_optimum_available() and importlib.util.find_spec("optimum.habana") is not None
def is_hpex_available():
return _hpex_available


def get_module(module, key):
@@ -553,7 +572,7 @@ def is_valid_digit(s):
if torch.cuda.is_available():
device = torch.device("cuda")
# logger.info("Using GPU device")
elif is_optimum_habana_available(): # pragma: no cover
elif is_hpex_available(): # pragma: no cover
device = torch.device("hpu")
# logger.info("Using HPU device")
elif torch.xpu.is_available(): # pragma: no cover
@@ -780,15 +799,6 @@ def is_autoround_exllamav2_available():
return res


@lru_cache(None)
def is_hpu_supported(): # pragma: no cover
try:
import habana_frameworks.torch.core as htcore # pylint: disable=E0401
except ImportError as e:
return False
return True


def get_library_version(library_name):
from packaging.version import Version

@@ -906,7 +916,7 @@ def _clear_memory_for_cpu_and_cuda(tensor=None):

@torch._dynamo.disable()
def clear_memory(tensor=None):
if is_hpu_supported():
if is_hpex_available():
# hpu does not have empty_cache
return
else:
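The new availability check replaces both is_optimum_habana_available() and the try/except-based is_hpu_supported() removed above. importlib.util.find_spec only locates a package spec, so the probe itself does not execute the package; the one-time import of habana_frameworks.torch.hpex happens at module load only when the spec is found. A small standalone sketch of that detection idiom (the module names in the demo prints are arbitrary examples, not part of the PR):

import sys
from importlib.util import find_spec

def is_package_available(package_name: str) -> bool:
    # Same idiom as the helper added above: resolve the spec without importing.
    return find_spec(package_name) is not None

print(is_package_available("sqlite3"))          # True when the stdlib module is installed
print("sqlite3" in sys.modules)                 # typically still False: nothing was imported
print(is_package_available("no_such_package"))  # False: find_spec returns None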
11 changes: 0 additions & 11 deletions test/test_cpu/_test_helpers.py
@@ -1,14 +1,3 @@
import pytest


def is_pytest_mode_compile():
return pytest.mode == "compile"


def is_pytest_mode_lazy():
return pytest.mode == "lazy"


def model_infer(model, tokenizer, apply_chat_template=False):
prompts = [
"Hello,my name is",
43 changes: 43 additions & 0 deletions test/test_hpu/_test_helpers.py
@@ -0,0 +1,43 @@
import pytest


def is_pytest_mode_compile():
return pytest.mode == "compile"


def is_pytest_mode_lazy():
return pytest.mode == "lazy"


def model_infer(model, tokenizer, apply_chat_template=False):
prompts = [
"Hello,my name is",
# "The president of the United States is",
# "The capital of France is",
# "The future of AI is",
]
if apply_chat_template:
texts = []
for prompt in prompts:
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
texts.append(text)
prompts = texts

inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True)

outputs = model.generate(
input_ids=inputs["input_ids"].to(model.device),
attention_mask=inputs["attention_mask"].to(model.device),
do_sample=False, ## change this to follow official usage
max_new_tokens=5,
)
generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)]

decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

for i, prompt in enumerate(prompts):
print(f"Prompt: {prompt}")
print(f"Generated: {decoded_outputs[i]}")
print("-" * 50)
return decoded_outputs[0]
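is_pytest_mode_compile() and is_pytest_mode_lazy() read pytest.mode, and the CI script passes --mode compile, so a conftest.py under test/test_hpu presumably registers that option and stores the value. The conftest itself is not shown in this diff; the hook below is a hedged sketch whose option name, default, and attribute are inferred from the usage above, not copied from the repository:

import pytest

def pytest_addoption(parser):
    # Hypothetical hook: register the --mode flag used by run_ut_hpu.sh.
    parser.addoption("--mode", action="store", default="lazy",
                     choices=("lazy", "compile"),
                     help="HPU execution mode the tests should assume")

def pytest_configure(config):
    # Hypothetical hook: expose the selection as pytest.mode for the helpers above.
    pytest.mode = config.getoption("--mode")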
File renamed without changes.
1 change: 1 addition & 0 deletions test/test_hpu/requirements.txt
@@ -0,0 +1 @@

6 changes: 3 additions & 3 deletions (relocated HPU-only test file)
@@ -2,7 +2,7 @@
import torch
from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy

from auto_round.utils import is_hpu_supported
from auto_round.utils import is_hpex_available


def run_opt_125m_on_hpu():
@@ -28,13 +28,13 @@ def run_opt_125m_on_hpu():
assert q_model is not None, "Expected q_model to be not None"


@pytest.mark.skipif(not is_hpu_supported(), reason="HPU is not supported")
@pytest.mark.skipif(not is_hpex_available(), reason="HPU is not supported")
@pytest.mark.skipif(not is_pytest_mode_lazy(), reason="Only for lazy mode")
def test_opt_125m_lazy_mode():
run_opt_125m_on_hpu()


@pytest.mark.skipif(not is_hpu_supported(), reason="HPU is not supported")
@pytest.mark.skipif(not is_hpex_available(), reason="HPU is not supported")
@pytest.mark.skipif(not is_pytest_mode_compile(), reason="Only for compile mode")
def test_opt_125m_compile_mode():
torch._dynamo.reset()
6 changes: 3 additions & 3 deletions test/test_cpu/test_hpu.py → test/test_hpu/test_inference.py
@@ -18,7 +18,7 @@ def __iter__(self):
yield torch.ones([1, 10], dtype=torch.long)


def is_hpu_supported():
def is_hpex_available():
try:
import habana_frameworks.torch.core as htcore # pylint: disable=E0401
except ImportError as e:
@@ -40,7 +40,7 @@ def tearDownClass(self):
shutil.rmtree("runs", ignore_errors=True)

def test_autogptq_format_hpu_inference(self):
if not is_hpu_supported():
if not is_hpex_available():
return
try:
import auto_gptq
@@ -73,7 +73,7 @@ def test_autogptq_format_hpu_inference(self):
shutil.rmtree("./saved", ignore_errors=True)

def test_autoround_format_hpu_inference(self):
if not is_hpu_supported():
if not is_hpex_available():
return
bits, group_size, sym = 4, 128, False
autoround = AutoRound(