
Commit 77c608f

fix tp device issue caused by device_map (#833)
1 parent dd7811e commit 77c608f

2 files changed: +85 -74 lines changed

auto_round/utils.py

Lines changed: 10 additions & 1 deletion
@@ -587,7 +587,16 @@ def is_valid_digit(s):
     elif isinstance(device, torch.device):
         device = str(device)
     elif isinstance(device, str):  ## for cuda:0
-        device = device
+        if device == "tp":  # pragma: no cover
+            # should not specify card, e.g., cuda:0
+            if torch.cuda.is_available():
+                device = "cuda"
+            elif is_hpex_available():
+                device = "hpu"
+            else:
+                device = "cpu"
+        else:
+            device = device
     return device
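For context, a minimal sketch of what the patched branch does when it sees "tp": resolving it to a generic device string rather than a pinned card. The standalone helper name normalize_device and the is_hpex_available stub below are assumptions made for this sketch, not the actual auto_round API (the real logic sits inside a larger function in auto_round/utils.py).

import torch

def is_hpex_available() -> bool:
    # Stand-in for the real HPU check used by auto_round (assumption).
    try:
        import habana_frameworks.torch  # noqa: F401
        return True
    except ImportError:
        return False

def normalize_device(device) -> str:
    # Mirrors the patched logic above (helper name is hypothetical).
    if isinstance(device, torch.device):
        device = str(device)
    elif isinstance(device, str):  # e.g., "cuda:0"
        if device == "tp":
            # Tensor parallelism must not pin a specific card such as cuda:0,
            # so resolve "tp" to a generic device string instead.
            if torch.cuda.is_available():
                device = "cuda"
            elif is_hpex_available():
                device = "hpu"
            else:
                device = "cpu"
    return device

print(normalize_device("tp"))      # "cuda", "hpu", or "cpu", never "cuda:0"
print(normalize_device("cuda:0"))  # explicit card strings pass through unchanged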

test/test_hpu/test_inference.py

Lines changed: 75 additions & 73 deletions
@@ -26,76 +26,78 @@ def is_hpex_available():
     return True
 
 
-class TestAutoRound(unittest.TestCase):
-    @classmethod
-    def setUpClass(self):
-        model_name = "facebook/opt-125m"
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        self.llm_dataloader = LLMDataLoader()
-
-    @classmethod
-    def tearDownClass(self):
-        shutil.rmtree("./saved", ignore_errors=True)
-        shutil.rmtree("runs", ignore_errors=True)
-
-    def test_autogptq_format_hpu_inference(self):
-        if not is_hpex_available():
-            return
-        try:
-            import auto_gptq
-        except:
-            return
-        bits, group_size, sym = 4, 128, False
-        autoround = AutoRound(
-            self.model,
-            self.tokenizer,
-            bits=bits,
-            group_size=group_size,
-            sym=sym,
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        autoround.quantize()
-        quantized_model_path = "./saved"
-
-        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_gptq")
-        model = (
-            AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
-            .to("hpu")
-            .to(torch.float32)
-        )
-        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
-        text = "There is a girl who likes adventure,"
-        inputs = tokenizer(text, return_tensors="pt").to(model.device)
-        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_autoround_format_hpu_inference(self):
-        if not is_hpex_available():
-            return
-        bits, group_size, sym = 4, 128, False
-        autoround = AutoRound(
-            self.model,
-            self.tokenizer,
-            bits=bits,
-            group_size=group_size,
-            sym=sym,
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        autoround.quantize()
-        quantized_model_path = "./saved"
-
-        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round")
-
-        model = (
-            AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto").to("hpu").to(torch.float32)
-        )
-        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
-        text = "There is a girl who likes adventure,"
-        inputs = tokenizer(text, return_tensors="pt").to(model.device)
-        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
-        shutil.rmtree("./saved", ignore_errors=True)
+# TODO: This test case is temporarily commented out since it has not been tested for a long time. We need to add it back and change it to pytest format.
+
+# class TestAutoRound(unittest.TestCase):
+#     @classmethod
+#     def setUpClass(self):
+#         model_name = "facebook/opt-125m"
+#         self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+#         self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+#         self.llm_dataloader = LLMDataLoader()
+
+#     @classmethod
+#     def tearDownClass(self):
+#         shutil.rmtree("./saved", ignore_errors=True)
+#         shutil.rmtree("runs", ignore_errors=True)
+
+#     def test_autogptq_format_hpu_inference(self):
+#         if not is_hpex_available():
+#             return
+#         try:
+#             import auto_gptq
+#         except:
+#             return
+#         bits, group_size, sym = 4, 128, False
+#         autoround = AutoRound(
+#             self.model,
+#             self.tokenizer,
+#             bits=bits,
+#             group_size=group_size,
+#             sym=sym,
+#             iters=2,
+#             seqlen=2,
+#             dataset=self.llm_dataloader,
+#         )
+#         autoround.quantize()
+#         quantized_model_path = "./saved"
+
+#         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_gptq")
+#         model = (
+#             AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
+#             .to("hpu")
+#             .to(torch.float32)
+#         )
+#         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
+#         text = "There is a girl who likes adventure,"
+#         inputs = tokenizer(text, return_tensors="pt").to(model.device)
+#         print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
+#         shutil.rmtree("./saved", ignore_errors=True)
+
+#     def test_autoround_format_hpu_inference(self):
+#         if not is_hpex_available():
+#             return
+#         bits, group_size, sym = 4, 128, False
+#         autoround = AutoRound(
+#             self.model,
+#             self.tokenizer,
+#             bits=bits,
+#             group_size=group_size,
+#             sym=sym,
+#             iters=2,
+#             seqlen=2,
+#             dataset=self.llm_dataloader,
+#         )
+#         autoround.quantize()
+#         quantized_model_path = "./saved"
+
+#         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round")
+
+#         model = (
+#             AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto").to("hpu").to(torch.float32)
+#         )
+#         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
+#         text = "There is a girl who likes adventure,"
+#         inputs = tokenizer(text, return_tensors="pt").to(model.device)
+#         print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
+#         shutil.rmtree("./saved", ignore_errors=True)
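The TODO above asks for this suite to return in pytest format; below is a minimal sketch of what that conversion could look like, assuming the existing is_hpex_available and LLMDataLoader helpers stay in test_inference.py. The fixture name and parameter choices mirror the old unittest code but are otherwise assumptions, not a committed design.

import shutil

import pytest
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

@pytest.fixture(scope="module")
def opt_125m():
    # Same tiny model the old unittest suite used.
    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype="auto", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True)
    yield model, tokenizer
    shutil.rmtree("./saved", ignore_errors=True)

@pytest.mark.skipif(not is_hpex_available(), reason="requires an HPU stack")
def test_autoround_format_hpu_inference(opt_125m):
    model, tokenizer = opt_125m
    autoround = AutoRound(
        model,
        tokenizer,
        bits=4,
        group_size=128,
        sym=False,
        iters=2,
        seqlen=2,
        dataset=LLMDataLoader(),  # existing helper in this test module
    )
    autoround.quantize()
    autoround.save_quantized(output_dir="./saved", inplace=False, format="auto_round")
    qmodel = AutoModelForCausalLM.from_pretrained("./saved", device_map="auto").to("hpu").to(torch.float32)
    inputs = tokenizer("There is a girl who likes adventure,", return_tensors="pt").to(qmodel.device)
    print(tokenizer.decode(qmodel.generate(**inputs, max_new_tokens=50)[0]))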
