Minor fix on code style
merrymercy committed Aug 24, 2023
1 parent 0017fd9 commit d55bedd
Showing 2 changed files with 9 additions and 8 deletions.
fastchat/serve/inference.py (15 changes: 8 additions & 7 deletions)
@@ -66,6 +66,9 @@ def generate_stream(
     stream_interval: int = 2,
     judge_sent_end: bool = False,
 ):
+    if hasattr(model, "device"):
+        device = model.device
+
     # Read parameters
     prompt = params["prompt"]
     len_prompt = len(prompt)
@@ -95,12 +98,12 @@ def generate_stream(
 
     if model.config.is_encoder_decoder:
         encoder_output = model.encoder(
-            input_ids=torch.as_tensor([input_ids], device=model.device)
+            input_ids=torch.as_tensor([input_ids], device=device)
         )[0]
         start_ids = torch.as_tensor(
             [[model.generation_config.decoder_start_token_id]],
             dtype=torch.int64,
-            device=model.device,
+            device=device,
         )
 
     past_key_values = out = None
@@ -115,17 +118,15 @@
                 )
                 logits = model.lm_head(out[0])
             else:
-                out = model(
-                    torch.as_tensor([input_ids], device=model.device), use_cache=True
-                )
+                out = model(torch.as_tensor([input_ids], device=device), use_cache=True)
                 logits = out.logits
             past_key_values = out.past_key_values
         else:  # decoding
             if model.config.is_encoder_decoder:
                 out = model.decoder(
                     input_ids=torch.as_tensor(
                         [[token] if not sent_interrupt else output_ids],
-                        device=model.device,
+                        device=device,
                     ),
                     encoder_hidden_states=encoder_output,
                     use_cache=True,
@@ -138,7 +139,7 @@
                 out = model(
                     input_ids=torch.as_tensor(
                         [[token] if not sent_interrupt else output_ids],
-                        device=model.device,
+                        device=device,
                     ),
                     use_cache=True,
                     past_key_values=past_key_values if not sent_interrupt else None,
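The inference.py change swaps the repeated model.device lookups for a single device variable that is overridden only when the model object actually exposes a device attribute; otherwise the caller-supplied device string keeps being used. A minimal, self-contained sketch of that fallback pattern, using a hypothetical resolve_device helper and dummy model that are not part of the commit:

import torch

class DummyModel:
    # Hypothetical stand-in for a model object that reports its own device,
    # the way a HuggingFace PreTrainedModel does via model.device.
    device = torch.device("cpu")

def resolve_device(model, device):
    # Hypothetical helper, not in FastChat: prefer the device reported by the
    # model itself when available, else fall back to the caller's device string.
    if hasattr(model, "device"):
        return model.device
    return device

input_ids = [1, 2, 3]
# Mirrors the torch.as_tensor calls touched in this commit.
tensor = torch.as_tensor([input_ids], device=resolve_device(DummyModel(), "cpu"))
print(tensor.device)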
fastchat/train/train_mem.py (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
 # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
 
 # Need to call this before importing transformers.
-from fastchat.train.llama2_flash_attn_monkey_patch import (
+from fastchat.train.llama_flash_attn_monkey_patch import (
     replace_llama_attn_with_flash_attn,
 )
 
