
Commit 5a526b8

fix gen config passing
1 parent d90f889 · commit 5a526b8

4 files changed, +37 -28 lines changed

colossalai/inference/core/engine.py

Lines changed: 6 additions & 5 deletions

```diff
@@ -688,11 +688,12 @@ def prepare_input(self, batch: BatchBucket) -> Tuple[torch.Tensor, torch.Tensor,
         )
 
         batch_token_ids = None
-        config_dict = self.generation_config.to_dict()
-        # process repetition_penalty, no_repeat_ngram_size
-        for type in ["repetition_penalty", "no_repeat_ngram_size"]:
-            if type in config_dict and config_dict[type] is not None:
-                batch_token_ids = batch.batch_token_ids
+        if (
+            self.generation_config.repetition_penalty != 1.0
+            or self.generation_config.no_repeat_ngram_size > 0
+            or self.generation_config.forced_eos_token_id is not None
+        ):
+            batch_token_ids = batch.batch_token_ids
 
         # only when we have the graph for specific decoding batch size can we use the cuda graph for inference
         use_cuda_graph = False
```
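
In transformers' `GenerationConfig`, `repetition_penalty` defaults to 1.0, `no_repeat_ngram_size` to 0, and `forced_eos_token_id` to None, so the rewritten guard only materializes `batch_token_ids` when a processor that actually consumes the token history is enabled (`forced_eos_token_id` is included because `search_tokens` derives per-sequence lengths from `batch_token_ids` when it is set). A minimal sketch of that check in isolation; the helper `needs_batch_token_ids` is illustrative and not part of the commit:

```python
from transformers import GenerationConfig


def needs_batch_token_ids(generation_config: GenerationConfig) -> bool:
    # Illustrative helper (not in the commit): mirrors the new guard in
    # prepare_input -- the token-id history is only gathered when a processor
    # that actually consumes it is enabled.
    return (
        generation_config.repetition_penalty != 1.0
        or generation_config.no_repeat_ngram_size > 0
        or generation_config.forced_eos_token_id is not None
    )


print(needs_batch_token_ids(GenerationConfig()))                        # False: all defaults
print(needs_batch_token_ids(GenerationConfig(repetition_penalty=1.2)))  # True
```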

colossalai/inference/core/request_handler.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -8,8 +8,6 @@
 from colossalai.inference.config import InferenceConfig
 from colossalai.inference.flash_decoding_utils import FDIntermTensors
 from colossalai.inference.kv_cache import KVCacheManager, RPCKVCacheManager
-from colossalai.inference.logit_processors import logits_processor
-from colossalai.inference.sampler import *
 from colossalai.inference.struct import RequestStatus, Sequence
 from colossalai.logging import get_dist_logger
 
```

colossalai/inference/logit_processors.py

Lines changed: 11 additions & 11 deletions

```diff
@@ -134,20 +134,20 @@ def apply_top_p(logits, top_p: float):
 def apply_forced_bos_token_id(
     logits: torch.Tensor,
     sequence_lengths: Union[torch.Tensor, List[int]],
-    max_out_lengths: Union[torch.Tensor, List[int]],
+    max_lengths: Union[torch.Tensor, List[int]],
     bos_token_id: int,
 ):
     # NOTE For now, optimizations for encoder-decoder models have not been supported yet
     # And this function will never be called in the current implementation.
     if isinstance(sequence_lengths, torch.Tensor):
         sequence_lengths = sequence_lengths.tolist()
-    if isinstance(max_out_lengths, torch.Tensor):
-        max_out_lengths = max_out_lengths.tolist()
+    if isinstance(max_lengths, torch.Tensor):
+        max_lengths = max_lengths.tolist()
 
     select_indexes = []
     num_sequences = logits.shape[0]
     sequence_lengths = sequence_lengths[:num_sequences]
-    max_out_lengths = max_out_lengths[:num_sequences]
+    max_lengths = max_lengths[:num_sequences]
     for i, sequence_length in enumerate(sequence_lengths):
         if sequence_length == 1:
             select_indexes.append(i)
@@ -162,7 +162,7 @@ def apply_forced_bos_token_id(
 def apply_forced_eos_token_id(
     logits: torch.Tensor,
     sequence_lengths: Union[torch.Tensor, List[int]],
-    max_out_lengths: Union[torch.Tensor, List[int]],
+    max_lengths: Union[torch.Tensor, List[int]],
     eos_token_id: Union[int, List[int]],
 ):
     """
@@ -172,22 +172,22 @@ def apply_forced_eos_token_id(
 
     Args:
         logits(torch.Tensor): logits
-        sequence_lengths(torch.Tensor): sequence lengths
-        max_out_lengths(torch.Tensor): maximum output lengths for each sequence
+        sequence_lengths(torch.Tensor): sequence lengths including prompt and output tokens
+        max_lengths(torch.Tensor): the maximum length for each sequence
         eos_token_id(Union[int, List[int]]): forced eos token id
     """
     if isinstance(eos_token_id, int):
         eos_token_id = [eos_token_id]
     if isinstance(sequence_lengths, torch.Tensor):
         sequence_lengths = sequence_lengths.tolist()
-    if isinstance(max_out_lengths, torch.Tensor):
-        max_out_lengths = max_out_lengths.tolist()
+    if isinstance(max_lengths, torch.Tensor):
+        max_lengths = max_lengths.tolist()
 
     select_indexes = []
     num_sequences = logits.shape[0]
     sequence_lengths = sequence_lengths[:num_sequences]
-    max_out_lengths = max_out_lengths[:num_sequences]
-    for i, (sequence_length, max_out_length) in enumerate(zip(sequence_lengths, max_out_lengths)):
+    max_lengths = max_lengths[:num_sequences]
+    for i, (sequence_length, max_out_length) in enumerate(zip(sequence_lengths, max_lengths)):
         if sequence_length == max_out_length - 1:
             select_indexes.append(i)
     if select_indexes:
```
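
The hunks above only cover the argument rename and the row-selection loop; the masking performed under `if select_indexes:` lies outside the diff. As a rough sketch of what such a forced-EOS step typically does, assuming a hypothetical helper `force_eos_rows` that is not the repository's implementation: for every sequence whose length has reached `max_length - 1`, suppress all logits except the forced EOS id(s).

```python
from typing import List

import torch


def force_eos_rows(logits: torch.Tensor, select_indexes: List[int], eos_token_id: List[int]) -> torch.Tensor:
    # Hypothetical sketch (not the commit's code): for the selected rows,
    # rule out every vocabulary entry except the forced EOS id(s).
    if select_indexes:
        rows = torch.tensor(select_indexes)
        cols = torch.tensor(eos_token_id)
        logits[rows] = float("-inf")            # suppress all tokens in those rows
        logits[rows.unsqueeze(1), cols] = 0.0   # re-enable only the EOS token(s)
    return logits


# Toy usage: 3 sequences, vocab size 5; rows 0 and 2 have hit max_length - 1.
logits = force_eos_rows(torch.randn(3, 5), select_indexes=[0, 2], eos_token_id=[4])
```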

colossalai/inference/sampler.py

Lines changed: 20 additions & 10 deletions

```diff
@@ -3,7 +3,7 @@
 import torch
 from transformers.generation import GenerationConfig
 
-from colossalai.inference.logit_processors import logit_processor
+from colossalai.inference.logit_processors import get_logits_processor
 
 
 def greedy_sample(
@@ -86,18 +86,28 @@ def search_tokens(
     Sample tokens for finished requests.
     """
     # NOTE: need to decide the granularity to process logits (sequence or batch)
+    print(
+        f"CHECK search_tokens max_length {generation_config.max_length}; max_new_tokens {generation_config.max_new_tokens}"
+    )
     config_dict = generation_config.to_dict()
-    # process repetition_penalty, no_repeat_ngram_size
-    for type in ["repetition_penalty", "no_repeat_ngram_size"]:
-        if type in config_dict and config_dict[type] is not None:
-            logits = logit_processor(type, logits, config_dict[type], batch_token_ids)
+    if (repetition_penalty := config_dict.get("repetition_penalty", 1.0)) != 1.0:
+        logits = get_logits_processor("repetition_penalty", logits, repetition_penalty, batch_token_ids)
+    if (no_repeat_ngram_size := config_dict.get("no_repeat_ngram_size", 0)) > 0:
+        logits = get_logits_processor("no_repeat_ngram_size", logits, no_repeat_ngram_size, batch_token_ids)
+    if (forced_eos_token_id := config_dict.get("forced_eos_token_id", None)) is not None:
+        sequence_lengths = [len(batch_token_ids[i]) for i in range(len(batch_token_ids))]
+        max_out_lengths = [generation_config.max_length for _ in range(len(batch_token_ids))]
+        logits = get_logits_processor(
+            "forced_eos_token_id", logits, sequence_lengths, max_out_lengths, forced_eos_token_id
+        )
 
-    # do logit processor
     if generation_config.do_sample:
-        # process temperature, top_k, top_p
-        for type in ["temperature", "top_k", "top_p"]:
-            if type in config_dict and config_dict[type] is not None:
-                logits = logit_processor(type, logits, config_dict[type])
+        if (temperature := config_dict.get("temperature", 1.0)) != 1.0:
+            logits = get_logits_processor("temperature", logits, temperature)
+        if (top_k := config_dict.get("top_k", 0)) != 0:
+            logits = get_logits_processor("top_k", logits, top_k)
+        if (top_p := config_dict.get("top_p", 1.0)) < 1.0:
+            logits = get_logits_processor("top_p", logits, top_p)
 
     # calculate probs
     probs = torch.softmax(logits, dim=-1, dtype=torch.float)
```
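
With this change, `search_tokens` reads each option out of `generation_config.to_dict()` with an explicit no-op default and only invokes a logits processor when the value deviates from that default. A standalone sketch of the dispatch decision, using a hypothetical helper `active_processors` that returns processor names instead of calling `get_logits_processor` on real logits:

```python
from typing import List

from transformers import GenerationConfig


def active_processors(generation_config: GenerationConfig) -> List[str]:
    # Rough stand-in for the guards in search_tokens: a processor only runs
    # when its config value differs from the "no-op" default.
    cfg = generation_config.to_dict()
    active = []
    if cfg.get("repetition_penalty", 1.0) != 1.0:
        active.append("repetition_penalty")
    if cfg.get("no_repeat_ngram_size", 0) > 0:
        active.append("no_repeat_ngram_size")
    if cfg.get("forced_eos_token_id", None) is not None:
        active.append("forced_eos_token_id")
    if generation_config.do_sample:
        if cfg.get("temperature", 1.0) != 1.0:
            active.append("temperature")
        if cfg.get("top_k", 0) != 0:
            active.append("top_k")
        if cfg.get("top_p", 1.0) < 1.0:
            active.append("top_p")
    return active


print(active_processors(GenerationConfig(do_sample=True, temperature=0.7, top_p=0.9)))
# -> ['temperature', 'top_k', 'top_p']
```

Note that transformers' `top_k` default is 50, so with `do_sample=True` the top-k guard still fires unless `top_k` is explicitly set to 0.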
