
Commit 3f1fc85

refactor and add
1 parent bfad393 commit 3f1fc85

File tree

4 files changed: +115 −26 lines

colossalai/inference/config.py

Lines changed: 1 addition & 0 deletions

@@ -137,6 +137,7 @@ class InferenceConfig:
     top_k: Optional[int] = None
     top_p: Optional[float] = None
     min_p: Optional[float] = None
+    forced_eos_token_id: int = None

     # speculative decoding configs
     max_n_spec_tokens: int = 5
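
The new field sits beside the other sampling options on InferenceConfig. A minimal sketch of enabling it (hedged: token id 2 is an arbitrary example value, and constructor arguments not shown are assumed to keep their defaults):

    from colossalai.inference.config import InferenceConfig

    # Force an EOS token to be emitted once a sequence reaches its
    # maximum output length; id 2 is an arbitrary example value.
    config = InferenceConfig(
        top_k=50,
        top_p=0.9,
        forced_eos_token_id=2,
    )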

colossalai/inference/core/engine.py

Lines changed: 3 additions & 3 deletions

@@ -424,7 +424,7 @@ def steps_spec_dec(self) -> List[Sequence]:

         # 2. Prefill main model (Verifier) - fill past kv cache for main model
         logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
-        next_tokens = self.request_handler.search_tokens(self.generation_config, logits)
+        next_tokens = self.request_handler.search_tokens(logits, batch, self.generation_config)
         # append new inputs to the batch, temporarily
         batch.append_batch_tokens(next_tokens)
         self.request_handler.allocate_batch_spec_dec(batch, 1)

@@ -472,7 +472,7 @@ def steps_spec_dec(self) -> List[Sequence]:
         input_token_ids, output_tensor, input_meta_data = self.prepare_input(batch)
         logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)

-        next_tokens = self.request_handler.search_tokens(self.generation_config, logits)
+        next_tokens = self.request_handler.search_tokens(logits, batch, self.generation_config)

         # 5. Compare and process the results
         diff_indexes = torch.nonzero(~(next_tokens[:-1] == next_token_ids_spec))

@@ -738,7 +738,7 @@ def step(self) -> List[str]:
         logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
         if self.inference_config.pad_input:
             logits = logits[:, -1, :]
-        next_tokens = self.request_handler.search_tokens(self.generation_config, logits)
+        next_tokens = self.request_handler.search_tokens(logits, batch, self.generation_config)
         self.request_handler.append_next_tokens(next_tokens)
         finished_sequences = self.request_handler.update()
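
Why thread the batch through? search_tokens previously needed only the GenerationConfig and the logits; the forced-EOS processor additionally needs each sequence's current and maximum output length, which live on the BatchBucket. A hypothetical stub (not ColossalAI code) showing just the new calling convention:

    import torch

    class DummyHandler:
        # Hypothetical stand-in that mirrors the new search_tokens signature:
        # logits first, then the live batch, then the generation config.
        def search_tokens(self, logits, batch, generation_config):
            return logits.argmax(dim=-1)

    next_tokens = DummyHandler().search_tokens(torch.randn(2, 32000), batch=None, generation_config=None)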

colossalai/inference/core/request_handler.py

Lines changed: 24 additions & 3 deletions

@@ -8,7 +8,7 @@
 from colossalai.inference.config import InferenceConfig
 from colossalai.inference.flash_decoding_utils import FDIntermTensors
 from colossalai.inference.kv_cache import KVCacheManager
-from colossalai.inference.logit_processors import logit_processor
+from colossalai.inference.logit_processors import logits_processor
 from colossalai.inference.sampler import *
 from colossalai.inference.struct import RequestStatus, Sequence
 from colossalai.logging import get_dist_logger

@@ -331,9 +331,19 @@ def check_unfinished_seqs(self) -> bool:
     def total_requests_in_batch_bucket(self) -> int:
         return self.prefill_bb.current_batch_size + self.running_bb.current_batch_size

-    def search_tokens(self, generation_config: GenerationConfig, logits):
+    def search_tokens(
+        self,
+        logits: torch.Tensor,
+        batch_bucket: BatchBucket,
+        generation_config: GenerationConfig,
+    ):
         """
         Sample tokens for finished requests.
+
+        Args:
+            logits (torch.Tensor): [num_seqs, vocab_size] The logits tensor.
+            batch_bucket (BatchBucket): The batch whose sequences carry the per-sequence output-length bounds.
+            generation_config (GenerationConfig): The generation configuration.
         """

         # do logit processor

@@ -342,7 +352,18 @@ def search_tokens(self, generation_config: GenerationConfig, logits):
         config_dict = generation_config.to_dict()
         for type in ["temperature", "top_k", "top_p"]:
             if type in config_dict and config_dict[type] is not None:
-                logits = logit_processor(type, logits, config_dict[type])
+                logits = logits_processor(type, logits, config_dict[type])
+
+        forced_eos_token_id = config_dict.get("forced_eos_token_id", None)
+        if forced_eos_token_id is not None:
+            num_seqs = len(batch_bucket)
+            seq_out_lengths = [0] * num_seqs
+            max_out_lengths = [0] * num_seqs
+            for i, seq in enumerate(batch_bucket.seqs_li):
+                # retrieve the current output length and the maximum output length bound to each Sequence
+                seq_out_lengths[i], max_out_lengths[i] = seq.output_len, seq.max_output_len
+            logits = logits_processor("forced_eos_token_id", logits, seq_out_lengths, max_out_lengths, forced_eos_token_id)

         # calculate probs
         probs = torch.softmax(logits, dim=-1, dtype=torch.float)
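
Because search_tokens pulls the setting out of generation_config.to_dict(), the feature activates whenever the GenerationConfig carries a non-None forced_eos_token_id. A small sketch using Hugging Face's GenerationConfig (token id 2 is an arbitrary example):

    from transformers import GenerationConfig

    gen_config = GenerationConfig(temperature=0.8, top_k=50, forced_eos_token_id=2)

    # search_tokens only checks the raw dict, so the key just has to be present and non-None
    config_dict = gen_config.to_dict()
    assert config_dict.get("forced_eos_token_id", None) == 2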

colossalai/inference/logit_processors.py

Lines changed: 87 additions & 20 deletions

@@ -1,24 +1,27 @@
+import logging
+from typing import List, Union
+
 import torch
 import torch.nn.functional as F

-_LOGIT_PROCESSOR_MAP = {}
+_LOGITS_PROCESSOR_MAP = {}


-def register_logit_processor(process_type):
+def register_logits_processor(process_type):
     """
     register flops computation function for operation.
     """

     def register(func):
-        global _LOGIT_PROCESSOR_MAP
-        _LOGIT_PROCESSOR_MAP[process_type] = func
+        global _LOGITS_PROCESSOR_MAP
+        _LOGITS_PROCESSOR_MAP[process_type] = func
         return func

     return register


-@register_logit_processor("temperature")
-def temperature_logit_process(logits, temperature: float):
+@register_logits_processor("temperature")
+def temperature_logits_process(logits, temperature: float):
     """
     apply temperature scaling.
     """

@@ -32,8 +35,8 @@ def temperature_logit_process(logits, temperature: float):
     return logits if temperature == 1.0 else logits / temperature


-@register_logit_processor("top_k")
-def top_k_logit_processor(logits, top_k: int):
+@register_logits_processor("top_k")
+def top_k_logits_processor(logits, top_k: int):
     """
     top_k logit processor
     """

@@ -46,8 +49,8 @@ def top_k_logit_processor(logits, top_k: int):
     return logits


-@register_logit_processor("top_p")
-def top_p_logit_processor(logits, top_p: float):
+@register_logits_processor("top_p")
+def top_p_logits_processor(logits, top_p: float):
     """
     top_p logit processor
     """

@@ -68,24 +71,88 @@ def top_p_logit_processor(logits, top_p: float):
     return logits


-def logit_processor(processor: str, logits, attrs):
+@register_logits_processor("forced_bos_token_id")
+def forced_bos_token_processor(
+    logits: torch.Tensor,
+    sequence_lengths: Union[torch.Tensor, List[int]],
+    max_out_lengths: Union[torch.Tensor, List[int]],
+    bos_token_id: int,
+):
+    # NOTE For now, optimizations for encoder-decoder models have not been supported yet,
+    # and this function will never be called in the current implementation.
+    if isinstance(sequence_lengths, torch.Tensor):
+        sequence_lengths = sequence_lengths.tolist()
+    if isinstance(max_out_lengths, torch.Tensor):
+        max_out_lengths = max_out_lengths.tolist()
+
+    select_indexes = []
+    num_sequences = logits.shape[0]
+    sequence_lengths = sequence_lengths[:num_sequences]
+    max_out_lengths = max_out_lengths[:num_sequences]
+    for i, sequence_length in enumerate(sequence_lengths):
+        if sequence_length == 1:
+            select_indexes.append(i)
+    if select_indexes:
+        logits[select_indexes, :] = -float("inf")
+        logits[select_indexes, bos_token_id] = 0
+
+    return logits
+
+
+@register_logits_processor("forced_eos_token_id")
+def forced_eos_token_processor(
+    logits: torch.Tensor,
+    sequence_lengths: Union[torch.Tensor, List[int]],
+    max_out_lengths: Union[torch.Tensor, List[int]],
+    eos_token_id: Union[int, List[int]],
+):
+    """
+    Enforces the specified token as the last generated token when the maximum output length
+    is reached. Notice that the maximum output lengths for different sequences, even if they're
+    in the same batch, can be different.
+
+    Args:
+        logits (torch.Tensor): [num_seqs, vocab_size] The logits tensor.
+        sequence_lengths (Union[torch.Tensor, List[int]]): The current output length of each sequence.
+        max_out_lengths (Union[torch.Tensor, List[int]]): The maximum output length of each sequence.
+        eos_token_id (Union[int, List[int]]): The forced eos token id(s).
+    """
+    if isinstance(eos_token_id, int):
+        eos_token_id = [eos_token_id]
+    if isinstance(sequence_lengths, torch.Tensor):
+        sequence_lengths = sequence_lengths.tolist()
+    if isinstance(max_out_lengths, torch.Tensor):
+        max_out_lengths = max_out_lengths.tolist()
+
+    select_indexes = []
+    num_sequences = logits.shape[0]
+    sequence_lengths = sequence_lengths[:num_sequences]
+    max_out_lengths = max_out_lengths[:num_sequences]
+    for i, (sequence_length, max_out_length) in enumerate(zip(sequence_lengths, max_out_lengths)):
+        if sequence_length == max_out_length - 1:
+            select_indexes.append(i)
+    if select_indexes:
+        logits[select_indexes, :] = -float("inf")
+        logits[select_indexes, eos_token_id] = 0
+
+    return logits
+
+
+def logits_processor(processor: str, logits, *args):
     """
     do logit process for given logits.

     Args:
         processor(str): the type of logit processor
         logits(torch.Tensor): input logits
-        attrs(dict): attrs of the logit processor

     Returns:
         logits after process
     """
-    if processor not in _LOGIT_PROCESSOR_MAP:
-        return logits
+    if processor not in _LOGITS_PROCESSOR_MAP:
+        logging.warning(f"Unsupported processor {processor}. Fall back to the original logits.")
     else:
-        func = _LOGIT_PROCESSOR_MAP[processor]
-        try:
-            logits = func(logits, attrs)
-        except Exception:
-            return logits
-    return logits
+        func = _LOGITS_PROCESSOR_MAP[processor]
+        logits = func(logits, *args)
+
+    return logits
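
A quick sanity check of the new processor on a toy batch: the sequence sitting one step short of its maximum output length gets its whole row masked to -inf except the EOS position, so sampling can only pick EOS, while the other sequence is left untouched (a standalone sketch; the import path matches the file above):

    import torch

    from colossalai.inference.logit_processors import logits_processor

    logits = torch.zeros(2, 8)
    seq_out_lengths = [4, 2]   # current output lengths of the two sequences
    max_out_lengths = [5, 10]  # seq 0 is one step away from its cap
    eos_token_id = 7

    logits = logits_processor("forced_eos_token_id", logits, seq_out_lengths, max_out_lengths, eos_token_id)

    probs = torch.softmax(logits, dim=-1)
    assert probs[0].argmax().item() == eos_token_id           # seq 0 is forced onto EOS
    assert torch.allclose(probs[1], torch.full((8,), 1 / 8))  # seq 1 keeps a uniform distribution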
