
Commit f78c169

gc-fu and xiangyuT authored
Support running batched inference in underlying changes (vllm-project#6)
* finish changing scheduler
* finish merge
* fix model
* Fix (vllm-project#5)
* fix problems
* fix
* delete unused params
* remove redundant comments

---------

Co-authored-by: Xiangyu Tian <109123695+xiangyuT@users.noreply.github.com>
1 parent 02b4cac commit f78c169

8 files changed (+415 −32 lines)

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+"""Try sending a mocked request to the underlying model execute stage."""
+
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.arg_utils import AsyncEngineArgs
+import pytest
+import asyncio
+
+# This is the model to load for workers
+MODEL_PATH = "/models/vicuna-7b/"
+
+
+"""
+1. Start an AsyncLLMEngine to make sure everything works before we start serving.
+"""
+
+@pytest.mark.asyncio
+async def test_model_execution():
+    # Let's build the engine args.
+    engine_args = AsyncEngineArgs(model=MODEL_PATH,
+                                  tokenizer=MODEL_PATH,
+                                  tokenizer_mode='auto',
+                                  trust_remote_code=False,
+                                  download_dir=None,
+                                  load_format='auto',
+                                  dtype='auto',
+                                  seed=0,
+                                  max_model_len=None,
+                                  worker_use_ray=False,
+                                  pipeline_parallel_size=1,
+                                  tensor_parallel_size=1,
+                                  block_size=16,
+                                  swap_space=16,
+                                  gpu_memory_utilization=0.9,
+                                  max_num_batched_tokens=None,
+                                  max_num_seqs=256,
+                                  disable_log_stats=False,
+                                  revision=None,
+                                  tokenizer_revision=None,
+                                  quantization=None,
+                                  engine_use_ray=False,
+                                  disable_log_requests=True,
+                                  max_log_len=None)
+    # Start the engine and let its background loop run for a few seconds.
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+    engine.start_background_loop()
+    await asyncio.sleep(5)
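
The test above only starts the engine's background loop and sleeps; it never actually submits a request. A natural follow-up (not part of this commit) would be to push a single request through the engine and await its outputs. The sketch below assumes that path works end-to-end in this fork; the prompt, request_id, and sampling values are illustrative, and MODEL_PATH is the same placeholder path used in the test.

import pytest

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams

MODEL_PATH = "/models/vicuna-7b/"  # placeholder path, as in the test above


@pytest.mark.asyncio
async def test_single_request():
    # Hypothetical follow-up test: drive one request end-to-end.
    engine_args = AsyncEngineArgs(model=MODEL_PATH, tokenizer=MODEL_PATH)
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    sampling_params = SamplingParams(temperature=0.0, max_tokens=16)
    final_output = None
    # generate() is an async generator that yields incremental RequestOutput
    # objects as tokens are produced.
    async for request_output in engine.generate("Hello, my name is",
                                                sampling_params,
                                                request_id="test-0"):
        final_output = request_output

    assert final_output is not None and final_output.finished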

vllm/core/scheduler.py

Lines changed: 293 additions & 2 deletions
@@ -61,14 +61,15 @@ def __init__(
         cache_config: CacheConfig,
     ) -> None:
         self.scheduler_config = scheduler_config
-        self.cache_config = cache_config
+        #self.cache_config = cache_config

         self.prompt_limit = min(self.scheduler_config.max_model_len,
                                 self.scheduler_config.max_num_batched_tokens)

         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name="fcfs")
-        # # Create the block space manager.
+        # Create the block space manager.
+        # CO(gc): disable the block_manager
         # self.block_manager = BlockSpaceManager(
         #     block_size=self.cache_config.block_size,
         #     num_gpu_blocks=self.cache_config.num_gpu_blocks,
@@ -270,6 +271,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
         for seq_group in scheduler_outputs.scheduled_seq_groups:
             seq_data: Dict[int, List[SequenceData]] = {}
             block_tables: Dict[int, List[int]] = {}
+
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                 seq_id = seq.seq_id
                 seq_data[seq_id] = seq.data
@@ -391,3 +393,292 @@ def _swap_out(
         blocks_to_swap_out.update(mapping)
         for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
             seq.status = SequenceStatus.SWAPPED
+
+
+
+
+class FixedWindowScheduler:
+
+    def __init__(
+        self,
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+    ) -> None:
+        self.scheduler_config = scheduler_config
+        #self.cache_config = cache_config
+
+        self.prompt_limit = min(self.scheduler_config.max_model_len,
+                                self.scheduler_config.max_num_batched_tokens)
+
+        # Instantiate the scheduling policy.
+        self.policy = PolicyFactory.get_policy(policy_name="fcfs")
+
+        # Sequence groups in the WAITING state.
+        self.waiting: List[SequenceGroup] = []
+        # Sequence groups in the RUNNING state.
+        self.running: List[SequenceGroup] = []
+
+    def add_seq_group(self, seq_group: SequenceGroup) -> None:
+        # Add sequence groups to the waiting queue.
+        self.waiting.append(seq_group)
+
+    def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
+        if isinstance(request_id, str):
+            request_id = (request_id, )
+        request_ids = set(request_id)
+        for state_queue in [self.waiting, self.running]:
+            # We need to reverse the list as we are removing elements
+            # from it as we iterate over it. If we don't do it,
+            # indices will get messed up and we will skip over elements.
+            for seq_group in reversed(state_queue):
+                if seq_group.request_id in request_ids:
+                    # Remove the sequence group from the state queue.
+                    state_queue.remove(seq_group)
+                    for seq in seq_group.get_seqs():
+                        if seq.is_finished():
+                            continue
+                        seq.status = SequenceStatus.FINISHED_ABORTED
+                        self.free_seq(seq)
+                    request_ids.remove(seq_group.request_id)
+                    if not request_ids:
+                        return
+
+    def has_unfinished_seqs(self) -> bool:
+        return self.waiting or self.running
+
+    def get_num_unfinished_seq_groups(self) -> int:
+        return len(self.waiting) + len(self.running)
+
+    def _schedule(self) -> SchedulerOutputs:
+
+        # Fix the current time.
+        now = time.monotonic()
+
+        ignored_seq_groups: List[SequenceGroup] = []
+        scheduled: List[SequenceGroup] = []
+        # The total number of sequences on the fly, including the
+        # requests in the generation phase.
+        num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
+                            for seq_group in self.running)
+        num_batched_tokens = 0
+
+        if self.waiting:
+            # We restrict how many requests can be run using the three limits below.
+            # Co(gc): move waiting requests into the RUNNING state while the limits are not exceeded.
+            # Co(gc): prefill requests are prioritized over decoding-stage requests.
+            while self.waiting:
+                seq_group = self.waiting[0]
+
+                assert seq_group.num_seqs() == 1, (
+                    "Waiting sequence group should have only one prompt "
+                    "sequence.")
+                num_prompt_tokens = seq_group.get_seqs()[0].get_len()
+                if num_prompt_tokens > self.prompt_limit:
+                    logger.warning(
+                        f"Input prompt ({num_prompt_tokens} tokens) is too long"
+                        f" and exceeds limit of {self.prompt_limit}")
+                    for seq in seq_group.get_seqs():
+                        seq.status = SequenceStatus.FINISHED_IGNORED
+                    ignored_seq_groups.append(seq_group)
+                    self.waiting.pop(0)
+                    continue
+
+                # If the sequence group cannot be allocated, stop.
+                # if not self.block_manager.can_allocate(seq_group):
+                #     break
+
+                # If the number of batched tokens exceeds the limit, stop.
+                if (num_batched_tokens + num_prompt_tokens >
+                        self.scheduler_config.max_num_batched_tokens):
+                    break
+
+                # The total number of sequences in the RUNNING state should not
+                # exceed the maximum number of sequences.
+                num_new_seqs = seq_group.get_max_num_running_seqs()
+                if (num_curr_seqs + num_new_seqs >
+                        self.scheduler_config.max_num_seqs):
+                    break
+
+                seq_group = self.waiting.pop(0)
+                for seq in seq_group.get_seqs():
+                    seq.status = SequenceStatus.RUNNING
+                #self._allocate(seq_group)
+                self.running.append(seq_group)
+                num_batched_tokens += num_prompt_tokens
+                num_curr_seqs += num_new_seqs
+                scheduled.append(seq_group)
+
+            print("We have waited sequence_groups")
+
+            scheduler_outputs = SchedulerOutputs(
+                scheduled_seq_groups=scheduled,
+                prompt_run=True,
+                num_batched_tokens=num_batched_tokens,
+                blocks_to_swap_in={},
+                blocks_to_swap_out={},
+                blocks_to_copy={},
+                ignored_seq_groups=ignored_seq_groups,
+            )
+            return scheduler_outputs
+
+        # Now consider all the requests in the decoding stage.
+        self.running = self.policy.sort_by_priority(now, self.running)
+
+        # Each sequence in the generation phase only takes one token slot.
+        # Therefore, the number of batched tokens is equal to the number of
+        # sequences in the RUNNING state.
+        num_batched_tokens = sum(
+            seq_group.num_seqs(status=SequenceStatus.RUNNING)
+            for seq_group in self.running)
+
+        scheduler_outputs = SchedulerOutputs(
+            scheduled_seq_groups=self.running,
+            prompt_run=False,
+            num_batched_tokens=num_batched_tokens,
+            blocks_to_swap_in={},
+            blocks_to_swap_out={},
+            blocks_to_copy={},
+            ignored_seq_groups=[],
+        )
+        return scheduler_outputs
+
+    def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
+        # Schedule sequence groups.
+        # This function call changes the internal states of the scheduler
+        # such as self.running, self.swapped, and self.waiting.
+        scheduler_outputs = self._schedule()
+
+        # Create input data structures.
+        seq_group_metadata_list: List[SequenceGroupMetadata] = []
+        for seq_group in scheduler_outputs.scheduled_seq_groups:
+            seq_data: Dict[int, List[SequenceData]] = {}
+            block_tables: Dict[int, List[int]] = {}
+            print("Here we print the length of the seq_groups")
+            print(len(seq_group.get_seqs()))
+            print("The following sequences are scheduled")
+            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+                seq_id = seq.seq_id
+                seq_data[seq_id] = seq.data
+                #block_tables[seq_id] = self.block_manager.get_block_table(seq)
+
+            seq_group_metadata = SequenceGroupMetadata(
+                request_id=seq_group.request_id,
+                is_prompt=scheduler_outputs.prompt_run,
+                seq_data=seq_data,
+                sampling_params=seq_group.sampling_params,
+                block_tables=block_tables,
+            )
+            print(seq_group_metadata.seq_data.keys())
+            seq_group_metadata_list.append(seq_group_metadata)
+        return seq_group_metadata_list, scheduler_outputs
+
+    def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
+        self.block_manager.fork(parent_seq, child_seq)
+
+    def free_seq(self, seq: Sequence) -> None:
+        self.block_manager.free(seq)
+
+    def free_finished_seq_groups(self) -> None:
+        for seq_group in self.running:
+            if seq_group.is_finished():
+                print("A finished seq_group")
+                print(seq_group)
+        self.running = [
+            seq_group for seq_group in self.running
+            if not seq_group.is_finished()
+        ]
+
+    def _allocate(self, seq_group: SequenceGroup) -> None:
+        self.block_manager.allocate(seq_group)
+        for seq in seq_group.get_seqs():
+            seq.status = SequenceStatus.RUNNING
+
+    def _append_slot(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> None:
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+            ret = self.block_manager.append_slot(seq)
+            if ret is not None:
+                src_block, dst_block = ret
+                if src_block in blocks_to_copy:
+                    blocks_to_copy[src_block].append(dst_block)
+                else:
+                    blocks_to_copy[src_block] = [dst_block]
+
+    def _preempt(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_out: Dict[int, int],
+        preemption_mode: Optional[PreemptionMode] = None,
+    ) -> None:
+        # If preemption mode is not specified, we determine the mode as follows:
+        # We use recomputation by default since it incurs lower overhead than
+        # swapping. However, when the sequence group has multiple sequences
+        # (e.g., beam search), recomputation is not currently supported. In
+        # such a case, we use swapping instead.
+        # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
+        # As swapped sequences are prioritized over waiting sequences,
+        # sequence groups with multiple sequences are implicitly prioritized
+        # over sequence groups with a single sequence.
+        # TODO(woosuk): Support recomputation for sequence groups with multiple
+        # sequences. This may require a more sophisticated CUDA kernel.
+        if preemption_mode is None:
+            if seq_group.get_max_num_running_seqs() == 1:
+                preemption_mode = PreemptionMode.RECOMPUTE
+            else:
+                preemption_mode = PreemptionMode.SWAP
+        if preemption_mode == PreemptionMode.RECOMPUTE:
+            self._preempt_by_recompute(seq_group)
+        elif preemption_mode == PreemptionMode.SWAP:
+            self._preempt_by_swap(seq_group, blocks_to_swap_out)
+        else:
+            assert False, "Invalid preemption mode."
+
+    def _preempt_by_recompute(
+        self,
+        seq_group: SequenceGroup,
+    ) -> None:
+        seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
+        assert len(seqs) == 1
+        for seq in seqs:
+            seq.status = SequenceStatus.WAITING
+            self.block_manager.free(seq)
+        # NOTE: For FCFS, we insert the preempted sequence group to the front
+        # of the waiting queue.
+        self.waiting.insert(0, seq_group)
+
+    def _preempt_by_swap(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_out: Dict[int, int],
+    ) -> None:
+        self._swap_out(seq_group, blocks_to_swap_out)
+        self.swapped.append(seq_group)
+
+    def _swap_in(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_in: Dict[int, int],
+    ) -> None:
+        mapping = self.block_manager.swap_in(seq_group)
+        blocks_to_swap_in.update(mapping)
+        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
+            seq.status = SequenceStatus.RUNNING
+
+    def _swap_out(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_out: Dict[int, int],
+    ) -> None:
+        if not self.block_manager.can_swap_out(seq_group):
+            # FIXME(woosuk): Abort the sequence group instead of aborting the
+            # entire engine.
+            raise RuntimeError(
+                "Aborted due to the lack of CPU swap space. Please increase "
+                "the swap space to avoid this error.")
+        mapping = self.block_manager.swap_out(seq_group)
+        blocks_to_swap_out.update(mapping)
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+            seq.status = SequenceStatus.SWAPPED
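
For orientation, here is a rough sketch of how an engine-side caller might drive the new FixedWindowScheduler, based only on the methods added in this hunk. The driver function and its name are illustrative, not part of the commit, and the model-execution step is elided.

from typing import List

from vllm.core.scheduler import FixedWindowScheduler
from vllm.sequence import SequenceGroup


def drive_scheduler(scheduler: FixedWindowScheduler,
                    seq_groups: List[SequenceGroup]) -> None:
    # Enqueue new requests; they land in the WAITING queue.
    for seq_group in seq_groups:
        scheduler.add_seq_group(seq_group)

    while scheduler.has_unfinished_seqs():
        # _schedule() first drains the WAITING queue as a prefill batch
        # (prompt_run=True); once nothing is waiting, it batches every
        # RUNNING group for one decoding step (prompt_run=False).
        seq_group_metadata_list, scheduler_outputs = scheduler.schedule()

        # ... run the model on seq_group_metadata_list and append the sampled
        # tokens to the RUNNING sequences (elided) ...

        # Drop sequence groups that finished during this step.
        scheduler.free_finished_seq_groups()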

vllm/engine/arg_utils.py

Lines changed: 1 addition & 0 deletions
@@ -185,6 +185,7 @@ def create_engine_configs(
                                    self.dtype, self.seed, self.revision,
                                    self.tokenizer_revision, self.max_model_len,
                                    self.quantization)
+        # gc-TODO: disable cache_config later
         cache_config = CacheConfig(
             self.block_size, self.gpu_memory_utilization, self.swap_space,
             getattr(model_config.hf_config, 'sliding_window', None))
