
Commit dd60db0

More comments (vllm-project#11)
* Comments done above worker
* format
* fixed missing arguments
* fix
* format
1 parent 2497a14 commit dd60db0

File tree

8 files changed: 241 additions & 153 deletions


tests/under_models/send_mock_request.py

Lines changed: 99 additions & 26 deletions
@@ -15,25 +15,35 @@
 import asyncio
 
 # This is the model to load for workers
-MODEL_PATH="/models/vicuna-7b/"
-
-
+MODEL_PATH = "YOUR_MODEL_PATH"
 """
 1. Prepare a faked sequencegroup meta data
 2. Start a mocked AsyncLLMEngine, and modify its step_async function
 3. invoke the step_async function manually
+4. this test tries to kick off the `model_execution` part for the
+   model so that we can perform tests
 """
 
+
 class UglyAsyncLLMEngine(LLMEngine):
     """Extension of LLMEngine to add async methods."""
 
     async def step_async(self) -> List[RequestOutput]:
-        sampling_para = SamplingParams(n=2, best_of=5, temperature=0.8, top_p=0.95, max_tokens=7)
+        sampling_para = SamplingParams(n=2,
+                                       best_of=5,
+                                       temperature=0.8,
+                                       top_p=0.95,
+                                       max_tokens=7)
         seq_data = {}
         seq_data[0] = SequenceData(prompt_token_ids=[1, 3087, 8970, 338, 263])
         request_id = "cmpl-7bef75eaa4394a3d895b5508dd5f69f6"
 
-        seq_group_meta_data = SequenceGroupMetadata(request_id=request_id, is_prompt=True, seq_data=seq_data, sampling_params=sampling_para, block_tables={})
+        seq_group_meta_data = SequenceGroupMetadata(
+            request_id=request_id,
+            is_prompt=True,
+            seq_data=seq_data,
+            sampling_params=sampling_para,
+            block_tables={})
         seq_group_meta_data_lists = [seq_group_meta_data]
 
         output = await self._run_workers_async(
@@ -44,24 +54,66 @@ async def step_async(self) -> List[RequestOutput]:
             blocks_to_copy={},
             finished_seqs=[],
         )
-        print(output)
 
-        # TODO: change this to real one
-        return RequestOutput(request_id=request_id, prompt="", prompt_token_ids=[1, 3087, 8970, 338, 263], outputs=[], finished=False)
+        # Co(gc): we cannot use the real one as it contains private methods that cannot be invoked
+        return RequestOutput(request_id=request_id,
+                             prompt="",
+                             prompt_token_ids=[1, 3087, 8970, 338, 263],
+                             outputs=[],
+                             finished=False)
 
     async def step_async_multiple(self) -> List[RequestOutput]:
+        """
+        Same but send two requests in a batch
+        """
         seq_group_metadata_lists = []
-        request_id_0= "cmpl-81e2b9767b5b47bca7e649482698d385"
-        seq_data_0 = {0: SequenceData(prompt_token_ids=[1, 3087, 8970, 338, 263])}
-        sampling_params_0 = SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, temperature=0.0, top_p=1.0, top_k=-1, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=[], ignore_eos=False, max_tokens=7, logprobs=None, skip_special_tokens=True)
-
-        seq_group_metadata_lists.append(SequenceGroupMetadata(request_id_0, True, seq_data_0, sampling_params_0, {}))
+        request_id_0 = "cmpl-81e2b9767b5b47bca7e649482698d385"
+        seq_data_0 = {
+            0: SequenceData(prompt_token_ids=[1, 3087, 8970, 338, 263])
+        }
+        sampling_params_0 = SamplingParams(n=1,
+                                           best_of=1,
+                                           presence_penalty=0.0,
+                                           frequency_penalty=0.0,
+                                           temperature=0.0,
+                                           top_p=1.0,
+                                           top_k=-1,
+                                           use_beam_search=False,
+                                           length_penalty=1.0,
+                                           early_stopping=False,
+                                           stop=[],
+                                           ignore_eos=False,
+                                           max_tokens=7,
+                                           logprobs=None,
+                                           skip_special_tokens=True)
+
+        seq_group_metadata_lists.append(
+            SequenceGroupMetadata(request_id_0, True, seq_data_0,
+                                  sampling_params_0, {}))
 
         request_id_1 = "cmpl-81e2b9767b5b47bca7e649482698d385"
-        seq_data_1 = {1: SequenceData(prompt_token_ids=[1, 3087, 8970, 338, 263])}
-        sampling_params_1 = SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, temperature=0.0, top_p=1.0, top_k=-1, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=[], ignore_eos=False, max_tokens=7, logprobs=None, skip_special_tokens=True)
-
-        seq_group_metadata_lists.append(SequenceGroupMetadata(request_id_1, True, seq_data_1, sampling_params_1, {}))
+        seq_data_1 = {
+            1: SequenceData(prompt_token_ids=[1, 3087, 8970, 338, 263])
+        }
+        sampling_params_1 = SamplingParams(n=1,
+                                           best_of=1,
+                                           presence_penalty=0.0,
+                                           frequency_penalty=0.0,
+                                           temperature=0.0,
+                                           top_p=1.0,
+                                           top_k=-1,
+                                           use_beam_search=False,
+                                           length_penalty=1.0,
+                                           early_stopping=False,
+                                           stop=[],
+                                           ignore_eos=False,
+                                           max_tokens=7,
+                                           logprobs=None,
+                                           skip_special_tokens=True)
+
+        seq_group_metadata_lists.append(
+            SequenceGroupMetadata(request_id_1, True, seq_data_1,
+                                  sampling_params_1, {}))
 
         output = await self._run_workers_async(
             "execute_model",
@@ -72,9 +124,11 @@ async def step_async_multiple(self) -> List[RequestOutput]:
             finished_seqs=[],
         )
 
-        # TODO: change this to real one
-        return RequestOutput(request_id=request_id_0, prompt="", prompt_token_ids=[1, 3087, 8970, 338, 263], outputs=[], finished=False)
-
+        return RequestOutput(request_id=request_id_0,
+                             prompt="",
+                             prompt_token_ids=[1, 3087, 8970, 338, 263],
+                             outputs=[],
+                             finished=False)
 
     async def _run_workers_async(
         self,
@@ -106,13 +160,36 @@ async def _run_workers_async(
             assert output == other_output
         return output
 
+
 setattr(AsyncLLMEngine, "_engine_class", UglyAsyncLLMEngine)
 
 
 @pytest.mark.asyncio
 async def test_model_execution():
-    # Let's build an engine_args
-    engine_args = AsyncEngineArgs(model='/models/vicuna-7b/', tokenizer='/models/vicuna-7b/', tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='dummy', dtype='auto', seed=0, max_model_len=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, block_size=16, swap_space=16, gpu_memory_utilization=0.9, max_num_batched_tokens=None, max_num_seqs=256, disable_log_stats=False, revision=None, tokenizer_revision=None, quantization=None, engine_use_ray=False, disable_log_requests=True, max_log_len=None)
+    # Let's build an engine_args
+    engine_args = AsyncEngineArgs(model=MODEL_PATH,
+                                  tokenizer=MODEL_PATH,
+                                  tokenizer_mode='auto',
+                                  trust_remote_code=False,
+                                  download_dir=None,
+                                  dtype='auto',
+                                  seed=0,
+                                  max_model_len=None,
+                                  worker_use_ray=False,
+                                  pipeline_parallel_size=1,
+                                  tensor_parallel_size=1,
+                                  block_size=16,
+                                  swap_space=16,
+                                  gpu_memory_utilization=0.9,
+                                  max_num_batched_tokens=None,
+                                  max_num_seqs=256,
+                                  disable_log_stats=False,
+                                  revision=None,
+                                  tokenizer_revision=None,
+                                  quantization=None,
+                                  engine_use_ray=False,
+                                  disable_log_requests=True,
+                                  max_log_len=None)
     # Start the engine
     engine = AsyncLLMEngine.from_engine_args(engine_args)
 
@@ -121,7 +198,3 @@ async def test_model_execution():
     await engine.engine.step_async()
     # Now let's try something difficult
     await engine.engine.step_async_multiple()
-
-
-
-

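Note: the test above mocks step_async so that a hand-built SequenceGroupMetadata is pushed straight into the workers' execute_model call, and it is driven by pytest-asyncio. A minimal sketch of driving the same coroutine without pytest (assuming the repository root is on PYTHONPATH and MODEL_PATH points at a real checkpoint; this driver script is not part of the commit):

import asyncio

# Hypothetical driver: reuses the test coroutine defined in the file above.
from tests.under_models.send_mock_request import test_model_execution

if __name__ == "__main__":
    # pytest-asyncio normally supplies the event loop; here we use a plain one.
    asyncio.run(test_model_execution())
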
tests/upper_frontends/test_start_async_llm_engine.py

Lines changed: 28 additions & 8 deletions
@@ -1,26 +1,46 @@
-"""Try sending a mocked request to the underlying model execute stage"""
+"""Try start the AsyncLLMEngine"""
 
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs
 import pytest
 import asyncio
 
 # This is the model to load for workers
-MODEL_PATH="/models/vicuna-7b/"
-
-
+MODEL_PATH = "YOUR_MODEL_PATH"
 """
 1. Test to start a AsyncLLMEngine, to ensure that all goes well before start serving.
 """
 
+
 @pytest.mark.asyncio
 async def test_model_execution():
-    # Let's build an engine_args
-    engine_args = AsyncEngineArgs(model='/models/vicuna-7b/', tokenizer='/models/vicuna-7b/', tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='auto', dtype='auto', seed=0, max_model_len=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, block_size=16, swap_space=16, gpu_memory_utilization=0.9, max_num_batched_tokens=None, max_num_seqs=256, disable_log_stats=False, revision=None, tokenizer_revision=None, quantization=None, engine_use_ray=False, disable_log_requests=True, max_log_len=None)
+    # Let's build an engine_args
+    engine_args = AsyncEngineArgs(model=MODEL_PATH,
+                                  tokenizer=MODEL_PATH,
+                                  tokenizer_mode='auto',
+                                  trust_remote_code=False,
+                                  download_dir=None,
+                                  load_format='auto',
+                                  dtype='auto',
+                                  seed=0,
+                                  max_model_len=None,
+                                  worker_use_ray=False,
+                                  pipeline_parallel_size=1,
+                                  tensor_parallel_size=1,
+                                  block_size=16,
+                                  swap_space=16,
+                                  gpu_memory_utilization=0.9,
+                                  max_num_batched_tokens=None,
+                                  max_num_seqs=256,
+                                  disable_log_stats=False,
+                                  revision=None,
+                                  tokenizer_revision=None,
+                                  quantization=None,
+                                  engine_use_ray=False,
+                                  disable_log_requests=True,
+                                  max_log_len=None)
     # Start the engine
     engine = AsyncLLMEngine.from_engine_args(engine_args)
 
     engine.start_background_loop()
     await asyncio.sleep(5)
-
-

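Note: this test only verifies that the engine comes up and its background loop survives for a few seconds; no request is submitted. For reference, once the loop is running a request would normally go through the engine's async generate API; a rough sketch (the generate(prompt, sampling_params, request_id) signature is assumed from upstream vLLM and is not exercised by this commit):

from vllm import SamplingParams

async def send_one_prompt(engine) -> None:
    # Stream partial RequestOutputs for a single prompt; parameters mirror the tests above.
    params = SamplingParams(temperature=0.0, max_tokens=7)
    async for request_output in engine.generate("Hello, my name is",
                                                params,
                                                request_id="smoke-test-0"):
        print(request_output.outputs)
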
vllm/core/scheduler.py

Lines changed: 31 additions & 20 deletions
@@ -64,20 +64,19 @@ def __init__(
         cache_config: CacheConfig,
     ) -> None:
         self.scheduler_config = scheduler_config
-        #self.cache_config = cache_config
+        self.cache_config = cache_config
 
         self.prompt_limit = min(self.scheduler_config.max_model_len,
                                 self.scheduler_config.max_num_batched_tokens)
 
         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name="fcfs")
         # Create the block space manager.
-        # CO(gc): disable the block_manager
-        # self.block_manager = BlockSpaceManager(
-        #     block_size=self.cache_config.block_size,
-        #     num_gpu_blocks=self.cache_config.num_gpu_blocks,
-        #     num_cpu_blocks=self.cache_config.num_cpu_blocks,
-        #     sliding_window=self.cache_config.sliding_window)
+        self.block_manager = BlockSpaceManager(
+            block_size=self.cache_config.block_size,
+            num_gpu_blocks=self.cache_config.num_gpu_blocks,
+            num_cpu_blocks=self.cache_config.num_cpu_blocks,
+            sliding_window=self.cache_config.sliding_window)
 
         # TODO(zhuohan): Use deque instead of list for better performance.
         # Sequence groups in the WAITING state.
@@ -188,6 +187,8 @@ def _schedule(self) -> SchedulerOutputs:
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy,
             ignored_seq_groups=ignored_seq_groups,
+            # Co(gc): not used
+            finished_seqs=[],
         )
         return scheduler_outputs
 
@@ -260,6 +261,8 @@ def _schedule(self) -> SchedulerOutputs:
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy,
             ignored_seq_groups=[],
+            # Co(gc): not used
+            finished_seqs=[],
         )
         return scheduler_outputs
 
@@ -398,16 +401,24 @@ def _swap_out(
             seq.status = SequenceStatus.SWAPPED
 
 
-
-
 class FixedWindowScheduler:
 
     def __init__(
         self,
         scheduler_config: SchedulerConfig,
         cache_config: CacheConfig,
    ) -> None:
+        """
+        Co(gc): A fixed window scheduler; the request limit is controlled entirely by SchedulerConfig.
+        We disable the block_manager in this class, which means we cannot know whether a request will
+        cause OOM in the backend worker.
+        To enable that, we would need the support of a PageTable, which in turn needs the relevant
+        CUDA functions in ./csrc/cache_kernels.cu implemented using oneAPI? So here's a TODO for you.
+        TODO: Write a block manager so that we can have fine-grained batch control
+
+        """
         self.scheduler_config = scheduler_config
+        # Co(gc): disable the cache_config as we are not using it
         #self.cache_config = cache_config
 
         self.prompt_limit = min(self.scheduler_config.max_model_len,
@@ -416,11 +427,14 @@ def __init__(
         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name="fcfs")
 
+        # Co(gc): disable the block manager
+
         # Sequence groups in the WAITING state.
         self.waiting: List[SequenceGroup] = []
         # Sequence groups in the RUNNING state.
         self.running: List[SequenceGroup] = []
         self.cleaned: List[int] = []
+        # Co(gc): We no longer have the swapped space as we are not deciding which to swap
 
     def add_seq_group(self, seq_group: SequenceGroup) -> None:
         # Add sequence groups to the waiting queue.
@@ -461,7 +475,7 @@ def _schedule(self) -> SchedulerOutputs:
         ignored_seq_groups: List[SequenceGroup] = []
         scheduled: List[SequenceGroup] = []
         finished_seqs: List[int] = self.cleaned.copy()
-        self.cleaned=[]
+        self.cleaned = []
         # The total number of sequences on the fly, including the
         # requests in the generation phase.
         num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
@@ -489,6 +503,7 @@ def _schedule(self) -> SchedulerOutputs:
                 self.waiting.pop(0)
                 continue
 
+            # TODO(gc): If you can manage to make block_manager work, then this will be fine.
             # If the sequence group cannot be allocated, stop.
             # if not self.block_manager.can_allocate(seq_group):
             #     break
@@ -508,14 +523,13 @@ def _schedule(self) -> SchedulerOutputs:
             seq_group = self.waiting.pop(0)
             for seq in seq_group.get_seqs():
                 seq.status = SequenceStatus.RUNNING
+            # TODO(gc): same here
             #self._allocate(seq_group)
             self.running.append(seq_group)
             num_batched_tokens += num_prompt_tokens
             num_curr_seqs += num_new_seqs
             scheduled.append(seq_group)
 
-            print("We have waited sequence_groups")
-
         scheduler_outputs = SchedulerOutputs(
             scheduled_seq_groups=scheduled,
             prompt_run=True,
@@ -561,9 +575,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
         for seq_group in scheduler_outputs.scheduled_seq_groups:
             seq_data: Dict[int, List[SequenceData]] = {}
             block_tables: Dict[int, List[int]] = {}
-            print("Here we print the length of the seq_groups")
-            print(len(seq_group.get_seqs()))
-            print("The following sequences are scheduled")
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                 seq_id = seq.seq_id
                 seq_data[seq_id] = seq.data
@@ -576,7 +587,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
                 sampling_params=seq_group.sampling_params,
                 block_tables=block_tables,
             )
-            print(seq_group_metadata.seq_data.keys())
             seq_group_metadata_list.append(seq_group_metadata)
         return seq_group_metadata_list, scheduler_outputs
 
@@ -588,10 +598,11 @@ def free_seq(self, seq: Sequence) -> None:
         self.cleaned.append(seq.seq_id)
 
     def free_finished_seq_groups(self) -> None:
-        for seq_group in self.running:
-            if seq_group.is_finished():
-                print("A finished seq_group")
-                print(seq_group)
+        # Co(gc): just some debug statements
+        # for seq_group in self.running:
+        #     if seq_group.is_finished():
+        #         print("A finished seq_group")
+        #         print(seq_group)
         self.running = [
             seq_group for seq_group in self.running
             if not seq_group.is_finished()

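Note: the FixedWindowScheduler introduced above admits sequence groups using only the SchedulerConfig limits, since its block manager is disabled, and its docstring leaves a TODO for a block manager to regain fine-grained batch control. A minimal sketch of the kind of KV-cache budget check such a manager could restore (the SimpleBlockBudget class below is hypothetical and not part of this commit):

class SimpleBlockBudget:
    """Hypothetical coarse KV-cache budget: counts whole blocks per sequence,
    ignoring block sharing, swapping, and copy-on-write."""

    def __init__(self, block_size: int, num_gpu_blocks: int) -> None:
        self.block_size = block_size
        self.free_blocks = num_gpu_blocks

    def _blocks_needed(self, prompt_len: int, max_tokens: int) -> int:
        # Worst case: prompt plus every generated token, rounded up to whole blocks.
        total_tokens = prompt_len + max_tokens
        return -(-total_tokens // self.block_size)  # ceiling division

    def can_allocate(self, prompt_len: int, max_tokens: int) -> bool:
        return self._blocks_needed(prompt_len, max_tokens) <= self.free_blocks

    def allocate(self, prompt_len: int, max_tokens: int) -> None:
        assert self.can_allocate(prompt_len, max_tokens)
        self.free_blocks -= self._blocks_needed(prompt_len, max_tokens)

    def free(self, prompt_len: int, max_tokens: int) -> None:
        self.free_blocks += self._blocks_needed(prompt_len, max_tokens)

A real implementation would also need per-sequence block tables and swap-in/swap-out handling, which is what the BlockSpaceManager re-enabled in the Scheduler class above provides.
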
vllm/engine/arg_utils.py

Lines changed: 0 additions & 1 deletion
@@ -185,7 +185,6 @@ def create_engine_configs(
             self.dtype, self.seed, self.revision,
             self.tokenizer_revision, self.max_model_len,
             self.quantization)
-        # gc-TODO: disable cache_config later
         cache_config = CacheConfig(
             self.block_size, self.gpu_memory_utilization, self.swap_space,
             getattr(model_config.hf_config, 'sliding_window', None))
