Supporting embedding models #3187

Open
wants to merge 26 commits into base: main
Changes from 10 commits
Commits (26):
31c2cdd  added OpenAI embedded model API call (jc9123, Feb 9, 2024)
b81344b  added openai embedded api call (jc9123, Feb 9, 2024)
9f74015  added openai embedded api call (jc9123, Feb 9, 2024)
be10d8c  save (jc9123, Feb 29, 2024)
21633a4  added embedded model (jc9123, Mar 4, 2024)
d84df2e  Merge branch 'main' into supporting_embedded_model (jc9123, Mar 4, 2024)
cf308fd  fixed tokenizing and cleaned up code for bgem3 (jc9123, Mar 19, 2024)
bdfd0ba  removed prompt dict (jc9123, Mar 19, 2024)
efaa551  Merge remote-tracking branch 'upstream/main' into add_embedded_model (jc9123, Mar 19, 2024)
f5810d5  fixed merge conflict (jc9123, Mar 19, 2024)
5fb0a61  added xlm roberta scaffold (robertgshaw2-neuralmagic, Mar 24, 2024)
a3cc834  fixed nits (robertgshaw2-neuralmagic, Mar 24, 2024)
6d41610  fixed nit again (robertgshaw2-neuralmagic, Mar 24, 2024)
f64eddb  fixed nits again again (robertgshaw2-neuralmagic, Mar 24, 2024)
8064631  fixed nits again again again (robertgshaw2-neuralmagic, Mar 24, 2024)
7296a4c  newline on Attention (robertgshaw2-neuralmagic, Mar 24, 2024)
a12170c  Merge pull request #1 from neuralmagic/rs/embedding-step-1 (jc9123, Mar 24, 2024)
e9c4d77  finished checkpoint 1.a (jc9123, Apr 10, 2024)
f71981c  cleaned up checkpoint 1.a (jc9123, Apr 10, 2024)
9f37dc0  added embedded_model registry (jc9123, Apr 10, 2024)
b47f3a2  code cleanup & bugfix (jc9123, Apr 12, 2024)
619167b  more cleanup (jc9123, Apr 12, 2024)
b8e2adc  more cleanup (jc9123, Apr 12, 2024)
4c532fd  save (jc9123, Apr 13, 2024)
0f194af  fixed xlm_roberta (jc9123, Apr 13, 2024)
5923b4d  code cleanup (jc9123, Apr 15, 2024)

2 changes: 1 addition & 1 deletion examples/offline_inference.py
@@ -19,4 +19,4 @@
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
-   print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+   print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Collaborator:
This file should not be touched

1 change: 1 addition & 0 deletions requirements.txt
@@ -13,5 +13,6 @@ pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
pynvml == 11.5.0
triton >= 2.1.0
datasets >= 2.0.0
Collaborator:
This should not be a dependency. If you need it for testing, add it to requirements-dev

outlines == 0.0.34
cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead.
34 changes: 34 additions & 0 deletions tests/models/test_embedded.py
@@ -0,0 +1,34 @@
from vllm import LLM, SamplingParams
import numpy as np

# Sample prompts.
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = [
    "BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
    "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"
]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
# llm = LLM(model="facebook/opt-125m")
llm = LLM(model="BAAI/bge-m3", enforce_eager=True, embedded_model=True)

# Generate embeddings from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, the embedding, and other information.
outputs1 = llm.generate(sentences_1, sampling_params)

lst1 = []
for output1 in outputs1:
    generated_text = output1.embed.cpu()
    lst1.append(np.array(generated_text))
lst1 = np.array(lst1)

outputs2 = llm.generate(sentences_2, sampling_params)

lst2 = []
for output2 in outputs2:
    prompt = output2.prompt
    generated_text = output2.embed.cpu()
    lst2.append(np.array(generated_text))
lst2 = np.array(lst2)

result = lst1 @ lst2.T
expected_result = np.array([[0.6265, 0.3477], [0.3499, 0.678]])

assert np.isclose(result, expected_result, atol=1e-2).all()
print("Passed!")
1 change: 1 addition & 0 deletions tests/spec_decode/utils.py
@@ -128,6 +128,7 @@ def create_worker(cls: type,

cache_config.num_gpu_blocks = num_gpu_blocks
cache_config.num_cpu_blocks = 0

Collaborator:
This file should not be touched

worker.init_cache_engine(cache_config)
worker.warm_up_model()

1 change: 0 additions & 1 deletion vllm/core/scheduler.py
@@ -370,7 +370,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:

seq_data: Dict[int, SequenceData] = {}
block_tables: Dict[int, List[int]] = {}

for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
Collaborator:
this file should not be touched

seq_id = seq.seq_id
seq_data[seq_id] = seq.data
1 change: 1 addition & 0 deletions vllm/engine/arg_utils.py
@@ -52,6 +52,7 @@ class EngineArgs:
max_cpu_loras: Optional[int] = None
device: str = 'auto'
ray_workers_use_nsight: bool = False
embedded_model : bool = False
Collaborator:
let's use the term embedding_model to match the terminology used by OAI

https://platform.openai.com/docs/api-reference/embeddings


def __post_init__(self):
if self.tokenizer is None:
23 changes: 14 additions & 9 deletions vllm/engine/llm_engine.py
@@ -86,7 +86,6 @@
f"device_config={device_config.device}, "
f"seed={model_config.seed})")
# TODO(woosuk): Print more configs in debug mode.

Collaborator:
nit - no need to change this

self.model_config = model_config
self.cache_config = cache_config
self.lora_config = lora_config
@@ -375,6 +374,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
parent_seq.seq_id: []
for parent_seq in parent_seqs
}

Collaborator:
nit - no need to change this

for sample in samples:
parent_child_dict[sample.parent_seq_id].append(sample)
# List of (child, parent)
@@ -536,18 +536,23 @@ def _process_model_outputs(
        # Update the scheduled sequence groups with the model outputs.
        scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups

        # If prefix caching is enabled, mark all blocks in the sequence groups
        # as completed so that future requests don't attempt to recompute them
        if self.embedded_model:
Collaborator:
All of the logic for making stopping decisions occurs in llm_engine.check_stop.

This is correct logic (in that each sequence only needs to run once), but please move it into the check_stop function (see the sketch after this file's diff).

            for i, seq_group in enumerate(scheduled_seq_groups):
                for seq in seq_group.get_seqs():
                    seq.status = SequenceStatus.FINISHED_STOPPED
                seq_group.embed = output[i]
        else:
            for seq_group, outputs in zip(scheduled_seq_groups, output):
                self._process_sequence_group_outputs(seq_group, outputs)

        if self.cache_config.enable_prefix_caching:
            for seq_group in scheduled_seq_groups:
                self.scheduler.mark_blocks_as_computed(seq_group)

        for seq_group, outputs in zip(scheduled_seq_groups, output):
            self._process_sequence_group_outputs(seq_group, outputs)


        # Free the finished sequence groups.
        self.scheduler.free_finished_seq_groups()

        request_outputs: List[RequestOutput] = []
Collaborator:
nit - duplicate

        # Create the outputs.
        request_outputs: List[RequestOutput] = []
        for seq_group in scheduled_seq_groups:
@@ -561,7 +566,7 @@
        # Log stats.
        if self.log_stats:
            self.stat_logger.log(self._get_stats(scheduler_outputs))

Collaborator:
nit - do not change this

return request_outputs

def step(self) -> List[RequestOutput]:
@@ -624,7 +629,7 @@ def step(self) -> List[RequestOutput]:
scheduler_outputs.blocks_to_copy)
else:
output = []

Collaborator:
nit

return self._process_model_outputs(output, scheduler_outputs)

def do_log_stats(self) -> None:
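A minimal sketch of what the reviewer's check_stop suggestion could look like. The method and attribute names (_check_stop, embedding_model) are assumptions for illustration, not code from this PR:

from vllm.sampling_params import SamplingParams
from vllm.sequence import Sequence, SequenceStatus

class LLMEngine:  # illustrative subset only
    embedding_model: bool = False

    def _check_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None:
        # An embedding model produces its output in a single forward pass, so
        # the sequence is finished as soon as it has been scheduled once.
        if self.embedding_model:
            seq.status = SequenceStatus.FINISHED_STOPPED
            return
        # ... the existing stop checks (stop strings, EOS token, max_tokens)
        #     would follow here unchanged ...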
3 changes: 3 additions & 0 deletions vllm/entrypoints/llm.py
@@ -84,6 +84,7 @@
enforce_eager: bool = False,
max_context_len_to_capture: int = 8192,
disable_custom_all_reduce: bool = False,
embedded_model: bool = False,
**kwargs,
) -> None:
if "disable_log_stats" not in kwargs:
@@ -104,9 +105,11 @@
enforce_eager=enforce_eager,
max_context_len_to_capture=max_context_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
embedded_model = embedded_model,
**kwargs,
)
self.llm_engine = LLMEngine.from_engine_args(engine_args)
self.llm_engine.embedded_model = embedded_model
Collaborator:
We should not directly set the embedding_model member here.

Instead, update the EngineArgs to have the embedding_model member. Then, LLMEngine.from_engine_args should set the member (see the sketch after this file's diff).

self.request_counter = Counter()

def get_tokenizer(
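A minimal sketch of the wiring the reviewer asks for, assuming the flag is renamed to embedding_model and lives on EngineArgs; the classes below are simplified stand-ins, not the PR's code:

from dataclasses import dataclass

@dataclass
class EngineArgs:  # illustrative subset of the real EngineArgs
    model: str
    embedding_model: bool = False  # renamed from embedded_model per the review

class LLMEngine:  # illustrative subset of the real LLMEngine
    def __init__(self, model: str, embedding_model: bool) -> None:
        self.embedding_model = embedding_model

    @classmethod
    def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":
        # The engine reads the flag from the args itself, so callers never
        # mutate llm_engine.embedding_model after construction.
        return cls(model=engine_args.model,
                   embedding_model=engine_args.embedding_model)

# LLM.__init__ would then only build EngineArgs and hand them over:
engine = LLMEngine.from_engine_args(
    EngineArgs(model="BAAI/bge-m3", embedding_model=True))
assert engine.embedding_model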
12 changes: 12 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -112,6 +112,18 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
return JSONResponse(content=generator.model_dump())


@app.post("/v1/embeddings")
Collaborator:
I assume this does not work yet?

Collaborator:
Remove the OAI server edits from this PR.

We will handle this in a future step

async def create_embeddings(request: EmbeddingRequest):

    ## need to implement
    generator = await openai_serving_completion.create_completion()
    if isinstance(generator, ErrorResponse):
        return JSONResponse(content=generator.model_dump(),
                            status_code=generator.code)
    else:
        return JSONResponse(content=generator.model_dump())


if __name__ == "__main__":
args = parse_args()

5 changes: 5 additions & 0 deletions vllm/entrypoints/openai/protocol.py
@@ -19,6 +19,11 @@ class ErrorResponse(BaseModel):
    code: int


class EmbeddingRequest(BaseModel):
    input: str
    model: str


class ModelPermission(BaseModel):
    id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
    object: str = "model_permission"
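The PR only defines the request schema; below is a rough sketch of the response schema an OpenAI-compatible /v1/embeddings endpoint would eventually need. The class and field names are assumptions modelled on the OpenAI embeddings API, not part of this diff:

from typing import List

from pydantic import BaseModel

class EmbeddingData(BaseModel):
    # One embedding vector per input item, following the OpenAI response shape.
    object: str = "embedding"
    embedding: List[float]
    index: int

class EmbeddingUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int

class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
    usage: EmbeddingUsage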
4 changes: 4 additions & 0 deletions vllm/entrypoints/openai/serving_completion.py
@@ -377,3 +377,7 @@ def request_output_to_completion_response(
            choices=choices,
            usage=usage,
        )


    async def create_embeddings(self, input: str, model: str):
        raise NotImplementedError
5 changes: 0 additions & 5 deletions vllm/model_executor/layers/attention/__init__.py
@@ -1,5 +0,0 @@
from vllm.model_executor.layers.attention.attention import Attention
Collaborator:
This file should not be touched


__all__ = [
    "Attention",
]
1 change: 0 additions & 1 deletion vllm/model_executor/model_loader.py
@@ -41,7 +41,6 @@ def get_model(model_config: ModelConfig, device_config: DeviceConfig,
**kwargs) -> nn.Module:
lora_config = kwargs.get("lora_config", None)
model_class = _get_model_architecture(model_config)

Collaborator:
This file should not be touched

# Get the (maybe quantized) linear method.
linear_method = None
if model_config.quantization is not None:
2 changes: 2 additions & 0 deletions vllm/model_executor/models/__init__.py
@@ -46,6 +46,8 @@
"StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
"StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
"Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
# embedded model
"XLMRobertaModel": ("bgem3", "BGEM3FlagForCausalLM"),
Collaborator:
This should be updated to:

"XLMRobertaModel": ("xlm_roberta", "XLMRobertaModel")

Once we have migrated the model definition

}

# Models not supported by ROCm.