support sentence transformer
jstzwj committed Jun 30, 2024
1 parent b5515f4 commit 2b28af1
Showing 3 changed files with 111 additions and 27 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -112,6 +112,12 @@ python -m langport.service.server.generation_worker --port 21001 --model-path <y
python -m langport.service.gateway.openai_api
```

If you need a single-node embeddings API server:
```bash
python -m langport.service.server.embedding_worker --port 21002 --model-path bert-base-chinese --gpus 0 --num-gpus 1
python -m langport.service.gateway.openai_api --port 8000 --controller-address http://localhost:21002
```
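Once both processes are up, the gateway speaks the OpenAI embeddings protocol; a quick smoke test (assuming the defaults above, with the gateway on port 8000 and the model exposed under the name `bert-base-chinese`):
```bash
curl http://localhost:8000/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "bert-base-chinese", "input": "Hello, world"}'
```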

If you need the embeddings API or other features, you can deploy a distributed inference cluster:
``` bash
python -m langport.service.server.dummy_worker --port 21001
77 changes: 50 additions & 27 deletions langport/model/executor/embedding/huggingface.py
@@ -1,13 +1,14 @@
import os
import time
import traceback
from typing import List, Optional

import torch
from huggingface_hub import hf_hub_download
from langport.model.executor.huggingface import HuggingfaceExecutor
from langport.protocol.worker_protocol import BaseWorkerResult, EmbeddingWorkerResult, EmbeddingsObject, UsageInfo
from langport.workers.embedding_worker import EmbeddingModelWorker


class HuggingfaceEmbeddingExecutor(HuggingfaceExecutor):
def __init__(
self,
@@ -46,11 +47,29 @@ def __init__(
self.adapter = None
self.model = None
self.tokenizer = None
self.adapter, self.model, self.tokenizer = self.load_model(
model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading, deepspeed, gptq, group_size, trust_remote_code, offload_folder
)

if hasattr(self.model.config, "max_sequence_length"):
if os.path.exists(model_path):
if not os.path.exists(os.path.join(model_path, "modules.json")):
modules_file = ""
else:
with open(os.path.join(model_path, "modules.json"), "r", encoding="utf-8") as f:
modules_file = f.read()
else:
modules_file = hf_hub_download(repo_id=model_path, filename="modules.json")
if "sentence_transformers" in modules_file:
self.adapter, self.model, self.tokenizer = self.load_sentence_transformer_model(
model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading,
deepspeed, gptq, group_size, trust_remote_code, offload_folder
)
else:
self.adapter, self.model, self.tokenizer = self.load_model(
model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading,
deepspeed, gptq, group_size, trust_remote_code, offload_folder
)

if hasattr(self.model, "max_seq_length"):
self._context_len = self.model.max_seq_length
elif hasattr(self.model.config, "max_sequence_length"):
self._context_len = self.model.config.max_sequence_length
elif hasattr(self.model.config, "max_position_embeddings"):
self._context_len = self.model.config.max_position_embeddings
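For reference, the detection added above boils down to the following standalone check; this is a condensed sketch of the intent, and the helper name `is_sentence_transformer` is introduced here for illustration only (it is not part of the commit):
```python
import os
from huggingface_hub import hf_hub_download

def is_sentence_transformer(model_path: str) -> bool:
    """Heuristic: treat a checkpoint as a sentence-transformers model when its
    modules.json manifest references the sentence_transformers package."""
    if os.path.exists(model_path):
        manifest = os.path.join(model_path, "modules.json")
        if not os.path.exists(manifest):
            return False
        with open(manifest, "r", encoding="utf-8") as f:
            return "sentence_transformers" in f.read()
    # For Hub repo IDs, download the manifest and inspect its contents.
    try:
        manifest = hf_hub_download(repo_id=model_path, filename="modules.json")
    except Exception:
        return False
    with open(manifest, "r", encoding="utf-8") as f:
        return "sentence_transformers" in f.read()
```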
@@ -93,7 +112,7 @@ def context_length(self) -> int:
def tokenize(self, text: str) -> List[int]:
input_ids = self.tokenizer(text).input_ids
return input_ids

# Mean Pooling - Take attention mask into account for correct averaging
def _mean_pooling(self, model_output, attention_mask):
token_embeddings = model_output[0] # First element of model_output contains all token embeddings
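The body of `_mean_pooling` is collapsed in this diff view; for context, the standard attention-mask-weighted mean pooling its comment describes looks roughly like this (a sketch, not the exact code from the file):
```python
import torch

def mean_pooling(model_output, attention_mask):
    # First element of model_output contains the per-token embeddings.
    token_embeddings = model_output[0]
    # Broadcast the attention mask over the embedding dimension so padded tokens contribute nothing.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Sum the real token embeddings and divide by the number of real tokens (clamped to avoid division by zero).
    return torch.sum(token_embeddings * mask, dim=1) / torch.clamp(mask.sum(dim=1), min=1e-9)
```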
@@ -130,7 +149,7 @@ def inference(self, worker: "EmbeddingModelWorker"):
prompts_index.extend([task_i] * len(task_input))
else:
raise Exception("Invalid prompt type...")

try:
tokenizer = self.tokenizer
model = self.model
@@ -141,28 +160,32 @@ def inference(self, worker: "EmbeddingModelWorker"):
if tokenizer._pad_token is None:
tokenizer.pad_token = tokenizer.eos_token

encoded_prompts = tokenizer(prompts, return_tensors="pt", padding="longest").to(self.device)
input_ids = encoded_prompts.input_ids
if model.config.is_encoder_decoder:
decoder_input_ids = torch.full(
(len(prompts), 1),
model.generation_config.decoder_start_token_id,
dtype=torch.long,
device=self.device,
)
model_output = model(input_ids, decoder_input_ids=decoder_input_ids, output_hidden_states=True)
data = model_output.decoder_hidden_states[-1]
elif model.config.is_decoder:
model_output = model(input_ids, output_hidden_states=True)
is_chatglm = "chatglm" in str(type(model)).lower()
if is_chatglm:
data = model_output.hidden_states[-1].transpose(0, 1)
if model.__class__.__module__ + '.' + model.__class__.__name__ != 'sentence_transformers.SentenceTransformer.SentenceTransformer':
encoded_prompts = tokenizer(prompts, return_tensors="pt", padding="longest").to(self.device)
input_ids = encoded_prompts.input_ids
if model.config.is_encoder_decoder:
decoder_input_ids = torch.full(
(len(prompts), 1),
model.generation_config.decoder_start_token_id,
dtype=torch.long,
device=self.device,
)
model_output = model(input_ids, decoder_input_ids=decoder_input_ids, output_hidden_states=True)
data = model_output.decoder_hidden_states[-1]
elif model.config.is_decoder:
model_output = model(input_ids, output_hidden_states=True)
is_chatglm = "chatglm" in str(type(model)).lower()
if is_chatglm:
data = model_output.hidden_states[-1].transpose(0, 1)
else:
data = model_output.hidden_states[-1]
else:
data = model_output.hidden_states[-1]
data = model(**encoded_prompts)
# embeddings = torch.mean(data, dim=1)
embeddings = self._mean_pooling(data, encoded_prompts['attention_mask'])
else:
data = model(**encoded_prompts)
# embeddings = torch.mean(data, dim=1)
embeddings = self._mean_pooling(data, encoded_prompts['attention_mask'])
embeddings = model.encode(prompts)
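# Note: SentenceTransformer.encode handles tokenization, batching, and pooling internally,
# so the manual mean pooling used on the plain-transformers path is not needed here.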

for task_i, cur_task in enumerate(tasks):
token_num = 0
embedding_list = []
55 changes: 55 additions & 0 deletions langport/model/executor/huggingface.py
@@ -141,6 +141,61 @@ def _load_hf_model(self, adapter, model_path: str, from_pretrained_kwargs: dict)

return model, tokenizer

def load_sentence_transformer_model(
self,
model_path: str,
device: str,
num_gpus: int,
max_gpu_memory: Optional[str] = None,
quantization: Optional[str] = None,
cpu_offloading: bool = False,
deepspeed: bool = False,
gptq: bool = False,
group_size: Optional[int] = None,
trust_remote_code: bool = False,
offload_folder: Optional[str] = None,
debug: bool = False,
):
"""Load a model from Hugging Face."""
from sentence_transformers import SentenceTransformer
adapter = get_model_adapter(model_path)

kwargs = {}
if device == "cpu":
kwargs["torch_dtype"] = torch.float32
elif device == "cuda":
kwargs["torch_dtype"] = torch.float16
if num_gpus != 1:
kwargs["device_map"] = "auto"
if max_gpu_memory is None:
kwargs["device_map"] = "sequential" # This is important for not the same VRAM sizes
available_gpu_memory = get_gpu_memory(num_gpus)
if len(available_gpu_memory) == 0:
kwargs["device_map"] = "auto"
elif all([mem == available_gpu_memory[0] for mem in available_gpu_memory]):
kwargs["device_map"] = "balanced"
else:
kwargs["max_memory"] = {
i: str(int(available_gpu_memory[i] * 0.55)) + "GiB"
for i in range(num_gpus)
}
else:
kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
elif device == "mps":
kwargs["torch_dtype"] = torch.float16
# Avoid bugs in mps backend by not using in-place operations.
replace_llama_attn_with_non_inplace_operations()
else:
raise ValueError(f"Invalid device: {device}")

kwargs["trust_remote_code"] = trust_remote_code
if offload_folder is not None:
kwargs["offload_folder"] = offload_folder

model = SentenceTransformer(model_path, device=device, trust_remote_code=trust_remote_code, model_kwargs=kwargs)
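# Note: sentence-transformers forwards model_kwargs to the underlying transformers model
# (supported in recent sentence-transformers releases); the tokenizer below is loaded
# separately so the worker can still count tokens for usage reporting.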
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=trust_remote_code)
return adapter, model, tokenizer

def load_model(
self,
model_path: str,
