support sentence transformer
jstzwj committed Jun 30, 2024
1 parent b5515f4 commit 2b28af1
Showing 3 changed files with 111 additions and 27 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -112,6 +112,12 @@ python -m langport.service.server.generation_worker --port 21001 --model-path <y
python -m langport.service.gateway.openai_api
```

If you need a single-node embeddings API server:
```bash
python -m langport.service.server.embedding_worker --port 21002 --model-path bert-base-chinese --gpus 0 --num-gpus 1
python -m langport.service.gateway.openai_api --port 8000 --controller-address http://localhost:21002
```
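Once both processes are up, the gateway speaks the OpenAI embeddings protocol; a quick smoke test (assuming the defaults above, with the gateway on port 8000 and the model exposed under the name `bert-base-chinese`):
```bash
curl http://localhost:8000/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "bert-base-chinese", "input": "Hello, world"}'
```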

If you need the embeddings API or other features, you can deploy a distributed inference cluster:
``` bash
python -m langport.service.server.dummy_worker --port 21001
77 changes: 50 additions & 27 deletions langport/model/executor/embedding/huggingface.py
@@ -1,13 +1,14 @@
import os
import time
import traceback
from typing import List, Optional

import torch
from huggingface_hub import hf_hub_download
from langport.model.executor.huggingface import HuggingfaceExecutor
from langport.protocol.worker_protocol import BaseWorkerResult, EmbeddingWorkerResult, EmbeddingsObject, UsageInfo
from langport.workers.embedding_worker import EmbeddingModelWorker


class HuggingfaceEmbeddingExecutor(HuggingfaceExecutor):
def __init__(
self,
@@ -46,11 +47,29 @@ def __init__(
self.adapter = None
self.model = None
self.tokenizer = None
self.adapter, self.model, self.tokenizer = self.load_model(
model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading, deepspeed, gptq, group_size, trust_remote_code, offload_folder
)

if hasattr(self.model.config, "max_sequence_length"):
if os.path.exists(model_path):
if not os.path.exists(os.path.join(model_path, "modules.json")):
modules_file = ""
else:
with open(os.path.join(model_path, "modules.json"), "r", encoding="utf-8") as f:
modules_file = f.read()
else:
modules_file = hf_hub_download(repo_id=model_path, filename="modules.json")
if "sentence_transformers" in modules_file:
self.adapter, self.model, self.tokenizer = self.load_sentence_transformer_model(
model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading,
deepspeed, gptq, group_size, trust_remote_code, offload_folder
)
else:
self.adapter, self.model, self.tokenizer = self.load_model(
model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading,
deepspeed, gptq, group_size, trust_remote_code, offload_folder
)

if hasattr(self.model, "max_seq_length"):
self._context_len = self.model.max_seq_length
elif hasattr(self.model.config, "max_sequence_length"):
self._context_len = self.model.config.max_sequence_length
elif hasattr(self.model.config, "max_position_embeddings"):
self._context_len = self.model.config.max_position_embeddings
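For reference, the detection added above boils down to the following standalone check; this is a condensed sketch of the intent, and the helper name `is_sentence_transformer` is introduced here for illustration only (it is not part of the commit):
```python
import os
from huggingface_hub import hf_hub_download

def is_sentence_transformer(model_path: str) -> bool:
    """Heuristic: treat a checkpoint as a sentence-transformers model when its
    modules.json manifest references the sentence_transformers package."""
    if os.path.exists(model_path):
        manifest = os.path.join(model_path, "modules.json")
        if not os.path.exists(manifest):
            return False
        with open(manifest, "r", encoding="utf-8") as f:
            return "sentence_transformers" in f.read()
    # For Hub repo IDs, download the manifest and inspect its contents.
    try:
        manifest = hf_hub_download(repo_id=model_path, filename="modules.json")
    except Exception:
        return False
    with open(manifest, "r", encoding="utf-8") as f:
        return "sentence_transformers" in f.read()
```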
@@ -93,7 +112,7 @@ def context_length(self) -> int:
def tokenize(self, text: str) -> List[int]:
input_ids = self.tokenizer(text).input_ids
return input_ids

# Mean Pooling - Take attention mask into account for correct averaging
def _mean_pooling(self, model_output, attention_mask):
token_embeddings = model_output[0] # First element of model_output contains all token embeddings
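The body of `_mean_pooling` is collapsed in this diff view; for context, the standard attention-mask-weighted mean pooling its comment describes looks roughly like this (a sketch, not the exact code from the file):
```python
import torch

def mean_pooling(model_output, attention_mask):
    # First element of model_output contains the per-token embeddings.
    token_embeddings = model_output[0]
    # Broadcast the attention mask over the embedding dimension so padded tokens contribute nothing.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Sum the real token embeddings and divide by the number of real tokens (clamped to avoid division by zero).
    return torch.sum(token_embeddings * mask, dim=1) / torch.clamp(mask.sum(dim=1), min=1e-9)
```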
@@ -130,7 +149,7 @@ def inference(self, worker: "EmbeddingModelWorker"):
prompts_index.extend([task_i] * len(task_input))
else:
raise Exception("Invalid prompt type...")

try:
tokenizer = self.tokenizer
model = self.model
@@ -141,28 +160,32 @@ def inference(self, worker: "EmbeddingModelWorker"):
if tokenizer._pad_token is None:
tokenizer.pad_token = tokenizer.eos_token

encoded_prompts = tokenizer(prompts, return_tensors="pt", padding="longest").to(self.device)
input_ids = encoded_prompts.input_ids
if model.config.is_encoder_decoder:
decoder_input_ids = torch.full(
(len(prompts), 1),
model.generation_config.decoder_start_token_id,
dtype=torch.long,
device=self.device,
)
model_output = model(input_ids, decoder_input_ids=decoder_input_ids, output_hidden_states=True)
data = model_output.decoder_hidden_states[-1]
elif model.config.is_decoder:
model_output = model(input_ids, output_hidden_states=True)
is_chatglm = "chatglm" in str(type(model)).lower()
if is_chatglm:
data = model_output.hidden_states[-1].transpose(0, 1)
if model.__class__.__module__ + '.' + model.__class__.__name__ != 'sentence_transformers.SentenceTransformer.SentenceTransformer':
encoded_prompts = tokenizer(prompts, return_tensors="pt", padding="longest").to(self.device)
input_ids = encoded_prompts.input_ids
if model.config.is_encoder_decoder:
decoder_input_ids = torch.full(
(len(prompts), 1),
model.generation_config.decoder_start_token_id,
dtype=torch.long,
device=self.device,
)
model_output = model(input_ids, decoder_input_ids=decoder_input_ids, output_hidden_states=True)
data = model_output.decoder_hidden_states[-1]
elif model.config.is_decoder:
model_output = model(input_ids, output_hidden_states=True)
is_chatglm = "chatglm" in str(type(model)).lower()
if is_chatglm:
data = model_output.hidden_states[-1].transpose(0, 1)
else:
data = model_output.hidden_states[-1]
else:
data = model_output.hidden_states[-1]
data = model(**encoded_prompts)
# embeddings = torch.mean(data, dim=1)
embeddings = self._mean_pooling(data, encoded_prompts['attention_mask'])
else:
data = model(**encoded_prompts)
# embeddings = torch.mean(data, dim=1)
embeddings = self._mean_pooling(data, encoded_prompts['attention_mask'])
embeddings = model.encode(prompts)
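# Note: SentenceTransformer.encode handles tokenization, batching, and pooling internally,
# so the manual mean pooling used on the plain-transformers path is not needed here.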

for task_i, cur_task in enumerate(tasks):
token_num = 0
embedding_list = []
55 changes: 55 additions & 0 deletions langport/model/executor/huggingface.py
@@ -141,6 +141,61 @@ def _load_hf_model(self, adapter, model_path: str, from_pretrained_kwargs: dict)

return model, tokenizer

def load_sentence_transformer_model(
self,
model_path: str,
device: str,
num_gpus: int,
max_gpu_memory: Optional[str] = None,
quantization: Optional[str] = None,
cpu_offloading: bool = False,
deepspeed: bool = False,
gptq: bool = False,
group_size: Optional[int] = None,
trust_remote_code: bool = False,
offload_folder: Optional[str] = None,
debug: bool = False,
):
"""Load a model from Hugging Face."""
from sentence_transformers import SentenceTransformer
adapter = get_model_adapter(model_path)

kwargs = {}
if device == "cpu":
kwargs["torch_dtype"] = torch.float32
elif device == "cuda":
kwargs["torch_dtype"] = torch.float16
if num_gpus != 1:
kwargs["device_map"] = "auto"
if max_gpu_memory is None:
kwargs["device_map"] = "sequential" # This is important for not the same VRAM sizes
available_gpu_memory = get_gpu_memory(num_gpus)
if len(available_gpu_memory) == 0:
kwargs["device_map"] = "auto"
elif all([mem == available_gpu_memory[0] for mem in available_gpu_memory]):
kwargs["device_map"] = "balanced"
else:
kwargs["max_memory"] = {
i: str(int(available_gpu_memory[i] * 0.55)) + "GiB"
for i in range(num_gpus)
}
else:
kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
elif device == "mps":
kwargs["torch_dtype"] = torch.float16
# Avoid bugs in mps backend by not using in-place operations.
replace_llama_attn_with_non_inplace_operations()
else:
raise ValueError(f"Invalid device: {device}")

kwargs["trust_remote_code"] = trust_remote_code
if offload_folder is not None:
kwargs["offload_folder"] = offload_folder

model = SentenceTransformer(model_path, device=device, trust_remote_code=trust_remote_code, model_kwargs=kwargs)
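# Note: sentence-transformers forwards model_kwargs to the underlying transformers model
# (supported in recent sentence-transformers releases); the tokenizer below is loaded
# separately so the worker can still count tokens for usage reporting.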
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=trust_remote_code)
return adapter, model, tokenizer

def load_model(
self,
model_path: str,
