Dynamic Multi LoRA Load \ Delete Support #3496

Open

wants to merge 8 commits into base: main
14 changes: 14 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -77,6 +77,20 @@ async def show_available_models():
    return JSONResponse(content=models.model_dump())


@app.put("/-/lora_cache")
async def add_lora_request(request: LoRA, raw_request: Request):
    model_card = await openai_serving_chat.add_lora(request)
    return JSONResponse(content=model_card.model_dump())


@app.delete("/-/lora_cache/{model}")
async def delete_lora_request(model: str):
    delete_response = await openai_serving_chat.delete_lora(model)
    return JSONResponse(content=delete_response.model_dump())
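
A rough client-side usage sketch for the two endpoints above (assumes a server at localhost:8000; the adapter name and path are placeholders, and the payload field names are inferred from the serving_engine change further down):

import requests

BASE = "http://localhost:8000"  # assumed local deployment

# Load an adapter into the LoRA cache (name/path are hypothetical).
resp = requests.put(f"{BASE}/-/lora_cache",
                    json={"name": "sql-lora",
                          "local_path": "/adapters/sql-lora"})
print(resp.json())  # model card for the newly registered adapter

# Unload the adapter again by its model name.
resp = requests.delete(f"{BASE}/-/lora_cache/sql-lora")
print(resp.json())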


@app.get("/version")
async def show_version():
    ver = {"version": vllm.__version__}
6 changes: 6 additions & 0 deletions vllm/entrypoints/openai/protocol.py
@@ -55,6 +55,12 @@ class UsageInfo(BaseModel):
    completion_tokens: Optional[int] = 0


class DeleteResponse(BaseModel):
    id: str
    object: str = "model"
    deleted: Optional[bool] = False
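
A quick sanity check of the wire format this model produces (the adapter id is a placeholder):

from vllm.entrypoints.openai.protocol import DeleteResponse

resp = DeleteResponse(id="sql-lora", deleted=True)
print(resp.model_dump())
# -> {'id': 'sql-lora', 'object': 'model', 'deleted': True}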


class ResponseFormat(BaseModel):
    # type must be "json_object" or "text"
    type: str = Literal["text", "json_object"]
34 changes: 33 additions & 1 deletion vllm/entrypoints/openai/serving_engine.py
@@ -10,7 +10,7 @@
                                        ChatCompletionRequest,
                                        ErrorResponse, LogProbs,
                                        ModelCard, ModelList,
-                                       ModelPermission)
+                                       ModelPermission, DeleteResponse)
from vllm.lora.request import LoRARequest
from vllm.sequence import Logprob

@@ -84,6 +84,38 @@ async def show_available_models(self) -> ModelList:
        model_cards.extend(lora_cards)
        return ModelList(data=model_cards)


    async def add_lora(self, lora: LoRA) -> ModelCard:
        # Derive the next LoRA id from the last registered request.
        lora_idx = 1
        if self.lora_requests:
            lora_idx = self.lora_requests[-1].lora_int_id + 1

        self.lora_requests.append(LoRARequest(
            lora_name=lora.name,
            lora_int_id=lora_idx,
            lora_local_path=lora.local_path,
        ))
Comment on lines +89 to +98
Collaborator:

let's have an increment-only counter for LoRA ids instead?

also, is it possible to check if the lora has already been added before? we can use the local_path
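
A minimal sketch of that suggestion, not part of this PR (the LoRARegistry helper and its field names are hypothetical; itertools.count stands in for the increment-only counter, and the local_path map handles the already-added check):

from itertools import count
from typing import Dict, Optional

from vllm.lora.request import LoRARequest

class LoRARegistry:
    """Sketch: monotonically increasing ids, dedup by local_path."""

    def __init__(self) -> None:
        self._next_id = count(start=1)  # increment-only; ids are never reused
        self._by_path: Dict[str, LoRARequest] = {}

    def add(self, name: str, local_path: str) -> LoRARequest:
        # Re-adding the same adapter returns the existing request.
        if local_path in self._by_path:
            return self._by_path[local_path]
        request = LoRARequest(lora_name=name,
                              lora_int_id=next(self._next_id),
                              lora_local_path=local_path)
        self._by_path[local_path] = request
        return request

    def remove(self, name: str) -> Optional[LoRARequest]:
        # Deletion never touches the counter, so surviving ids stay stable.
        for path, request in list(self._by_path.items()):
            if request.lora_name == name:
                return self._by_path.pop(path)
        return None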

Author:

sorry for the delay, I will be active on this now; I will resume work here.


        return ModelCard(id=lora.name,
                         root=self.served_model,
                         permission=[ModelPermission()])

    async def delete_lora(self, model: str) -> DeleteResponse:
        if model == self.served_model:
            raise ValueError(
                "Unsupported delete operation: the base model cannot be deleted")

        lora_idx = -1
        for idx, lora in enumerate(self.lora_requests):
            if model == lora.lora_name:
                lora_idx = idx
        if lora_idx != -1:
            lora = self.lora_requests.pop(lora_idx)
            return DeleteResponse(id=lora.lora_name,
                                  deleted=True)
Comment on lines +108 to +115
Collaborator:

see above - incrementing and decrementing the counter is brittle


        raise ValueError(f"The model {model} does not exist.")

    def _create_logprobs(
        self,
        token_ids: List[int],