
Feature/CLI #8


Merged

37 commits merged on Jul 30, 2024
Commits
3729b61
Update poetry lock
XkunW May 28, 2024
fc49b78
Add docker image for default environment
XkunW May 30, 2024
e597136
Update docker image to not create a virtual env
XkunW Jun 4, 2024
18647db
Update version
XkunW Jun 4, 2024
b1571a0
Test container with single node llama 3
XkunW Jun 5, 2024
1dfd1c8
Add vllm-nccl-cu12 as dependency
XkunW Jun 5, 2024
00c469c
Update Dockerfile
XkunW Jun 5, 2024
966ed93
Move nccl file location
XkunW Jun 6, 2024
f986875
Update poetry lock, add mistral models, update default env to use sin…
XkunW Jun 6, 2024
8731c93
Update README installation instructions
XkunW Jun 6, 2024
2b9bdf4
Update env var name
XkunW Jun 6, 2024
af0ad0c
Move Poetry cache dir to working dir
XkunW Jun 11, 2024
8109795
Clone from main
XkunW Jun 12, 2024
b669354
Update to use vLLM 0.5.0
XkunW Jun 13, 2024
d28f03f
Add vim installation, remove cache directory as it is unused
XkunW Jun 13, 2024
c4dbed0
Update examples to include VLM completions, add profiling scripts
XkunW Jun 13, 2024
c9bd432
Added support for VLMs - llava-1.5 and llava-next, updated default en…
XkunW Jun 13, 2024
f60c3f1
Fixed data type override logic, added --time argument
XkunW Jun 13, 2024
045fc81
Accidentally removed variant argument in previous commits, adding it …
XkunW Jun 17, 2024
07fbe33
Set default image input args for VLM models
XkunW Jun 17, 2024
57087f9
Update Llava 1.5 README
XkunW Jun 17, 2024
e88da1f
Update models README
XkunW Jun 17, 2024
2e465e1
Update README.md to reflect refactoring in examples folder
XkunW Jun 17, 2024
65bf554
Update README.md to reflect refactored changes
XkunW Jun 17, 2024
4b608be
refactoring v1.
kohankhaki Jun 20, 2024
9e79c31
removed launch server from each model's directory.
kohankhaki Jun 20, 2024
96a7233
removed MODEL_EXT
kohankhaki Jun 20, 2024
9e42483
Update config files, consolidate all job launching bash scripts to sa…
XkunW Jun 21, 2024
7e64ecb
Fix file path issues with the consolidated launch script
XkunW Jun 24, 2024
4054b3b
Update README according to refactor
XkunW Jun 24, 2024
fc84a0b
Update model variant names for llama2, added CodeLlama
XkunW Jul 6, 2024
1f1cec7
Bump version
XkunW Jul 6, 2024
6b116e8
Update version
XkunW Jul 25, 2024
b5ad503
Add CLI, update repo into a package, added llama 3.1 and gemma 2
XkunW Jul 30, 2024
a558b96
Bump version to 1.0.0
XkunW Jul 30, 2024
3dbbcb5
Merge branch 'develop' into feature/cli
XkunW Jul 30, 2024
2bde47b
Delete old files left unresolved from merge, delete a comment
XkunW Jul 30, 2024
9 changes: 3 additions & 6 deletions Dockerfile
@@ -54,19 +54,16 @@ RUN python3.10 -m pip install --upgrade pip
RUN python3.10 -m pip install poetry

# Clone the repository
RUN git clone https://github.com/VectorInstitute/vector-inference /vec-inf
RUN git clone -b develop https://github.com/VectorInstitute/vector-inference /vec-inf

# Set the working directory
WORKDIR /vec-inf

# Configure Poetry to not create virtual environments
RUN poetry config virtualenvs.create false

# Update Poetry lock file if necessary
RUN poetry lock

# Install project dependencies via Poetry
RUN poetry install
# Install vec-inf
RUN python3.10 -m pip install .[dev]

# Install Flash Attention 2 backend
RUN python3.10 -m pip install flash-attn --no-build-isolation
2,325 changes: 1,458 additions & 867 deletions poetry.lock

Large diffs are not rendered by default.

24 changes: 16 additions & 8 deletions pyproject.toml
@@ -1,18 +1,26 @@
[tool.poetry]
name = "vector-inference"
version = "0.2.1"
name = "vec-inf"
version = "1.0.0"
description = "Efficient LLM inference on Slurm clusters using vLLM."
authors = ["XkunW <marshall.wang@vectorinstitute.ai>"]
authors = ["Marshall Wang <marshall.wang@vectorinstitute.ai>"]
license = "MIT license"
readme = "README.md"
package-mode = false

[tool.poetry.dependencies]
python = "^3.10"
vllm = { version = "^0.5.0", allow-prereleases = true }
vllm-nccl-cu12 = ">=2.18,<2.19"
ray = "^2.9.3"
cupy-cuda12x = "12.1.0"
requests = "^2.31.0"
click = "^8.1.0"
rich = "^13.7.0"
vllm = { version = "^0.5.0", optional = true }
vllm-nccl-cu12 = { version = ">=2.18,<2.19", optional = true }
ray = { version = "^2.9.3", optional = true }
cupy-cuda12x = { version = "12.1.0", optional = true }

[tool.poetry.extras]
dev = ["vllm", "vllm-nccl-cu12", "ray", "cupy-cuda12x"]

[tool.poetry.scripts]
vec-inf = "vec_inf.cli._cli:cli"

[build-system]
requires = ["poetry-core"]
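The packaging change above splits the project into a lightweight CLI install plus an optional dev extra carrying the heavy inference dependencies, and registers a vec-inf console script pointing at the Click group. As a quick illustration (not part of the PR), a minimal sketch that checks the entry point resolves after installation, using the Python 3.10+ importlib.metadata API:

from importlib.metadata import entry_points

# Console scripts are registered under the "console_scripts" group;
# the PR maps the name "vec-inf" to vec_inf.cli._cli:cli.
for ep in entry_points(group="console_scripts"):
    if ep.name == "vec-inf":
        cli_group = ep.load()  # imports and returns the Click group
        print(f"{ep.name} -> {ep.value}")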
File renamed without changes.
Empty file added vec_inf/__init__.py
Empty file.
Empty file added vec_inf/cli/__init__.py
Empty file.
211 changes: 211 additions & 0 deletions vec_inf/cli/_cli.py
@@ -0,0 +1,211 @@
import os

import click
from rich.console import Console

from ._utils import run_bash_command, is_server_running, model_health_check, get_base_url, create_table


console = Console()


@click.group()
def cli():
    """Main CLI group"""
    pass

@cli.command("launch")
@click.argument(
    "model-family",
    type=str,
    nargs=1
)
@click.option(
    "--model-variant",
    type=str,
    help='The model variant according to the README in `models/model-family`'
)
@click.option(
    "--partition",
    type=str,
    help='Type of compute partition, defaults to a40'
)
@click.option(
    "--num-nodes",
    type=int,
    help='Number of nodes to use, defaults to the suggested resource allocation for the model'
)
@click.option(
    "--num-gpus",
    type=int,
    help='Number of GPUs per node to use, defaults to the suggested resource allocation for the model'
)
@click.option(
    "--qos",
    type=str,
    help='Quality of service, defaults to m3'
)
@click.option(
    "--time",
    type=str,
    help='Time limit for the job; must comply with the QoS limits, defaults to 4:00:00'
)
@click.option(
    "--data-type",
    type=str,
    help='Model data type, defaults to auto'
)
@click.option(
    "--venv",
    type=str,
    help='Path to virtual environment'
)
@click.option(
    "--is-vlm",
    type=bool,
    help='Whether the model is a VLM, defaults to False'
)
@click.option(
    "--image-input-type",
    type=str,
    help='The image input type passed into vLLM, defaults to pixel_values'
)
@click.option(
    "--image-token-id",
    type=str,
    help='Input ID for the image token; defaults to the HF config value, set according to the model'
)
@click.option(
    "--image-input-shape",
    type=str,
    help='The largest image input shape for the given input type; default value set according to the model'
)
@click.option(
    "--image-feature-size",
    type=str,
    help='The image feature size along the context dimension; default value set according to the model'
)
@click.option(
    "--json-mode",
    is_flag=True,
    help='Output as a JSON string',
)
def launch(
    model_family: str,
    model_variant: str=None,
    partition: str=None,
    num_nodes: int=None,
    num_gpus: int=None,
    qos: str=None,
    time: str=None,
    data_type: str=None,
    venv: str=None,
    is_vlm: bool=None,
    image_input_type: str=None,
    image_token_id: str=None,
    image_input_shape: str=None,
    image_feature_size: str=None,
    json_mode: bool=False
) -> None:
    """
    Launch a model on the cluster
    """
    # Collect the launch parameters by name; json_mode is excluded because it
    # only affects how this command displays its output.
    input_args_list = list(locals().keys())
    input_args_list.remove("json_mode")
    launch_script_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
        "launch_server.sh"
    )
    # Forward every argument the user supplied to the launch script as a
    # "--flag value" pair.
    launch_cmd = f"bash {launch_script_path}"
    for arg in input_args_list:
        if locals()[arg] is not None:
            named_arg = arg.replace("_", "-")
            launch_cmd += f" --{named_arg} {locals()[arg]}"
    output = run_bash_command(launch_cmd)

    # The last token of the launch output is the Slurm job ID; the preceding
    # lines echo the job configuration as "Key: Value" pairs.
    slurm_job_id = output.split(" ")[-1].strip()
    output_lines = output.split("\n")[:-2]

    table = create_table(key_title="Job Config", value_title="Value")
    table.add_row("Slurm Job ID", slurm_job_id, style="blue")
    output_dict = {"slurm_job_id": slurm_job_id}

    for line in output_lines:
        key, value = line.split(": ")
        table.add_row(key, value)
        output_dict[key.lower().replace(" ", "_")] = value

    if json_mode:
        click.echo(output_dict)
    else:
        console.print(table)


@cli.command("status")
@click.argument(
    "slurm_job_id",
    type=int,
    nargs=1
)
@click.option(
    "--json-mode",
    is_flag=True,
    help='Output as a JSON string',
)
def status(slurm_job_id: int, json_mode: bool=False) -> None:
    """
    Get the status of a running model on the cluster
    """
    status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
    output = run_bash_command(status_cmd)

    slurm_job_name = "UNAVAILABLE"
    status = "SHUTDOWN"
    base_url = "UNAVAILABLE"

    try:
        slurm_job_name = output.split(" ")[1].split("=")[1]
        slurm_job_state = output.split(" ")[9].split("=")[1]
        # If Slurm job is currently PENDING
        if slurm_job_state == "PENDING":
            status = "PENDING"
        # If Slurm job is currently RUNNING
        elif slurm_job_state == "RUNNING":
            # Check whether the server is ready, if yes, run model health check to further determine status
            server_status = is_server_running(slurm_job_name, slurm_job_id)
            if server_status == "RUNNING":
                status = model_health_check(slurm_job_name)
                if status == "READY":
                    # Only set base_url if model is ready to serve requests
                    base_url = get_base_url(slurm_job_name)
            else:
                status = server_status
    except IndexError:
        # scontrol output could not be parsed, e.g. the job no longer exists
        pass

    if json_mode:
        click.echo(f'{{"model_name": "{slurm_job_name}", "model_status": "{status}", "base_url": "{base_url}"}}')
    else:
        table = create_table(key_title="Job Status", value_title="Value")
        table.add_row("Model Name", slurm_job_name)
        table.add_row("Model Status", status, style="blue")
        table.add_row("Base URL", base_url)
        console.print(table)


@cli.command("shutdown")
@click.argument(
    "slurm_job_id",
    type=int,
    nargs=1
)
def shutdown(slurm_job_id: int) -> None:
    """
    Shutdown a running model on the cluster
    """
    shutdown_cmd = f"scancel {slurm_job_id}"
    run_bash_command(shutdown_cmd)
    click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")


if __name__ == '__main__':
    cli()
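The file above wires three subcommands (launch, status, shutdown) into a single Click group. A minimal sketch of exercising the group in-process with Click's test runner; it assumes the package is installed, and the model-family and variant names below are placeholders, since launching submits a real Slurm job:

from click.testing import CliRunner

from vec_inf.cli._cli import cli

runner = CliRunner()

# Safe anywhere: list the registered subcommands.
result = runner.invoke(cli, ["--help"])
print(result.output)

# On a Slurm cluster this would submit a job and print the job config table:
# runner.invoke(cli, ["launch", "llama-2", "--model-variant", "7b"])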
94 changes: 94 additions & 0 deletions vec_inf/cli/_utils.py
@@ -0,0 +1,94 @@
import subprocess
import os

import requests
from rich.table import Table


MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"


def run_bash_command(command: str) -> str:
    """
    Run a bash command and return the output
    """
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    stdout, _ = process.communicate()
    return stdout


def get_model_dir(slurm_job_name: str) -> str:
    """
    Get the directory of a model
    """
    models_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
        "models"
    )
    model_dir = ""
    # The Slurm job name embeds the model family, which matches a directory name
    for dir_name in os.listdir(models_dir):
        if dir_name in slurm_job_name.lower():
            model_dir = os.path.join(models_dir, dir_name)
            break
    return model_dir


def is_server_running(slurm_job_name: str, slurm_job_id: int) -> str:
    """
    Check if a model is ready to serve requests
    """
    model_dir = get_model_dir(slurm_job_name)

    try:
        file_path = os.path.join(model_dir, f"{slurm_job_name}.{slurm_job_id}.err")
        with open(file_path, 'r') as file:
            lines = file.readlines()
    except FileNotFoundError:
        return "LAUNCHING"

    for line in lines:
        if MODEL_READY_SIGNATURE in line:
            return "RUNNING"
    return "LAUNCHING"


def get_base_url(slurm_job_name: str) -> str:
    """
    Get the base URL of a model
    """
    model_dir = get_model_dir(slurm_job_name)
    try:
        file_path = os.path.join(model_dir, f".{slurm_job_name}_url")
        with open(file_path, 'r') as file:
            lines = file.readlines()
    except FileNotFoundError:
        return "UNAVAILABLE"
    return lines[0].strip()


def model_health_check(slurm_job_name: str) -> str:
    """
    Check the health of a running model on the cluster
    """
    base_url = get_base_url(slurm_job_name)
    health_check_url = base_url.replace("v1", "health")

    try:
        response = requests.get(health_check_url)
        # Check if the request was successful
        if response.status_code == 200:
            return "READY"
        else:
            return "FAILED"
    except requests.exceptions.RequestException:
        return "FAILED"


def create_table(key_title: str = "", value_title: str = "", show_header: bool = True) -> Table:
    """
    Create a table for displaying model status
    """
    table = Table(show_header=show_header, header_style="bold magenta")
    table.add_column(key_title, style="dim")
    table.add_column(value_title)
    return table
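For reference, a minimal sketch of how create_table pairs with rich's Console, mirroring the rendering path used by the status command; the row values here are made-up placeholders:

from rich.console import Console

from vec_inf.cli._utils import create_table

console = Console()
table = create_table(key_title="Job Status", value_title="Value")
table.add_row("Model Name", "llama-2-7b")  # placeholder values
table.add_row("Model Status", "READY", style="blue")
table.add_row("Base URL", "http://gpu-node:8080/v1")
console.print(table)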
File renamed without changes.