
Feature/CLI #8


Merged

37 commits merged on Jul 30, 2024
Commits
3729b61
Update poetry lock
XkunW May 28, 2024
fc49b78
Add docker image for default environment
XkunW May 30, 2024
e597136
Update docker image to not create a virtual env
XkunW Jun 4, 2024
18647db
Update version
XkunW Jun 4, 2024
b1571a0
Test container with single node llama 3
XkunW Jun 5, 2024
1dfd1c8
Add vllm-nccl-cu12 as dependency
XkunW Jun 5, 2024
00c469c
Update Dockerfile
XkunW Jun 5, 2024
966ed93
Move nccl file location
XkunW Jun 6, 2024
f986875
Update poetry lock, add mistral models, update default env to use sin…
XkunW Jun 6, 2024
8731c93
Update README installation instructions
XkunW Jun 6, 2024
2b9bdf4
Update env var name
XkunW Jun 6, 2024
af0ad0c
Move Poetry cache dir to working dir
XkunW Jun 11, 2024
8109795
Clone from main
XkunW Jun 12, 2024
b669354
Update to use vLLM 0.5.0
XkunW Jun 13, 2024
d28f03f
Add vim installation, remove cache directory as it is unused
XkunW Jun 13, 2024
c4dbed0
Update examples to include VLM completions, add profiling scripts
XkunW Jun 13, 2024
c9bd432
Added support for VLMs - llava-1.5 and llava-next, updated default en…
XkunW Jun 13, 2024
f60c3f1
Fixed data type override logic, added --time argument
XkunW Jun 13, 2024
045fc81
Accidentally removed variant argument in previous commits, adding it …
XkunW Jun 17, 2024
07fbe33
Set default image input args for VLM models
XkunW Jun 17, 2024
57087f9
Update Llava 1.5 README
XkunW Jun 17, 2024
e88da1f
Update models README
XkunW Jun 17, 2024
2e465e1
Update README.md to reflect refactoring in examples folder
XkunW Jun 17, 2024
65bf554
Update README.md to reflect refactored changes
XkunW Jun 17, 2024
4b608be
refactoring v1.
kohankhaki Jun 20, 2024
9e79c31
removed launch server from each model's directory.
kohankhaki Jun 20, 2024
96a7233
removed MODEL_EXT
kohankhaki Jun 20, 2024
9e42483
Update config files, consolidate all job launching bash scripts to sa…
XkunW Jun 21, 2024
7e64ecb
Fix file path issues with the consolidated launch script
XkunW Jun 24, 2024
4054b3b
Update README according to refactor
XkunW Jun 24, 2024
fc84a0b
Update model variant names for llama2, added CodeLlama
XkunW Jul 6, 2024
1f1cec7
Bump version
XkunW Jul 6, 2024
6b116e8
Update version
XkunW Jul 25, 2024
b5ad503
Add CLI, update repo into a package, added llama 3.1 and gemma 2
XkunW Jul 30, 2024
a558b96
Bump version to 1.0.0
XkunW Jul 30, 2024
3dbbcb5
Merge branch 'develop' into feature/cli
XkunW Jul 30, 2024
2bde47b
Delete old files left unresolved from merge, delete a comment
XkunW Jul 30, 2024
9 changes: 3 additions & 6 deletions Dockerfile
@@ -54,19 +54,16 @@ RUN python3.10 -m pip install --upgrade pip
RUN python3.10 -m pip install poetry

# Clone the repository
RUN git clone https://github.com/VectorInstitute/vector-inference /vec-inf
RUN git clone -b develop https://github.com/VectorInstitute/vector-inference /vec-inf

# Set the working directory
WORKDIR /vec-inf

# Configure Poetry to not create virtual environments
RUN poetry config virtualenvs.create false

# Update Poetry lock file if necessary
RUN poetry lock

# Install project dependencies via Poetry
RUN poetry install
# Install vec-inf
RUN python3.10 -m pip install .[dev]

# Install Flash Attention 2 backend
RUN python3.10 -m pip install flash-attn --no-build-isolation
2,325 changes: 1,458 additions & 867 deletions poetry.lock

Large diffs are not rendered by default.

24 changes: 16 additions & 8 deletions pyproject.toml
@@ -1,18 +1,26 @@
[tool.poetry]
name = "vector-inference"
version = "0.2.1"
name = "vec-inf"
version = "1.0.0"
description = "Efficient LLM inference on Slurm clusters using vLLM."
authors = ["XkunW <marshall.wang@vectorinstitute.ai>"]
authors = ["Marshall Wang <marshall.wang@vectorinstitute.ai>"]
license = "MIT license"
readme = "README.md"
package-mode = false

[tool.poetry.dependencies]
python = "^3.10"
vllm = { version = "^0.5.0", allow-prereleases = true }
vllm-nccl-cu12 = ">=2.18,<2.19"
ray = "^2.9.3"
cupy-cuda12x = "12.1.0"
requests = "^2.31.0"
click = "^8.1.0"
rich = "^13.7.0"
vllm = { version = "^0.5.0", optional = true }
vllm-nccl-cu12 = { version = ">=2.18,<2.19", optional = true }
ray = { version = "^2.9.3", optional = true }
cupy-cuda12x = { version = "12.1.0", optional = true }

[tool.poetry.extras]
dev = ["vllm", "vllm-nccl-cu12", "ray", "cupy-cuda12x"]

[tool.poetry.scripts]
vec-inf = "vec_inf.cli._cli:cli"

[build-system]
requires = ["poetry-core"]
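The packaging change above splits the project into a lightweight CLI install plus an optional dev extra carrying the heavy inference dependencies, and registers a vec-inf console script pointing at the Click group. As a quick illustration (not part of the PR), a minimal sketch that checks the entry point resolves after installation, using the Python 3.10+ importlib.metadata API:

from importlib.metadata import entry_points

# Console scripts are registered under the "console_scripts" group;
# the PR maps the name "vec-inf" to vec_inf.cli._cli:cli.
for ep in entry_points(group="console_scripts"):
    if ep.name == "vec-inf":
        cli_group = ep.load()  # imports and returns the Click group
        print(f"{ep.name} -> {ep.value}")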
File renamed without changes.
Empty file added vec_inf/__init__.py
Empty file.
Empty file added vec_inf/cli/__init__.py
Empty file.
211 changes: 211 additions & 0 deletions vec_inf/cli/_cli.py
@@ -0,0 +1,211 @@
import os

import click
from rich.console import Console

from ._utils import run_bash_command, is_server_running, model_health_check, get_base_url, create_table


console = Console()


@click.group()
def cli():
    """Main CLI group"""
    pass

@cli.command("launch")
@click.argument(
    "model-family",
    type=str,
    nargs=1
)
@click.option(
    "--model-variant",
    type=str,
    help='The model variant according to the README in `models/model-family`'
)
@click.option(
    "--partition",
    type=str,
    help='Type of compute partition, defaults to a40'
)
@click.option(
    "--num-nodes",
    type=int,
    help='Number of nodes to use, defaults to the suggested resource allocation for the model'
)
@click.option(
    "--num-gpus",
    type=int,
    help='Number of GPUs per node to use, defaults to the suggested resource allocation for the model'
)
@click.option(
    "--qos",
    type=str,
    help='Quality of service, defaults to m3'
)
@click.option(
    "--time",
    type=str,
    help='Time limit for the job; must comply with the QoS limits, defaults to 4:00:00'
)
@click.option(
    "--data-type",
    type=str,
    help='Model data type, defaults to auto'
)
@click.option(
    "--venv",
    type=str,
    help='Path to virtual environment'
)
@click.option(
    "--is-vlm",
    type=bool,
    help='Whether the model is a VLM, defaults to False'
)
@click.option(
    "--image-input-type",
    type=str,
    help='The image input type passed into vLLM, defaults to pixel_values'
)
@click.option(
    "--image-token-id",
    type=str,
    help='Input ID for the image token; defaults to the HF config value, set according to the model'
)
@click.option(
    "--image-input-shape",
    type=str,
    help='The largest image input shape for the given input type; default value set according to the model'
)
@click.option(
    "--image-feature-size",
    type=str,
    help='The image feature size along the context dimension; default value set according to the model'
)
@click.option(
    "--json-mode",
    is_flag=True,
    help='Output as a JSON string',
)
def launch(
    model_family: str,
    model_variant: str=None,
    partition: str=None,
    num_nodes: int=None,
    num_gpus: int=None,
    qos: str=None,
    time: str=None,
    data_type: str=None,
    venv: str=None,
    is_vlm: bool=None,
    image_input_type: str=None,
    image_token_id: str=None,
    image_input_shape: str=None,
    image_feature_size: str=None,
    json_mode: bool=False
) -> None:
    """
    Launch a model on the cluster
    """
    # Collect the launch parameters by name; json_mode is excluded because it
    # only affects how this command displays its output.
    input_args_list = list(locals().keys())
    input_args_list.remove("json_mode")
    launch_script_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
        "launch_server.sh"
    )
    # Forward every argument the user supplied to the launch script as a
    # "--flag value" pair.
    launch_cmd = f"bash {launch_script_path}"
    for arg in input_args_list:
        if locals()[arg] is not None:
            named_arg = arg.replace("_", "-")
            launch_cmd += f" --{named_arg} {locals()[arg]}"
    output = run_bash_command(launch_cmd)

    # The last token of the launch output is the Slurm job ID; the preceding
    # lines echo the job configuration as "Key: Value" pairs.
    slurm_job_id = output.split(" ")[-1].strip()
    output_lines = output.split("\n")[:-2]

    table = create_table(key_title="Job Config", value_title="Value")
    table.add_row("Slurm Job ID", slurm_job_id, style="blue")
    output_dict = {"slurm_job_id": slurm_job_id}

    for line in output_lines:
        key, value = line.split(": ")
        table.add_row(key, value)
        output_dict[key.lower().replace(" ", "_")] = value

    if json_mode:
        click.echo(output_dict)
    else:
        console.print(table)


@cli.command("status")
@click.argument(
    "slurm_job_id",
    type=int,
    nargs=1
)
@click.option(
    "--json-mode",
    is_flag=True,
    help='Output as a JSON string',
)
def status(slurm_job_id: int, json_mode: bool=False) -> None:
    """
    Get the status of a running model on the cluster
    """
    status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
    output = run_bash_command(status_cmd)

    slurm_job_name = "UNAVAILABLE"
    status = "SHUTDOWN"
    base_url = "UNAVAILABLE"

    try:
        slurm_job_name = output.split(" ")[1].split("=")[1]
        slurm_job_state = output.split(" ")[9].split("=")[1]
        # If Slurm job is currently PENDING
        if slurm_job_state == "PENDING":
            status = "PENDING"
        # If Slurm job is currently RUNNING
        elif slurm_job_state == "RUNNING":
            # Check whether the server is ready, if yes, run model health check to further determine status
            server_status = is_server_running(slurm_job_name, slurm_job_id)
            if server_status == "RUNNING":
                status = model_health_check(slurm_job_name)
                if status == "READY":
                    # Only set base_url if model is ready to serve requests
                    base_url = get_base_url(slurm_job_name)
            else:
                status = server_status
    except IndexError:
        # scontrol output could not be parsed, e.g. the job no longer exists
        pass

    if json_mode:
        click.echo(f'{{"model_name": "{slurm_job_name}", "model_status": "{status}", "base_url": "{base_url}"}}')
    else:
        table = create_table(key_title="Job Status", value_title="Value")
        table.add_row("Model Name", slurm_job_name)
        table.add_row("Model Status", status, style="blue")
        table.add_row("Base URL", base_url)
        console.print(table)


@cli.command("shutdown")
@click.argument(
    "slurm_job_id",
    type=int,
    nargs=1
)
def shutdown(slurm_job_id: int) -> None:
    """
    Shutdown a running model on the cluster
    """
    shutdown_cmd = f"scancel {slurm_job_id}"
    run_bash_command(shutdown_cmd)
    click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")


if __name__ == '__main__':
    cli()
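The file above wires three subcommands (launch, status, shutdown) into a single Click group. A minimal sketch of exercising the group in-process with Click's test runner; it assumes the package is installed, and the model-family and variant names below are placeholders, since launching submits a real Slurm job:

from click.testing import CliRunner

from vec_inf.cli._cli import cli

runner = CliRunner()

# Safe anywhere: list the registered subcommands.
result = runner.invoke(cli, ["--help"])
print(result.output)

# On a Slurm cluster this would submit a job and print the job config table:
# runner.invoke(cli, ["launch", "llama-2", "--model-variant", "7b"])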
94 changes: 94 additions & 0 deletions vec_inf/cli/_utils.py
@@ -0,0 +1,94 @@
import subprocess
import os

import requests
from rich.table import Table


MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"


def run_bash_command(command: str) -> str:
    """
    Run a bash command and return the output
    """
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    stdout, _ = process.communicate()
    return stdout


def get_model_dir(slurm_job_name: str) -> str:
    """
    Get the directory of a model
    """
    models_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
        "models"
    )
    model_dir = ""
    # The Slurm job name embeds the model family, which matches a directory name
    for dir_name in os.listdir(models_dir):
        if dir_name in slurm_job_name.lower():
            model_dir = os.path.join(models_dir, dir_name)
            break
    return model_dir


def is_server_running(slurm_job_name: str, slurm_job_id: int) -> str:
    """
    Check if a model is ready to serve requests
    """
    model_dir = get_model_dir(slurm_job_name)

    try:
        file_path = os.path.join(model_dir, f"{slurm_job_name}.{slurm_job_id}.err")
        with open(file_path, 'r') as file:
            lines = file.readlines()
    except FileNotFoundError:
        return "LAUNCHING"

    for line in lines:
        if MODEL_READY_SIGNATURE in line:
            return "RUNNING"
    return "LAUNCHING"


def get_base_url(slurm_job_name: str) -> str:
    """
    Get the base URL of a model
    """
    model_dir = get_model_dir(slurm_job_name)
    try:
        file_path = os.path.join(model_dir, f".{slurm_job_name}_url")
        with open(file_path, 'r') as file:
            lines = file.readlines()
    except FileNotFoundError:
        return "UNAVAILABLE"
    return lines[0].strip()


def model_health_check(slurm_job_name: str) -> str:
    """
    Check the health of a running model on the cluster
    """
    base_url = get_base_url(slurm_job_name)
    health_check_url = base_url.replace("v1", "health")

    try:
        response = requests.get(health_check_url)
        # Check if the request was successful
        if response.status_code == 200:
            return "READY"
        else:
            return "FAILED"
    except requests.exceptions.RequestException:
        return "FAILED"


def create_table(key_title: str = "", value_title: str = "", show_header: bool = True) -> Table:
    """
    Create a table for displaying model status
    """
    table = Table(show_header=show_header, header_style="bold magenta")
    table.add_column(key_title, style="dim")
    table.add_column(value_title)
    return table
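For reference, a minimal sketch of how create_table pairs with rich's Console, mirroring the rendering path used by the status command; the row values here are made-up placeholders:

from rich.console import Console

from vec_inf.cli._utils import create_table

console = Console()
table = create_table(key_title="Job Status", value_title="Value")
table.add_row("Model Name", "llama-2-7b")  # placeholder values
table.add_row("Model Status", "READY", style="blue")
table.add_row("Base URL", "http://gpu-node:8080/v1")
console.print(table)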
File renamed without changes.