feat: timer estimates GPU costs (OpenRouterTeam#60)
* feat: timer estimates GPU costs

* fix: errors captured by timer

* fix: better naming for cost per second constants

* chore: add fixme for async tracking
sambarnes authored Jan 24, 2024
1 parent 5d46cb3 commit 003dc83
Showing 4 changed files with 65 additions and 12 deletions.
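The net effect of the commit: `timer(...)` can now be given a model and a container type, and when a container type is present the structured log record includes an estimated GPU cost for the timed span. A minimal sketch of the new call shape and resulting record (the model name, duration, and `run_generation` helper here are made up for illustration; they are not from the diff):

    from shared.logging import timer
    from shared.protocol import ContainerType

    # Hypothetical call site mirroring the updated completion.py usage.
    with timer("runner.generate", "mistralai/mistral-7b", ContainerType.VllmContainerA100_40G):
        run_generation()  # placeholder for the timed work

    # For a 12.3 s span on an A100-40G container, the log's `extra` would be roughly:
    #   {"duration": 12.3, "model": "mistralai/mistral-7b",
    #    "container_type": "VllmContainerA100_40G",
    #    "gpu_cost": 12.3 * 0.001036 = 0.01274}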
3 changes: 1 addition & 2 deletions modal/runner/endpoints/completion.py
@@ -44,7 +44,6 @@ def completion(
     )
 
     runner = get_container(model_path, container_type)
-    tags = {"model": str(model_path), "container_type": container_type.value}
 
     stats = runner.generate.get_current_stats()
     logger.info(stats)
@@ -88,7 +87,7 @@ def completion(
         return create_error_response(status.HTTP_400_BAD_REQUEST, str(e))
 
     async def generate():
-        with timer("runner.generate", tags=tags):
+        with timer("runner.generate", str(model_path), container_type):
             async for text in runner.generate.remote_gen.aio(
                 payload, sampling_params
             ):
4 changes: 2 additions & 2 deletions modal/runner/engines/vllm.py
@@ -44,7 +44,7 @@ class VllmParams(BaseModel):
 
 class VllmEngine(BaseEngine):
     def __init__(self, params: VllmParams):
-        with timer("imports"):
+        with timer("imports", model=params.model):
             from vllm.engine.arg_utils import AsyncEngineArgs
             from vllm.engine.async_llm_engine import AsyncLLMEngine
 
@@ -53,7 +53,7 @@ def __init__(self, params: VllmParams):
             disable_log_requests=True,
         )
 
-        with timer("engine init", tags={"model": self.engine_args.model}):
+        with timer("engine init", model=self.engine_args.model):
            self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)
 
     # @method()
41 changes: 34 additions & 7 deletions modal/shared/logging.py
@@ -13,6 +13,8 @@
 from datadog_api_client.v2.model.http_log_item import HTTPLogItem
 from modal import Image, Secret
 
+from shared.protocol import ContainerType
+
 sentry_sdk.init(
     dsn=os.environ.get("SENTRY_DSN"),
     environment=os.environ.get("SENTRY_ENVIRONMENT") or "development",
@@ -34,14 +36,39 @@ def add_observability(image: Image):
 
 
 @contextmanager
-def timer(action: str, tags: dict[str, str | int] = None) -> None:
-    """A simple timer context manager with structured logging for its output."""
-    start = time.perf_counter()
-    yield
-    elapsed = time.perf_counter() - start
+def timer(
+    action: str,
+    model: str = None,
+    container_type: ContainerType = None,
+    tags: dict[str, str | int] = None,
+) -> None:
+    """
+    A simple timer context manager with structured logging for its output.
+
+    Args:
+        action: The noun being timed
+        model: Optional, used as a tag
+        container_type: Optional, used as a tag and to estimate GPU cost
+        tags: Any additional tags to include in the structured log
+    """
+    start = time.perf_counter()
+    try:
+        yield
+    finally:
+        # FIXME: this block doesn't seem to execute when an async function
+        # is called from within the context manager. Look into making an
+        # async variant.
+
+        elapsed = time.perf_counter() - start
 
-    extra = (tags or {}) | {"duration": elapsed}
-    logging.info(f"{action} execution profiled", extra=extra)
+        extra = (tags or {}) | {"duration": elapsed}
+        if model:
+            extra["model"] = model
+        if container_type:
+            extra["container_type"] = container_type.value
+            extra["gpu_cost"] = elapsed * container_type.gpu_cost_per_second
+
+        logging.info(f"{action} execution profiled", extra=extra)
 
 
 # skip natural LogRecord attributes
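The FIXME above flags that the `finally` block may not run when the timed body hands control to an async function. A minimal sketch of the async variant it suggests (an assumption, not part of this commit; `atimer` is a hypothetical name mirroring the logging shape of `timer`):

    import logging
    import time
    from contextlib import asynccontextmanager

    @asynccontextmanager
    async def atimer(action: str, tags: dict[str, str | int] = None):
        # Hypothetical async counterpart to timer(); entered with
        # `async with`, so finally runs once the awaited body finishes.
        start = time.perf_counter()
        try:
            yield
        finally:
            elapsed = time.perf_counter() - start
            extra = (tags or {}) | {"duration": elapsed}
            logging.info(f"{action} execution profiled", extra=extra)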
29 changes: 28 additions & 1 deletion modal/shared/protocol.py
@@ -1,9 +1,12 @@
 from enum import Enum
-from typing import List, Optional, Union
+from typing import Final, List, Optional, Union
 
 from fastapi.responses import JSONResponse, PlainTextResponse
 from pydantic import BaseModel
 
+_COST_PER_SECOND_A100_40G: Final[float] = 0.001036
+_COST_PER_SECOND_A100_80G: Final[float] = 0.001553
+
 
 class ContainerType(Enum):
     VllmContainer_7B = "VllmContainer_7B"
@@ -16,6 +19,30 @@ class ContainerType(Enum):
     VllmContainerA100_160G = "VllmContainerA100_160G"
     VllmContainerA100_160G_Isolated = "VllmContainerA100_160G_Isolated"
 
+    @property
+    def gpu_cost_per_second(self) -> float:
+        """
+        Returns:
+            The quoted GPU compute cost per second for the container,
+            as found on https://modal.com/pricing
+        """
+
+        # TODO: might be better to put this on the container class itself,
+        # but this is good enough(tm) for now
+        match self:
+            case ContainerType.VllmContainer_7B:
+                return _COST_PER_SECOND_A100_40G * 1
+            case ContainerType.VllmContainerA100_40G:
+                return _COST_PER_SECOND_A100_40G * 1
+            case ContainerType.VllmContainerA100_80G:
+                return _COST_PER_SECOND_A100_80G * 1
+            case ContainerType.VllmContainerA100_80G_32K:
+                return _COST_PER_SECOND_A100_80G * 1
+            case ContainerType.VllmContainerA100_160G:
+                return _COST_PER_SECOND_A100_80G * 2
+            case ContainerType.VllmContainerA100_160G_Isolated:
+                return _COST_PER_SECOND_A100_80G * 2
+
 
 # https://github.com/vllm-project/vllm/blob/320a622ec4d098f2da5d097930f4031517e7327b/vllm/sampling_params.py#L7-L52
 # Lines were sorted for consistency
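As a quick sanity check on these rates (a worked illustration, not from the commit): a container backed by two A100-80G GPUs bills at 2 × $0.001553 = $0.003106 per second, so a 30-second generation on VllmContainerA100_160G would log a gpu_cost of about $0.093.

    # Hypothetical worked example using the constants above:
    elapsed = 30.0                          # seconds spent in the timed block
    rate = _COST_PER_SECOND_A100_80G * 2    # VllmContainerA100_160G -> $0.003106/s
    gpu_cost = elapsed * rate               # ~= $0.0932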