feat: timer estimates GPU costs (OpenRouterTeam#60)
* feat: timer estimates GPU costs

* fix: errors captured by timer

* fix: better naming for cost per second constants

* chore: add fixme for async tracking
sambarnes authored Jan 24, 2024
1 parent 5d46cb3 commit 003dc83
Showing 4 changed files with 65 additions and 12 deletions.
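The net effect of the commit: `timer(...)` can now be given a model and a container type, and when a container type is present the structured log record includes an estimated GPU cost for the timed span. A minimal sketch of the new call shape and resulting record (the model name, duration, and `run_generation` helper here are made up for illustration; they are not from the diff):

    from shared.logging import timer
    from shared.protocol import ContainerType

    # Hypothetical call site mirroring the updated completion.py usage.
    with timer("runner.generate", "mistralai/mistral-7b", ContainerType.VllmContainerA100_40G):
        run_generation()  # placeholder for the timed work

    # For a 12.3 s span on an A100-40G container, the log's `extra` would be roughly:
    #   {"duration": 12.3, "model": "mistralai/mistral-7b",
    #    "container_type": "VllmContainerA100_40G",
    #    "gpu_cost": 12.3 * 0.001036 = 0.01274}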
3 changes: 1 addition & 2 deletions modal/runner/endpoints/completion.py
@@ -44,7 +44,6 @@ def completion(
     )
 
     runner = get_container(model_path, container_type)
-    tags = {"model": str(model_path), "container_type": container_type.value}
 
     stats = runner.generate.get_current_stats()
     logger.info(stats)
@@ -88,7 +87,7 @@ def completion(
         return create_error_response(status.HTTP_400_BAD_REQUEST, str(e))
 
     async def generate():
-        with timer("runner.generate", tags=tags):
+        with timer("runner.generate", str(model_path), container_type):
             async for text in runner.generate.remote_gen.aio(
                 payload, sampling_params
             ):
4 changes: 2 additions & 2 deletions modal/runner/engines/vllm.py
@@ -44,7 +44,7 @@ class VllmParams(BaseModel):
 
 class VllmEngine(BaseEngine):
     def __init__(self, params: VllmParams):
-        with timer("imports"):
+        with timer("imports", model=params.model):
             from vllm.engine.arg_utils import AsyncEngineArgs
             from vllm.engine.async_llm_engine import AsyncLLMEngine
 
@@ -53,7 +53,7 @@ def __init__(self, params: VllmParams):
             disable_log_requests=True,
         )
 
-        with timer("engine init", tags={"model": self.engine_args.model}):
+        with timer("engine init", model=self.engine_args.model):
            self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)
 
     # @method()
41 changes: 34 additions & 7 deletions modal/shared/logging.py
@@ -13,6 +13,8 @@
 from datadog_api_client.v2.model.http_log_item import HTTPLogItem
 from modal import Image, Secret
 
+from shared.protocol import ContainerType
+
 sentry_sdk.init(
     dsn=os.environ.get("SENTRY_DSN"),
     environment=os.environ.get("SENTRY_ENVIRONMENT") or "development",
@@ -34,14 +36,39 @@ def add_observability(image: Image):
 
 
 @contextmanager
-def timer(action: str, tags: dict[str, str | int] = None) -> None:
-    """A simple timer context manager with structured logging for its output."""
-    start = time.perf_counter()
-    yield
-    elapsed = time.perf_counter() - start
+def timer(
+    action: str,
+    model: str = None,
+    container_type: ContainerType = None,
+    tags: dict[str, str | int] = None,
+) -> None:
+    """
+    A simple timer context manager with structured logging for its output.
+
+    Args:
+        action: The noun being timed
+        model: Optional, used as a tag
+        container_type: Optional, used as a tag and to estimate GPU cost
+        tags: Any additional tags to include in the structured log
+    """
+    start = time.perf_counter()
+    try:
+        yield
+    finally:
+        # FIXME: this block doesn't seem to execute when an async function
+        # is called from within the context manager. Look into making an
+        # async variant.
+
+        elapsed = time.perf_counter() - start
 
-    extra = (tags or {}) | {"duration": elapsed}
-    logging.info(f"{action} execution profiled", extra=extra)
+        extra = (tags or {}) | {"duration": elapsed}
+        if model:
+            extra["model"] = model
+        if container_type:
+            extra["container_type"] = container_type.value
+            extra["gpu_cost"] = elapsed * container_type.gpu_cost_per_second
+
+        logging.info(f"{action} execution profiled", extra=extra)
 
 
 # skip natural LogRecord attributes
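The FIXME above flags that the `finally` block may not run when the timed body hands control to an async function. A minimal sketch of the async variant it suggests (an assumption, not part of this commit; `atimer` is a hypothetical name mirroring the logging shape of `timer`):

    import logging
    import time
    from contextlib import asynccontextmanager

    @asynccontextmanager
    async def atimer(action: str, tags: dict[str, str | int] = None):
        # Hypothetical async counterpart to timer(); entered with
        # `async with`, so finally runs once the awaited body finishes.
        start = time.perf_counter()
        try:
            yield
        finally:
            elapsed = time.perf_counter() - start
            extra = (tags or {}) | {"duration": elapsed}
            logging.info(f"{action} execution profiled", extra=extra)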
29 changes: 28 additions & 1 deletion modal/shared/protocol.py
@@ -1,9 +1,12 @@
 from enum import Enum
-from typing import List, Optional, Union
+from typing import Final, List, Optional, Union
 
 from fastapi.responses import JSONResponse, PlainTextResponse
 from pydantic import BaseModel
 
+_COST_PER_SECOND_A100_40G: Final[float] = 0.001036
+_COST_PER_SECOND_A100_80G: Final[float] = 0.001553
+
 
 class ContainerType(Enum):
     VllmContainer_7B = "VllmContainer_7B"
@@ -16,6 +19,30 @@ class ContainerType(Enum):
     VllmContainerA100_160G = "VllmContainerA100_160G"
     VllmContainerA100_160G_Isolated = "VllmContainerA100_160G_Isolated"
 
+    @property
+    def gpu_cost_per_second(self) -> float:
+        """
+        Returns:
+            The quoted GPU compute cost per second for the container,
+            as found on https://modal.com/pricing
+        """
+
+        # TODO: might be better to put this on the container class itself,
+        # but this is good enough(tm) for now
+        match self:
+            case ContainerType.VllmContainer_7B:
+                return _COST_PER_SECOND_A100_40G * 1
+            case ContainerType.VllmContainerA100_40G:
+                return _COST_PER_SECOND_A100_40G * 1
+            case ContainerType.VllmContainerA100_80G:
+                return _COST_PER_SECOND_A100_80G * 1
+            case ContainerType.VllmContainerA100_80G_32K:
+                return _COST_PER_SECOND_A100_80G * 1
+            case ContainerType.VllmContainerA100_160G:
+                return _COST_PER_SECOND_A100_80G * 2
+            case ContainerType.VllmContainerA100_160G_Isolated:
+                return _COST_PER_SECOND_A100_80G * 2
+
 
 # https://github.com/vllm-project/vllm/blob/320a622ec4d098f2da5d097930f4031517e7327b/vllm/sampling_params.py#L7-L52
 # Lines were sorted for consistency
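As a quick sanity check on these rates (a worked illustration, not from the commit): a container backed by two A100-80G GPUs bills at 2 × $0.001553 = $0.003106 per second, so a 30-second generation on VllmContainerA100_160G would log a gpu_cost of about $0.093.

    # Hypothetical worked example using the constants above:
    elapsed = 30.0                          # seconds spent in the timed block
    rate = _COST_PER_SECOND_A100_80G * 2    # VllmContainerA100_160G -> $0.003106/s
    gpu_cost = elapsed * rate               # ~= $0.0932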