From 57e5c5ff9bfefc69003c608981f2e702b0507e65 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Tue, 24 Feb 2026 14:14:34 -0500
Subject: [PATCH 1/2] feat(modal): add inference serving with call_inference
 API

- Add _build_inference_app() for Modal GPU inference with PEFT adapter
- Add upload_adapter_to_volume() for uploading adapters to Modal volume
- Add call_inference() as the primary API for remote inference
- Add 'serve' CLI command for interactive model serving
- Container caches model in memory across calls (container_idle_timeout=600)
- Support --no-adapter for zero-shot base model serving

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 openadapt_ml/cloud/modal_cloud.py | 345 +++++++++++++++++++++++++++++-
 tests/test_modal_cloud.py         | 112 ++++++++++
 2 files changed, 456 insertions(+), 1 deletion(-)

diff --git a/openadapt_ml/cloud/modal_cloud.py b/openadapt_ml/cloud/modal_cloud.py
index d9f452c..b569b46 100644
--- a/openadapt_ml/cloud/modal_cloud.py
+++ b/openadapt_ml/cloud/modal_cloud.py
@@ -1,4 +1,4 @@
-"""Modal cloud GPU integration for training.
+"""Modal cloud GPU integration for training and inference.
 
 Modal is a Python-native serverless cloud platform:
 - No SSH, no instances to manage
@@ -25,6 +25,11 @@
     # Download results
     python -m openadapt_ml.cloud.modal_cloud download --output ./results
 
+    # Serve fine-tuned model for inference
+    python -m openadapt_ml.cloud.modal_cloud serve \
+        --adapter /path/to/adapter \
+        --base-model Qwen/Qwen3-VL-2B-Instruct
+
     # List volumes
     python -m openadapt_ml.cloud.modal_cloud list-volumes
 """
@@ -255,6 +260,224 @@ def train_model(
     return train_model
 
 
+# ---------------------------------------------------------------------------
+# Inference serving
+# ---------------------------------------------------------------------------
+
+INFERENCE_APP_NAME = "openadapt-inference"
+
+
+def _build_inference_app(
+    adapter_path: str | None = None,
+    base_model: str = "Qwen/Qwen3-VL-2B-Instruct",
+    gpu: str = "A10G",
+):
+    """Build the Modal app and the GPU function used for inference.
+
+    Args:
+        adapter_path: Volume path of the PEFT adapter; None/"" = base model only.
+        base_model: HuggingFace model ID for the base model and its processor.
+        gpu: GPU type passed to ``app.function``.
+
+    Returns:
+        (app, infer_fn) - the app and the inference function handle.
+    """
+    modal = _get_modal()
+
+    app = modal.App(INFERENCE_APP_NAME)
+    volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
+
+    inference_image = modal.Image.debian_slim(python_version="3.12").pip_install(
+        "torch",
+        "transformers",
+        "peft",
+        "accelerate",
+        "pillow",
+        "qwen-vl-utils",
+    )
+
+    vol = volume  # the three aliases below are captured by infer()'s closure
+    _adapter = adapter_path
+    _base = base_model
+
+    @app.function(
+        gpu=gpu,
+        image=inference_image,
+        volumes={VOLUME_MOUNT: vol},
+        timeout=300,
+        serialized=True,
+        container_idle_timeout=600,  # NOTE(review): newer Modal may call this scaledown_window - confirm
+    )
+    def infer(
+        messages_json: str,
+        image_base64: str | None = None,
+        max_new_tokens: int = 512,
+    ) -> str:
+        """Generate one reply from the base model (plus the adapter, if any).
+
+        Args:
+            messages_json: JSON-encoded list of messages (OpenAI chat format).
+            image_base64: Base64-encoded image; decoded and converted to RGB.
+            max_new_tokens: Maximum tokens to generate.
+
+        Returns:
+            JSON string whose 'response' key holds the stripped model output.
+        """
+        import base64 as _base64
+        import json as _json
+        from io import BytesIO as _BytesIO
+
+        import torch
+        from PIL import Image as _Image
+        from transformers import AutoModelForVision2Seq, AutoProcessor
+
+        # Load once and keep on infer.* (NOTE(review): verify reuse across calls)
+        if not hasattr(infer, "_model"):
+            print(f"Loading base model: {_base}")
+            infer._model = AutoModelForVision2Seq.from_pretrained(
+                _base,
+                torch_dtype=torch.bfloat16,
+                device_map="auto",
+            )
+
+            if _adapter:
+                from peft import PeftModel
+
+                print(f"Loading PEFT adapter: {_adapter}")
+                vol.reload()  # presumably refreshes the mounted volume - confirm
+                infer._model = PeftModel.from_pretrained(infer._model, _adapter)
+
+            infer._processor = AutoProcessor.from_pretrained(_base)
+            print("Model ready for inference")
+
+        messages = _json.loads(messages_json)
+
+        # Optional image: base64 text -> raw bytes -> RGB PIL image
+        image = None
+        if image_base64:
+            img_bytes = _base64.b64decode(image_base64)
+            image = _Image.open(_BytesIO(img_bytes)).convert("RGB")
+
+        # Render the chat messages into one prompt string (not tokenized yet)
+        text = infer._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        if image is not None:
+            inputs = infer._processor(
+                text=[text], images=[image], return_tensors="pt", padding=True
+            )
+        else:
+            inputs = infer._processor(
+                text=[text], return_tensors="pt", padding=True
+            )
+
+        inputs = inputs.to(infer._model.device)
+
+        with torch.no_grad():
+            output_ids = infer._model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                do_sample=False,  # no sampling
+            )
+
+        # Slice off the prompt tokens so only newly generated ids are decoded
+        generated_ids = output_ids[:, inputs["input_ids"].shape[1] :]
+        response_text = infer._processor.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )[0]
+
+        return _json.dumps({"response": response_text.strip()})
+
+    return app, infer
+
+
+def upload_adapter_to_volume(adapter_dir: str | Path) -> str:
+    """Copy a local PEFT adapter directory into the Modal volume.
+
+    Args:
+        adapter_dir: Local directory containing ``adapter_config.json``.
+
+    Returns:
+        The adapter's path as seen from a container: VOLUME_MOUNT + "/adapter".
+
+    Raises:
+        FileNotFoundError: The directory or its ``adapter_config.json`` is missing.
+        RuntimeError: ``modal volume put`` exited with a non-zero status.
+    """
+    local_dir = Path(adapter_dir)
+
+    # Validate locally first so a bad path fails before any Modal CLI call.
+    if not local_dir.exists():
+        raise FileNotFoundError(f"Adapter not found: {local_dir}")
+    config_file = local_dir / "adapter_config.json"
+    if not config_file.exists():
+        raise FileNotFoundError(f"No adapter_config.json in: {local_dir}")
+
+    volume_subdir = "/adapter"
+    run_opts = {"capture_output": True, "text": True}
+
+    # Create volume if needed; the exit status of this call is not checked.
+    subprocess.run(["modal", "volume", "create", VOLUME_NAME], **run_opts)
+
+    put_args = ["put", VOLUME_NAME, str(local_dir), volume_subdir, "--force"]
+    upload = subprocess.run(["modal", "volume", *put_args], **run_opts)
+    if upload.returncode != 0:
+        raise RuntimeError(f"Adapter upload failed: {upload.stderr or upload.stdout}")
+
+    # Containers mount the volume at VOLUME_MOUNT, so prefix the subdirectory.
+    mounted_path = f"{VOLUME_MOUNT}{volume_subdir}"
+    print(f"Adapter uploaded to volume at: {mounted_path}")
+    return mounted_path
+
+
+def call_inference(
+    messages: list[dict],
+    image_base64: str | None = None,
+    max_new_tokens: int = 512,
+    adapter_path: str | None = None,
+    base_model: str = "Qwen/Qwen3-VL-2B-Instruct",
+    gpu: str = "A10G",
+) -> str:
+    """Run one inference request on Modal and return the model's text.
+
+    This is the primary API for external callers (e.g., Qwen3VLAgent).
+    Every call builds the inference app and runs it only for the duration
+    of a single remote request.
+
+    Args:
+        messages: Chat messages in OpenAI format (must be JSON-serializable).
+        image_base64: Base64-encoded image string, or None for text only.
+        max_new_tokens: Maximum tokens to generate.
+        adapter_path: Remote adapter path in the volume (None: base model only).
+        base_model: HuggingFace model ID for the base model.
+        gpu: GPU type.
+
+    Returns:
+        Model response text ("" if the reply carries no 'response' key).
+    """
+    modal = _get_modal()
+
+    app, infer_fn = _build_inference_app(
+        adapter_path=adapter_path,
+        base_model=base_model,
+        gpu=gpu,
+    )
+
+    messages_json = json.dumps(messages)
+
+    # enable_output() is a context manager; calling it bare streams no logs.
+    with modal.enable_output(), app.run():
+        result_json = infer_fn.remote(
+            messages_json=messages_json,
+            image_base64=image_base64,
+            max_new_tokens=max_new_tokens,
+        )
+
+    result = json.loads(result_json)
+    return result.get("response", "")
+
+
 # ---------------------------------------------------------------------------
 # Local helpers for CLI commands
 # ---------------------------------------------------------------------------
@@ -462,6 +685,34 @@ def cli_main(argv: list[str] | None = None) -> int:
         help="Local output directory (default: training_output/modal)",
     )
 
+    # --- serve ---
+    serve_parser = subparsers.add_parser(
+        "serve", help="Serve fine-tuned model for inference on Modal GPU"
+    )
+    serve_parser.add_argument(
+        "--adapter",
+        help="Local adapter directory to upload and serve",
+    )
+    serve_parser.add_argument(
+        "--adapter-remote",
+        help="Remote adapter path already in the volume (e.g., /training/results/final)",
+    )
+    serve_parser.add_argument(
+        "--base-model",
+        default="Qwen/Qwen3-VL-2B-Instruct",
+        help="Base model HuggingFace ID (default: Qwen/Qwen3-VL-2B-Instruct)",
+    )
+    serve_parser.add_argument(
+        "--gpu",
+        default="A10G",
+        help="GPU type (default: A10G)",
+    )
+    serve_parser.add_argument(
+        "--no-adapter",
+        action="store_true",
+        help="Serve base model without adapter (zero-shot)",
+    )
+
     # --- list-volumes ---
     subparsers.add_parser("list-volumes", help="List Modal volumes")
 
@@ -477,6 +728,8 @@ def cli_main(argv: list[str] | None = None) -> int:
         return _cmd_status(args)
     elif args.command == "download":
         return _cmd_download(args)
+    elif args.command == "serve":
+        return _cmd_serve(args)
     elif args.command == "list-volumes":
         return _cmd_list_volumes(args)
     else:
@@ -626,6 +879,96 @@ def _cmd_download(args: argparse.Namespace) -> int:
         return 1
 
 
+def _cmd_serve(args: argparse.Namespace) -> int:
+    """Serve a fine-tuned model on Modal GPU for inference.
+
+    Adapter choice, in priority order: --no-adapter, --adapter (uploaded
+    first), --adapter-remote, else RESULTS_REMOTE_PATH/final. The app is run,
+    one warm-up request is sent, then the process idles until Ctrl+C.
+
+    Returns 0 on a clean shutdown, 1 if the upload or the Modal run failed.
+    """
+    modal = _get_modal()
+    adapter_remote = None
+
+    if args.no_adapter:
+        print(f"Serving base model: {args.base_model} (no adapter)")
+    elif args.adapter:
+        # Upload local adapter to volume
+        print("Uploading adapter to Modal volume...")
+        try:
+            adapter_remote = upload_adapter_to_volume(args.adapter)
+        except (FileNotFoundError, RuntimeError) as e:
+            print(f"Error: {e}")
+            return 1
+    elif args.adapter_remote:
+        adapter_remote = args.adapter_remote
+        print(f"Using remote adapter: {adapter_remote}")
+    else:
+        # Default: use the latest training results
+        adapter_remote = f"{RESULTS_REMOTE_PATH}/final"
+        print(f"Using default adapter: {adapter_remote}")
+
+    print(f"Base model: {args.base_model}")
+    print(f"GPU: {args.gpu}")
+    print()
+
+    try:
+        app, infer_fn = _build_inference_app(
+            adapter_path=adapter_remote,
+            base_model=args.base_model,
+            gpu=args.gpu,
+        )
+
+        print("Starting inference server on Modal...")
+        print("Press Ctrl+C to stop.\n")
+
+        # enable_output() is a context manager; calling it bare streams no logs.
+        with modal.enable_output(), app.run():
+            # Test with a simple warmup call
+            test_messages = json.dumps(
+                [
+                    {
+                        "role": "system",
+                        "content": "You are a GUI automation agent.",
+                    },
+                    {
+                        "role": "user",
+                        "content": "Respond with: ready",
+                    },
+                ]
+            )
+            result = infer_fn.remote(messages_json=test_messages)
+            result_data = json.loads(result)
+            print(f"Model ready. Test response: {result_data.get('response', '')}")
+            print()
+            print("=" * 50)
+            print("INFERENCE SERVER RUNNING")
+            print("=" * 50)
+            print()
+            print(
+                "To run inference from another process, use:\n"
+                "  from openadapt_ml.cloud.modal_cloud import call_inference\n"
+                "  result = call_inference(messages, image_base64)\n"
+            )
+            print("Or use Qwen3VLAgent with --model-endpoint modal\n")
+
+            # Keep the app running until Ctrl+C
+            import time as _time
+
+            try:
+                while True:
+                    _time.sleep(1)
+            except KeyboardInterrupt:
+                print("\nShutting down inference server...")
+
+    except Exception as e:
+        print(f"Serve failed: {e}")
+        return 1
+
+    return 0
+
+
 def _cmd_list_volumes(args: argparse.Namespace) -> int:
     """List Modal volumes."""
     list_volumes()
diff --git a/tests/test_modal_cloud.py b/tests/test_modal_cloud.py
index 9e92eed..cf0641b 100644
--- a/tests/test_modal_cloud.py
+++ b/tests/test_modal_cloud.py
@@ -418,7 +418,119 @@ def test_constants(self):
         from openadapt_ml.cloud import modal_cloud
 
         assert modal_cloud.MODAL_APP_NAME == "openadapt-training"
+        assert modal_cloud.INFERENCE_APP_NAME == "openadapt-inference"
         assert modal_cloud.VOLUME_NAME == "openadapt-training-data"
         assert modal_cloud.VOLUME_MOUNT == "/training"
         assert modal_cloud.BUNDLE_REMOTE_PATH == "/training/bundle"
         assert modal_cloud.RESULTS_REMOTE_PATH == "/training/results"
+
+
+# ---------------------------------------------------------------------------
+# Serve / inference tests
+# ---------------------------------------------------------------------------
+
+
+class TestServeCLI:
+    """Tests for the ``serve`` subcommand of the modal_cloud CLI."""
+
+    def test_serve_help(self):
+        """Test that serve --help exits cleanly."""
+        from openadapt_ml.cloud.modal_cloud import cli_main
+
+        with pytest.raises(SystemExit) as exc_info:
+            cli_main(["serve", "--help"])
+        assert exc_info.value.code == 0
+
+    def test_serve_no_adapter_flag(self):
+        """Test that serve --no-adapter uses base model only.
+
+        Modal and the app builder are replaced with mocks, so nothing is
+        deployed; only the command's wiring and exit code are checked.
+        """
+        from openadapt_ml.cloud.modal_cloud import cli_main
+
+        # MagicMock creates attributes on demand, so modal.enable_output
+        # needs no explicit stub.
+        mock_modal = MagicMock()
+
+        mock_app = MagicMock()
+        mock_app.run.return_value.__enter__ = MagicMock()
+        mock_app.run.return_value.__exit__ = MagicMock(return_value=False)
+
+        # Return real JSON text so the module's own json handling is exercised
+        # instead of being mocked away.
+        mock_infer = MagicMock()
+        mock_infer.remote.return_value = '{"response": "ready"}'
+
+        # The serve command sleeps in a loop until interrupted: the first
+        # sleep returns normally, the second one raises KeyboardInterrupt.
+        sleep_outcomes = [None, KeyboardInterrupt()]
+
+        with (
+            patch(
+                "openadapt_ml.cloud.modal_cloud._get_modal", return_value=mock_modal
+            ),
+            patch(
+                "openadapt_ml.cloud.modal_cloud._build_inference_app",
+                return_value=(mock_app, mock_infer),
+            ) as mock_build,
+            patch("time.sleep", side_effect=sleep_outcomes),
+        ):
+            result = cli_main(["serve", "--no-adapter"])
+
+        # Should have called _build_inference_app with no adapter
+        mock_build.assert_called_once_with(
+            adapter_path=None,
+            base_model="Qwen/Qwen3-VL-2B-Instruct",
+            gpu="A10G",
+        )
+
+        # Exactly one warm-up request is sent before the keep-alive loop.
+        mock_infer.remote.assert_called_once()
+        assert result == 0
+
+
+class TestUploadAdapter:
+    """Checks for upload_adapter_to_volume()."""
+
+    def test_upload_missing_adapter_raises(self):
+        """A path that does not exist is rejected with FileNotFoundError."""
+        from openadapt_ml.cloud.modal_cloud import upload_adapter_to_volume
+
+        with pytest.raises(FileNotFoundError, match="Adapter not found"):
+            upload_adapter_to_volume("/nonexistent/path")
+
+    def test_upload_adapter_without_config_raises(self):
+        """A directory lacking adapter_config.json is rejected."""
+        from openadapt_ml.cloud.modal_cloud import upload_adapter_to_volume
+
+        with (
+            tempfile.TemporaryDirectory() as empty_dir,
+            pytest.raises(FileNotFoundError, match="No adapter_config.json"),
+        ):
+            upload_adapter_to_volume(empty_dir)
+
+    def test_upload_adapter_calls_volume_put(self):
+        """A valid adapter directory leads to two CLI calls, the second a put."""
+        from openadapt_ml.cloud.modal_cloud import upload_adapter_to_volume
+
+        # Stand-in for the result object of a successful subprocess.run().
+        cli_ok = MagicMock(returncode=0, stdout="", stderr="")
+
+        with tempfile.TemporaryDirectory() as adapter_root:
+            root = Path(adapter_root)
+            (root / "adapter_config.json").write_text('{"test": true}')
+            (root / "adapter_model.safetensors").write_text("fake")
+
+            with patch(
+                "openadapt_ml.cloud.modal_cloud.subprocess.run",
+                return_value=cli_ok,
+            ) as fake_run:
+                remote = upload_adapter_to_volume(adapter_root)
+
+        # Two calls: create + put
+        assert fake_run.call_count == 2
+        put_cmd = fake_run.call_args_list[1].args[0]
+        for token in ("put", "/adapter", "--force"):
+            assert token in put_cmd
+        assert remote == "/training/adapter"

From e42ed755f1fd7517ae381de52e95742614225673 Mon Sep 17 00:00:00 2001
From: semantic-release <semantic-release>
Date: Tue, 24 Feb 2026 19:16:06 +0000
Subject: [PATCH 2/2] chore: release 0.11.0

---
 CHANGELOG.md   | 16 ++++++++++++++++
 pyproject.toml |  2 +-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 113f7f9..fa01cc5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,22 @@
 # CHANGELOG
 
 
+## v0.11.0 (2026-02-24)
+
+### Features
+
+- **modal**: Add inference serving with call_inference API
+  ([`57e5c5f`](https://github.com/OpenAdaptAI/openadapt-ml/commit/57e5c5ff9bfefc69003c608981f2e702b0507e65))
+  - Add _build_inference_app() for Modal GPU inference with PEFT adapter
+  - Add upload_adapter_to_volume() for uploading adapters to Modal volume
+  - Add call_inference() as the primary API for remote inference
+  - Add 'serve' CLI command for interactive model serving
+  - Container caches model in memory across calls (container_idle_timeout=600)
+  - Support --no-adapter for zero-shot base model serving
+
+Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
+
+
 ## v0.10.1 (2026-02-24)
 
 ### Bug Fixes
diff --git a/pyproject.toml b/pyproject.toml
index 3e67344..4283314 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "openadapt-ml"
-version = "0.10.1"
+version = "0.11.0"
 description = "Model-agnostic, domain-agnostic ML engine for GUI automation agents"
 readme = "README.md"
 requires-python = ">=3.10"