From 57e5c5ff9bfefc69003c608981f2e702b0507e65 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Tue, 24 Feb 2026 14:14:34 -0500 Subject: [PATCH 1/2] feat(modal): add inference serving with call_inference API - Add _build_inference_app() for Modal GPU inference with PEFT adapter - Add upload_adapter_to_volume() for uploading adapters to Modal volume - Add call_inference() as the primary API for remote inference - Add 'serve' CLI command for interactive model serving - Container caches model in memory across calls (container_idle_timeout=600) - Support --no-adapter for zero-shot base model serving Co-Authored-By: Claude Opus 4.6 --- openadapt_ml/cloud/modal_cloud.py | 345 +++++++++++++++++++++++++++++- tests/test_modal_cloud.py | 112 ++++++++++ 2 files changed, 456 insertions(+), 1 deletion(-) diff --git a/openadapt_ml/cloud/modal_cloud.py b/openadapt_ml/cloud/modal_cloud.py index d9f452c..b569b46 100644 --- a/openadapt_ml/cloud/modal_cloud.py +++ b/openadapt_ml/cloud/modal_cloud.py @@ -1,4 +1,4 @@ -"""Modal cloud GPU integration for training. +"""Modal cloud GPU integration for training and inference. Modal is a Python-native serverless cloud platform: - No SSH, no instances to manage @@ -25,6 +25,11 @@ # Download results python -m openadapt_ml.cloud.modal_cloud download --output ./results + # Serve fine-tuned model for inference + python -m openadapt_ml.cloud.modal_cloud serve \ + --adapter /path/to/adapter \ + --base-model Qwen/Qwen3-VL-2B-Instruct + # List volumes python -m openadapt_ml.cloud.modal_cloud list-volumes """ @@ -255,6 +260,224 @@ def train_model( return train_model +# --------------------------------------------------------------------------- +# Inference serving +# --------------------------------------------------------------------------- + +INFERENCE_APP_NAME = "openadapt-inference" + + +def _build_inference_app( + adapter_path: str | None = None, + base_model: str = "Qwen/Qwen3-VL-2B-Instruct", + gpu: str = "A10G", +): + """Build Modal app for model inference. + + Args: + adapter_path: Path to PEFT adapter in the volume (e.g., /training/results/final). + base_model: HuggingFace model ID for the base model. + gpu: GPU type. + + Returns: + (app, infer_fn) - the app and the inference function handle. + """ + modal = _get_modal() + + app = modal.App(INFERENCE_APP_NAME) + volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True) + + inference_image = modal.Image.debian_slim(python_version="3.12").pip_install( + "torch", + "transformers", + "peft", + "accelerate", + "pillow", + "qwen-vl-utils", + ) + + vol = volume + _adapter = adapter_path + _base = base_model + + @app.function( + gpu=gpu, + image=inference_image, + volumes={VOLUME_MOUNT: vol}, + timeout=300, + serialized=True, + container_idle_timeout=600, + ) + def infer( + messages_json: str, + image_base64: str | None = None, + max_new_tokens: int = 512, + ) -> str: + """Run inference on the fine-tuned model. + + Args: + messages_json: JSON-encoded list of messages (OpenAI chat format). + image_base64: Base64-encoded screenshot image (optional). + max_new_tokens: Maximum tokens to generate. + + Returns: + JSON string with 'response' key containing model output. + """ + import base64 as _base64 + import json as _json + from io import BytesIO as _BytesIO + + import torch + from PIL import Image as _Image + from transformers import AutoModelForVision2Seq, AutoProcessor + + # Load model (cached in container memory across calls) + if not hasattr(infer, "_model"): + print(f"Loading base model: {_base}") + infer._model = AutoModelForVision2Seq.from_pretrained( + _base, + torch_dtype=torch.bfloat16, + device_map="auto", + ) + + if _adapter: + from peft import PeftModel + + print(f"Loading PEFT adapter: {_adapter}") + vol.reload() + infer._model = PeftModel.from_pretrained(infer._model, _adapter) + + infer._processor = AutoProcessor.from_pretrained(_base) + print("Model ready for inference") + + messages = _json.loads(messages_json) + + # If image_base64 is provided, decode it + image = None + if image_base64: + img_bytes = _base64.b64decode(image_base64) + image = _Image.open(_BytesIO(img_bytes)).convert("RGB") + + # Build inputs using the processor's chat template + text = infer._processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + if image is not None: + inputs = infer._processor( + text=[text], images=[image], return_tensors="pt", padding=True + ) + else: + inputs = infer._processor( + text=[text], return_tensors="pt", padding=True + ) + + inputs = inputs.to(infer._model.device) + + with torch.no_grad(): + output_ids = infer._model.generate( + **inputs, + max_new_tokens=max_new_tokens, + do_sample=False, + ) + + # Decode only the generated tokens (skip the input) + generated_ids = output_ids[:, inputs["input_ids"].shape[1] :] + response_text = infer._processor.batch_decode( + generated_ids, skip_special_tokens=True + )[0] + + return _json.dumps({"response": response_text.strip()}) + + return app, infer + + +def upload_adapter_to_volume(adapter_dir: str | Path) -> str: + """Upload a local PEFT adapter to the Modal volume. + + Args: + adapter_dir: Path to local adapter directory. + + Returns: + Remote path to the adapter in the volume. + """ + adapter_dir = Path(adapter_dir) + if not adapter_dir.exists(): + raise FileNotFoundError(f"Adapter not found: {adapter_dir}") + if not (adapter_dir / "adapter_config.json").exists(): + raise FileNotFoundError(f"No adapter_config.json in: {adapter_dir}") + + remote_path = "/adapter" + + # Create volume if needed + create_cmd = ["modal", "volume", "create", VOLUME_NAME] + subprocess.run(create_cmd, capture_output=True, text=True) + + cmd = [ + "modal", + "volume", + "put", + VOLUME_NAME, + str(adapter_dir), + remote_path, + "--force", + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"Adapter upload failed: {result.stderr or result.stdout}") + + full_remote = f"{VOLUME_MOUNT}{remote_path}" + print(f"Adapter uploaded to volume at: {full_remote}") + return full_remote + + +def call_inference( + messages: list[dict], + image_base64: str | None = None, + max_new_tokens: int = 512, + adapter_path: str | None = None, + base_model: str = "Qwen/Qwen3-VL-2B-Instruct", + gpu: str = "A10G", +) -> str: + """Call the Modal inference function remotely. + + This is the primary API for external callers (e.g., Qwen3VLAgent). + Builds and runs the Modal app, sends a single inference request, + and returns the model output. + + Args: + messages: Chat messages in OpenAI format. + image_base64: Base64-encoded image string. + max_new_tokens: Maximum tokens to generate. + adapter_path: Remote adapter path in the volume. + base_model: HuggingFace model ID for the base model. + gpu: GPU type. + + Returns: + Model response text. + """ + modal = _get_modal() + modal.enable_output() + + app, infer_fn = _build_inference_app( + adapter_path=adapter_path, + base_model=base_model, + gpu=gpu, + ) + + messages_json = json.dumps(messages) + + with app.run(): + result_json = infer_fn.remote( + messages_json=messages_json, + image_base64=image_base64, + max_new_tokens=max_new_tokens, + ) + + result = json.loads(result_json) + return result.get("response", "") + + # --------------------------------------------------------------------------- # Local helpers for CLI commands # --------------------------------------------------------------------------- @@ -462,6 +685,34 @@ def cli_main(argv: list[str] | None = None) -> int: help="Local output directory (default: training_output/modal)", ) + # --- serve --- + serve_parser = subparsers.add_parser( + "serve", help="Serve fine-tuned model for inference on Modal GPU" + ) + serve_parser.add_argument( + "--adapter", + help="Local adapter directory to upload and serve", + ) + serve_parser.add_argument( + "--adapter-remote", + help="Remote adapter path already in the volume (e.g., /training/results/final)", + ) + serve_parser.add_argument( + "--base-model", + default="Qwen/Qwen3-VL-2B-Instruct", + help="Base model HuggingFace ID (default: Qwen/Qwen3-VL-2B-Instruct)", + ) + serve_parser.add_argument( + "--gpu", + default="A10G", + help="GPU type (default: A10G)", + ) + serve_parser.add_argument( + "--no-adapter", + action="store_true", + help="Serve base model without adapter (zero-shot)", + ) + # --- list-volumes --- subparsers.add_parser("list-volumes", help="List Modal volumes") @@ -477,6 +728,8 @@ def cli_main(argv: list[str] | None = None) -> int: return _cmd_status(args) elif args.command == "download": return _cmd_download(args) + elif args.command == "serve": + return _cmd_serve(args) elif args.command == "list-volumes": return _cmd_list_volumes(args) else: @@ -626,6 +879,96 @@ def _cmd_download(args: argparse.Namespace) -> int: return 1 +def _cmd_serve(args: argparse.Namespace) -> int: + """Serve a fine-tuned model on Modal GPU for inference. + + Uploads the adapter (if local path provided), then starts the + inference function that clients can call via Modal's .remote() API. + Alternatively, clients can use the HTTP wrapper in Qwen3VLAgent. + """ + modal = _get_modal() + + adapter_remote = None + + if args.no_adapter: + print(f"Serving base model: {args.base_model} (no adapter)") + elif args.adapter: + # Upload local adapter to volume + print("Uploading adapter to Modal volume...") + try: + adapter_remote = upload_adapter_to_volume(args.adapter) + except (FileNotFoundError, RuntimeError) as e: + print(f"Error: {e}") + return 1 + elif args.adapter_remote: + adapter_remote = args.adapter_remote + print(f"Using remote adapter: {adapter_remote}") + else: + # Default: use the latest training results + adapter_remote = f"{RESULTS_REMOTE_PATH}/final" + print(f"Using default adapter: {adapter_remote}") + + print(f"Base model: {args.base_model}") + print(f"GPU: {args.gpu}") + print() + + try: + modal.enable_output() + + app, infer_fn = _build_inference_app( + adapter_path=adapter_remote, + base_model=args.base_model, + gpu=args.gpu, + ) + + print("Starting inference server on Modal...") + print("Press Ctrl+C to stop.\n") + + with app.run(): + # Test with a simple warmup call + test_messages = json.dumps( + [ + { + "role": "system", + "content": "You are a GUI automation agent.", + }, + { + "role": "user", + "content": "Respond with: ready", + }, + ] + ) + result = infer_fn.remote(messages_json=test_messages) + result_data = json.loads(result) + print(f"Model ready. Test response: {result_data.get('response', '')}") + print() + print("=" * 50) + print("INFERENCE SERVER RUNNING") + print("=" * 50) + print() + print( + "To run inference from another process, use:\n" + " from openadapt_ml.cloud.modal_cloud import call_inference\n" + " result = call_inference(messages, image_base64)\n" + ) + print("Or use Qwen3VLAgent with --model-endpoint modal\n") + + # Keep the app running until Ctrl+C + import time as _time + + try: + while True: + _time.sleep(1) + except KeyboardInterrupt: + print("\nShutting down inference server...") + + except Exception as e: + print(f"Serve failed: {e}") + return 1 + + return 0 + + def _cmd_list_volumes(args: argparse.Namespace) -> int: """List Modal volumes.""" list_volumes() diff --git a/tests/test_modal_cloud.py b/tests/test_modal_cloud.py index 9e92eed..cf0641b 100644 --- a/tests/test_modal_cloud.py +++ b/tests/test_modal_cloud.py @@ -418,7 +418,119 @@ def test_constants(self): from openadapt_ml.cloud import modal_cloud assert modal_cloud.MODAL_APP_NAME == "openadapt-training" + assert modal_cloud.INFERENCE_APP_NAME == "openadapt-inference" assert modal_cloud.VOLUME_NAME == "openadapt-training-data" assert modal_cloud.VOLUME_MOUNT == "/training" assert modal_cloud.BUNDLE_REMOTE_PATH == "/training/bundle" assert modal_cloud.RESULTS_REMOTE_PATH == "/training/results" + + +# --------------------------------------------------------------------------- +# Serve / inference tests +# --------------------------------------------------------------------------- + + +class TestServeCLI: + """Test serve command CLI parsing.""" + + def test_serve_help(self): + """Test that serve --help exits cleanly.""" + from openadapt_ml.cloud.modal_cloud import cli_main + + with pytest.raises(SystemExit) as exc_info: + cli_main(["serve", "--help"]) + assert exc_info.value.code == 0 + + def test_serve_no_adapter_flag(self): + """Test that serve --no-adapter uses base model only.""" + from openadapt_ml.cloud.modal_cloud import cli_main + + mock_modal = MagicMock() + mock_modal.enable_output = MagicMock() + + mock_app = MagicMock() + mock_infer = MagicMock() + + with ( + patch( + "openadapt_ml.cloud.modal_cloud._get_modal", return_value=mock_modal + ), + patch( + "openadapt_ml.cloud.modal_cloud._build_inference_app", + return_value=(mock_app, mock_infer), + ) as mock_build, + patch("openadapt_ml.cloud.modal_cloud.json") as mock_json, + ): + mock_json.dumps.return_value = '[]' + mock_json.loads.return_value = {"response": "ready"} + mock_infer.remote.return_value = '{"response": "ready"}' + mock_app.run.return_value.__enter__ = MagicMock() + mock_app.run.return_value.__exit__ = MagicMock(return_value=False) + + # The serve command blocks on the while loop, so we use KeyboardInterrupt + import time as _time + + original_sleep = _time.sleep + call_count = [0] + + def mock_sleep(t): + call_count[0] += 1 + if call_count[0] > 1: + raise KeyboardInterrupt() + + with patch("time.sleep", side_effect=mock_sleep): + result = cli_main(["serve", "--no-adapter"]) + + # Should have called _build_inference_app with no adapter + mock_build.assert_called_once_with( + adapter_path=None, + base_model="Qwen/Qwen3-VL-2B-Instruct", + gpu="A10G", + ) + assert result == 0 + + +class TestUploadAdapter: + """Test adapter upload logic.""" + + def test_upload_missing_adapter_raises(self): + """Test that uploading a non-existent adapter raises.""" + from openadapt_ml.cloud.modal_cloud import upload_adapter_to_volume + + with pytest.raises(FileNotFoundError, match="Adapter not found"): + upload_adapter_to_volume("/nonexistent/path") + + def test_upload_adapter_without_config_raises(self): + """Test that adapter without adapter_config.json raises.""" + from openadapt_ml.cloud.modal_cloud import upload_adapter_to_volume + + with tempfile.TemporaryDirectory() as tmpdir: + with pytest.raises(FileNotFoundError, match="No adapter_config.json"): + upload_adapter_to_volume(tmpdir) + + def test_upload_adapter_calls_volume_put(self): + """Test that upload invokes volume put with correct args.""" + from openadapt_ml.cloud.modal_cloud import upload_adapter_to_volume + + with tempfile.TemporaryDirectory() as tmpdir: + (Path(tmpdir) / "adapter_config.json").write_text('{"test": true}') + (Path(tmpdir) / "adapter_model.safetensors").write_text("fake") + + mock_result = MagicMock() + mock_result.returncode = 0 + mock_result.stdout = "" + mock_result.stderr = "" + + with patch( + "openadapt_ml.cloud.modal_cloud.subprocess.run", + return_value=mock_result, + ) as mock_run: + remote = upload_adapter_to_volume(tmpdir) + + # Two calls: create + put + assert mock_run.call_count == 2 + put_cmd = mock_run.call_args_list[1][0][0] + assert "put" in put_cmd + assert "/adapter" in put_cmd + assert "--force" in put_cmd + assert remote == "/training/adapter" From e42ed755f1fd7517ae381de52e95742614225673 Mon Sep 17 00:00:00 2001 From: semantic-release Date: Tue, 24 Feb 2026 19:16:06 +0000 Subject: [PATCH 2/2] chore: release 0.11.0 --- CHANGELOG.md | 16 ++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 113f7f9..fa01cc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,22 @@ # CHANGELOG +## v0.11.0 (2026-02-24) + +### Features + +- **modal**: Add inference serving with call_inference API + ([`57e5c5f`](https://github.com/OpenAdaptAI/openadapt-ml/commit/57e5c5ff9bfefc69003c608981f2e702b0507e65)) + +- Add _build_inference_app() for Modal GPU inference with PEFT adapter - Add + upload_adapter_to_volume() for uploading adapters to Modal volume - Add call_inference() as the + primary API for remote inference - Add 'serve' CLI command for interactive model serving - + Container caches model in memory across calls (container_idle_timeout=600) - Support --no-adapter + for zero-shot base model serving + +Co-Authored-By: Claude Opus 4.6 + + ## v0.10.1 (2026-02-24) ### Bug Fixes diff --git a/pyproject.toml b/pyproject.toml index 3e67344..4283314 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openadapt-ml" -version = "0.10.1" +version = "0.11.0" description = "Model-agnostic, domain-agnostic ML engine for GUI automation agents" readme = "README.md" requires-python = ">=3.10"