Genesis-Embodied-AI · duburcqa · Apr 24, 2026 · Apr 22, 2026 · Apr 22, 2026 · Apr 23, 2026
diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py
@@ -46,9 +46,14 @@ def setup_basic_build_env():
         setup_msvc()
 
     setup_llvm()
-    if u.system == "Linux":
-        # We support & test Vulkan shader debug printf on Linux
-        # This is done through the validation layer
+    if u.system in ("Linux", "Darwin"):
+        # Linux: validation layers + SPIR-V tools (shader debug printf support).
+        # macOS: the SDK bundles a current MoltenVK that advertises `VK_KHR_buffer_device_address`, which
+        # the adstack sizer shader needs for `ExternalTensorRead` via Physical Storage Buffer addressing.
+        # The legacy taichi_assets pin formerly in `quadrants/rhi/CMakeLists.txt` was too old for PSB; wiring
+        # `setup_vulkan()` here lets the CMake glue pick up `$VULKAN_SDK/lib/libMoltenVK.dylib` (the flat
+        # layout LunarG's macOS SDK uses - see the layout note in `vulkan.py::setup_vulkan`) and ship
+        # that in the wheel instead.
         from .vulkan import setup_vulkan
 
         setup_vulkan()

diff --git a/.github/workflows/scripts/ti_build/vulkan.py b/.github/workflows/scripts/ti_build/vulkan.py
@@ -3,6 +3,7 @@
 # -- stdlib --
 import os
 import platform
+import subprocess
 
 # -- third party --
 # -- own --
@@ -37,8 +38,44 @@
             path_prepend("PATH", sdk / "bin")
             path_prepend("LD_LIBRARY_PATH", sdk / "lib")
             os.environ["VK_LAYER_PATH"] = str(sdk / "share" / "vulkan" / "explicit_layer.d")
-        # case ("Darwin", "arm64"):
-        # case ("Darwin", "x86_64"):
+        case ("Darwin", "arm64"):
+            # LunarG's macOS `.zip` is an `InstallVulkan.app` installer bundle (the same Qt installer
+            # Windows uses), not a ready-to-use SDK tree. We extract the bundle, then invoke its CLI
+            # non-interactively to drop the actual SDK payload into a sibling prefix. LunarG didn't
+            # publish a 1.4.321.1 macOS asset (the Linux / Windows pin's patch-level); the prior
+            # 1.4.321.0 release has one, so it is inlined here rather than factored into a separate constant.
+            url = "https://sdk.lunarg.com/sdk/download/1.4.321.0/mac/vulkansdk-macos-1.4.321.0.zip"
+            installer_dir = get_cache_home() / "vulkan-macos-1.4.321.0-installer"
+            prefix = get_cache_home() / "vulkan-macos-1.4.321.0"
+
+            download_dep(url, installer_dir, strip=1)
+            if not (prefix / "macOS").exists():
+                installer_bin = installer_dir / "Contents" / "MacOS" / "vulkansdk-macOS-1.4.321.0"
+                # Python's `zipfile` doesn't preserve the Unix execute bit, so the extracted installer
+                # comes out with mode 0644 and `exec` trips `PermissionError: [Errno 13]`. `chmod +x`
+                # here is idempotent and scoped to this single binary.
+                installer_bin.chmod(0o755)
+                subprocess.check_call(
+                    [
+                        str(installer_bin),
+                        "--root",
+                        str(prefix),
+                        "--accept-licenses",
+                        "--default-answer",
+                        "--confirm-command",
+                        "install",
+                    ]
+                )
+            sdk = prefix / "macOS"
+            os.environ["VULKAN_SDK"] = str(sdk)
+            path_prepend("PATH", sdk / "bin")
+            path_prepend("DYLD_LIBRARY_PATH", sdk / "lib")
+            os.environ["VK_LAYER_PATH"] = str(sdk / "share" / "vulkan" / "explicit_layer.d")
+            # LunarG's macOS SDK installer drops `libMoltenVK.dylib` flat inside `macOS/lib/` alongside
+            # the desktop loader - no `MoltenVK/` subtree, no xcframework of dylibs (only `.a` statics in
+            # the xcframework). `MOLTENVK_DIR` points at the directory that actually contains the dylib
+            # so the candidate search in `quadrants/rhi/CMakeLists.txt` can locate it without path guessing.
+            os.environ["MOLTENVK_DIR"] = str(sdk / "lib")
         case ("Windows", "AMD64"):
             url = (
                 f"https://sdk.lunarg.com/sdk/download/{VULKAN_VERSION}/windows/VulkanSDK-{VULKAN_VERSION}-Installer.exe"

diff --git a/docs/source/user_guide/supported_systems.md b/docs/source/user_guide/supported_systems.md
@@ -22,6 +22,24 @@ We test the following systems in our CI servers:
 - AMD GPUs
 - Vulkan-compatible GPUs (e.g. Intel Arc)
 
+### Backend / OS matrix
+
+The table below shows which backends are available on each supported platform. `qd.cpu` and `qd.vulkan` run on every OS; the other GPU backends are platform-specific because they wrap vendor drivers (CUDA on NVIDIA, ROCm on AMD, Metal on Apple).
+
+| OS \ backend | `qd.cpu` | `qd.cuda` | `qd.amdgpu` | `qd.metal` | `qd.vulkan` |
+| --- | --- | --- | --- | --- | --- |
+| macOS (Apple Silicon) | yes | n/a | n/a | yes | yes |
+| Linux x64 | yes | yes | yes | n/a | yes |
+| Linux ARM64 | yes | no | no | n/a | yes |
+| Windows x64 | yes | yes | no | n/a | yes |
+| Windows ARM64 | yes | no | no | n/a | yes |
+
+Notes:
+- `qd.cuda` requires an NVIDIA driver + CUDA runtime on the host; quadrants links against the CUDA runtime discovered at import time. NVIDIA ships CUDA for Linux ARM64 and Windows ARM64, but quadrants does not support them yet.
+- `qd.amdgpu` currently wires up the Linux x64 ROCm path only. AMD's HIP SDK also ships on Windows and on some Linux ARM64 targets, but quadrants does not support them yet.
+- `qd.metal` is only available on Apple hardware and is the recommended GPU backend there.
+- On macOS, the wheel bundles the MoltenVK dylib that `qd.vulkan` uses, so no separate MoltenVK install is required.
+
 ### Python backend
 
 A pure-Python backend (`qd.python`) is available on any system where PyTorch is installed. See [Python backend](./python_backend.md).
diff --git a/quadrants/codegen/spirv/spirv_codegen.cpp b/quadrants/codegen/spirv/spirv_codegen.cpp
@@ -950,7 +950,31 @@ void TaskCodegen::generate_overflow_branch(const spirv::Value &cond_v, const std
   ir_->make_inst(spv::OpBranchConditional, cond, then_label, merge_label);
   // then block
   ir_->start_label(then_label);
-  ir_->call_debugprintf(op + " overflow detected in " + tb, {});
+  // `bin->get_tb()` carries the Python traceback that surfaced the binary op - file path, line number,
+  // and a copy of the source line - and we want it in the runtime diagnostic. But the SPIR-V debug-printf
+  // format string flows verbatim into MoltenVK's SPIRV-Cross -> MSL translator, which embeds it as an MSL
+  // string literal; a `"`, `\n`, or `\r` terminates the literal mid-parse and the downstream MSL compile
+  // fails with `use of undeclared identifier '<path fragment>'`. A raw `%` is equally hazardous: the
+  // concatenated string is the printf-style format with an empty args vector, so an unescaped `%` in the
+  // source line (e.g. `a % b`, `"%d" % x`, a `%20` URL-escape) surfaces as a format specifier with no
+  // matching argument - undefined behaviour on the validation-layer debug-printf path and on MoltenVK's
+  // MSL translation. Sanitize all four (`"` -> `\"`, `%` -> `%%`, `\n` / `\r` -> space) so the traceback
+  // stays readable on native Vulkan drivers and round-trips cleanly through MSL on Apple Silicon. The `%`
+  // handling mirrors `sanitize_format_string` above.
+  std::string safe_tb;
+  safe_tb.reserve(tb.size());
+  for (char c : tb) {
+    if (c == '"') {
+      safe_tb += "\\\"";
+    } else if (c == '\n' || c == '\r') {
+      safe_tb += ' ';
+    } else if (c == '%') {
+      safe_tb += "%%";
+    } else {
+      safe_tb += c;
+    }
+  }
+  ir_->call_debugprintf(op + " overflow detected in " + safe_tb, {});
   ir_->make_inst(spv::OpBranch, merge_label);
   // merge label
   ir_->start_label(merge_label);

diff --git a/quadrants/codegen/spirv/spirv_ir_builder.cpp b/quadrants/codegen/spirv/spirv_ir_builder.cpp
@@ -161,9 +161,13 @@ std::vector<uint32_t> IRBuilder::finalize() {
 
 void IRBuilder::init_pre_defs() {
   ext_glsl450_ = ext_inst_import("GLSL.std.450");
-  if (caps_->get(cap::spirv_has_non_semantic_info)) {
-    debug_printf_ = ext_inst_import("NonSemantic.DebugPrintf");
-  }
+  // `debug_printf_` is imported lazily in `call_debugprintf` on first use rather than here. A declared-but-unused
+  // `OpExtInstImport "NonSemantic.DebugPrintf"` at the top of a SPIR-V module is accepted by native Vulkan
+  // drivers but rejected by MoltenVK: the SPIRV-Cross -> MSL translator emits an unconditional stub that calls
+  // `debugPrintfEXT` even when no `OpExtInst` targets the import, and the subsequent MSL compile fails with
+  // `use of undeclared identifier 'debugPrintfEXT'`. Skipping the import entirely when no call site needs it
+  // keeps kernels without `print` or debug assert traffic compatible with MoltenVK even while the
+  // `spirv_has_non_semantic_info` capability is advertised by the Vulkan device.
 
   t_bool_ = declare_primitive_type(get_data_type<bool>());
   if (caps_->get(cap::spirv_has_int8)) {

diff --git a/quadrants/codegen/spirv/spirv_ir_builder.h b/quadrants/codegen/spirv/spirv_ir_builder.h
@@ -439,6 +439,12 @@ class IRBuilder {
 
   // Create a debugPrintf call
   void call_debugprintf(std::string formats, const std::vector<Value> &args) {
+    // Lazy import: see the explanatory comment in `IRBuilder::init_pre_defs`. We only emit
+    // `OpExtInstImport "NonSemantic.DebugPrintf"` the first time a printf / debug-assert call site actually
+    // needs it so kernels with no debug traffic stay MoltenVK-compatible.
+    if (!debug_printf_.id) {
+      debug_printf_ = ext_inst_import("NonSemantic.DebugPrintf");
+    }
     Value format_str = debug_string(formats);
     Value val = new_value(t_void_, ValueKind::kNormal);
     ib_.begin(spv::OpExtInst).add_seq(t_void_, val, debug_printf_, 1, format_str);

diff --git a/quadrants/rhi/CMakeLists.txt b/quadrants/rhi/CMakeLists.txt
@@ -28,18 +28,42 @@ endif()
 if (QD_WITH_VULKAN)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DQD_WITH_VULKAN")
     if (APPLE)
-        # The latest Molten-vk v1.2.0 and v1.1.11 breaks GGUI: mpm3d_ggui.py
-        # So we have to manually download and install Molten-vk v1.10.0
-        #
-        # Uncomment the following lines if the mpm3d_ggui.py runs well with the latest Molten-vk
-        #find_library(MOLTEN_VK libMoltenVK.dylib PATHS $HOMEBREW_CELLAR/molten-vk $VULKAN_SDK REQUIRED)
-        #configure_file(${MOLTEN_VK} ${CMAKE_BINARY_DIR}/libMoltenVK.dylib COPYONLY)
-        #message(STATUS "MoltenVK library ${MOLTEN_VK}")
-
-        if(NOT EXISTS ${CMAKE_BINARY_DIR}/libMoltenVK.dylib)
-            execute_process(COMMAND curl -L -o ${CMAKE_BINARY_DIR}/libMoltenVK.zip https://github.com/taichi-dev/taichi_assets/files/9977436/libMoltenVK.dylib.zip)
-            execute_process(COMMAND tar -xf ${CMAKE_BINARY_DIR}/libMoltenVK.zip --directory ${CMAKE_BINARY_DIR})
+        # MoltenVK comes from the LunarG Vulkan SDK. Sources are tried in order: explicit `$MOLTENVK_DIR`
+        # (set by `build.py --shell`), then `$VULKAN_SDK/lib` (LunarG's globally-installed layout, also what
+        # `build.py --shell` exports), then the previously-extracted SDK under the `ti-build-cache` that
+        # `build.py --shell` populated on a prior invocation - this last fallback is what lets a plain
+        # `cmake .` reconfigure without re-entering the build shell every time. No legacy-pin fallback on
+        # purpose: the prior third-party dylib predates `VK_KHR_buffer_device_address` and is unusable for
+        # the on-device adstack sizer.
+        set(_MVK_CANDIDATES "")
+        if(DEFINED ENV{MOLTENVK_DIR})
+            list(APPEND _MVK_CANDIDATES "$ENV{MOLTENVK_DIR}")
+        endif()
+        if(DEFINED ENV{VULKAN_SDK})
+            list(APPEND _MVK_CANDIDATES "$ENV{VULKAN_SDK}/lib")
+        endif()
+        # `build.py --shell` extracts the SDK to `~/.cache/ti-build-cache/vulkan-macos-<VER>/macOS/lib`.
+        # Glob for any version under that directory so developers don't have to re-enter the build shell.
+        file(GLOB _MVK_CACHE_CANDIDATES "$ENV{HOME}/.cache/ti-build-cache/vulkan-macos-*/macOS/lib")
+        list(APPEND _MVK_CANDIDATES ${_MVK_CACHE_CANDIDATES})
+        set(MOLTEN_VK "")
+        foreach(_dir ${_MVK_CANDIDATES})
+            if(EXISTS "${_dir}/libMoltenVK.dylib")
+                set(MOLTEN_VK "${_dir}/libMoltenVK.dylib")
+                break()
+            endif()
+        endforeach()
+        if(NOT MOLTEN_VK)
+            message(FATAL_ERROR
+                "MoltenVK: cannot locate `libMoltenVK.dylib`. Searched: `$MOLTENVK_DIR`, `$VULKAN_SDK/lib`, "
+                "and `~/.cache/ti-build-cache/vulkan-macos-*/macOS/lib` (populated by `build.py --shell`). "
+                "Run `./build.py --shell` once to fetch LunarG's Vulkan SDK, or install the LunarG Vulkan "
+                "SDK globally, before configuring Quadrants with `QD_WITH_VULKAN=ON`. The adstack sizer "
+                "shader requires a MoltenVK with `VK_KHR_buffer_device_address` (>= 1.2.1); shipping an "
+                "older pin would hard-error every reverse-mode kernel at launch time.")
         endif()
+        message(STATUS "MoltenVK: using LunarG Vulkan SDK copy at ${MOLTEN_VK}")
+        configure_file(${MOLTEN_VK} ${CMAKE_BINARY_DIR}/libMoltenVK.dylib COPYONLY)
         install(FILES ${CMAKE_BINARY_DIR}/libMoltenVK.dylib DESTINATION ${INSTALL_LIB_DIR}/runtime)
     endif()
     add_subdirectory(vulkan)

diff --git a/quadrants/rhi/vulkan/vulkan_device.cpp b/quadrants/rhi/vulkan/vulkan_device.cpp
@@ -1592,10 +1592,17 @@ RhiResult VulkanDevice::allocate_memory(const AllocParams &params, DeviceAllocat
   }
 
   if (get_caps().get(DeviceCapability::spirv_has_physical_storage_buffer) &&
-      ((alloc_info.usage & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) ||
-       (alloc_info.usage & VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR) ||
-       (alloc_info.usage & VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR) ||
-       (alloc_info.usage & VK_BUFFER_USAGE_SHADER_BINDING_TABLE_BIT_KHR))) {
+      ((buffer_info.usage & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) ||
+       (buffer_info.usage & VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR) ||
+       (buffer_info.usage & VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR) ||
+       (buffer_info.usage & VK_BUFFER_USAGE_SHADER_BINDING_TABLE_BIT_KHR))) {
+    // Check the Vulkan `VkBufferCreateInfo::usage` flags, not the VMA `VmaAllocationCreateInfo::usage`
+    // enum - the latter is `VMA_MEMORY_USAGE_UNKNOWN` here (never set) and never matches any
+    // `VK_BUFFER_USAGE_*` bit, so without this the branch was dead and the SHADER_DEVICE_ADDRESS usage
+    // flag was never attached. The downstream `vkGetBufferDeviceAddressKHR` call below then fires
+    // `VUID-VkBufferDeviceAddressInfo-buffer-02601` under the validation layer, and on drivers that
+    // don't validate (MoltenVK) returns a garbage address that silently miscomputes every subsequent
+    // PSB-backed ndarray load.
     buffer_info.usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR;
   }
 
@@ -1607,7 +1614,13 @@ RhiResult VulkanDevice::allocate_memory(const AllocParams &params, DeviceAllocat
 
   vmaGetAllocationInfo(alloc.buffer->allocator, alloc.buffer->allocation, &alloc.alloc_info);
 
-  if (get_caps().get(DeviceCapability::spirv_has_physical_storage_buffer)) {
+  if (get_caps().get(DeviceCapability::spirv_has_physical_storage_buffer) &&
+      (buffer_info.usage & VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR)) {
+    // Gated on the matching usage bit the block above may have added: `vkGetBufferDeviceAddressKHR`
+    // requires the target buffer to have been created with `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT`
+    // (VUID-VkBufferDeviceAddressInfo-buffer-02601). Buffers that do not qualify for the bit (uniform-only,
+    // vertex/index, transfer-only staging) never need a device address - they are only read via descriptor
+    // binding. Leaving `alloc.addr` at zero in that case matches what happens on backends without the cap.
     VkBufferDeviceAddressInfoKHR info{};
     info.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR;
     info.buffer = alloc.buffer->buffer;