Merged

32 commits:
3a960bb
Bump transformers to 4.54.1
jackzhxng Aug 1, 2025
3d223a2
Bump torch
jackzhxng Aug 1, 2025
207f8b1
Fix no module found error for custom_kv_cache
jackzhxng Aug 4, 2025
bc82841
Try to fix Missing operator: [8] quantized_decomposed::embedding_byte…
jackzhxng Aug 4, 2025
35fc918
Fix quantization requires torchao >= 0.11.0
jackzhxng Aug 4, 2025
6a26464
Fix sliding window, print loaded ops
jackzhxng Aug 5, 2025
4d68263
Bump ET nightly pin, fixes missing quantized ops
jackzhxng Aug 5, 2025
6a3e1d4
Fix no Q_ANNOTATION_KEY
jackzhxng Aug 5, 2025
2b5fe7e
Try to fix segfault/bus error by holding onto temp dir
jackzhxng Aug 8, 2025
bb0089c
Bigger mac runners
jackzhxng Aug 9, 2025
72802e3
Revert "Bigger mac runners"
jackzhxng Aug 10, 2025
9876c7e
Add helpful logs
jackzhxng Aug 10, 2025
19f4d21
Re-enable smollm3 tests for linux
jackzhxng Aug 10, 2025
99805f8
Experiment reverting transformers bump
jackzhxng Aug 10, 2025
108ed17
Revert "Experiment reverting transformers bump"
jackzhxng Aug 13, 2025
59778eb
Formatting and remove logs
jackzhxng Aug 13, 2025
ff8a2a1
Bump ET release from 0.6 -> 0.7
jackzhxng Aug 14, 2025
a3009ca
Bisect down to ET 20250701
jackzhxng Aug 14, 2025
ae488b1
Experiment reverting transformers bump
jackzhxng Aug 10, 2025
b7a2fa1
Clean
jackzhxng Aug 14, 2025
1e0a671
Bisect down to ET 20250628
jackzhxng Aug 14, 2025
896f0da
Bisect down to ET 20250626
jackzhxng Aug 14, 2025
abd641b
Revert "Bisect down to ET 20250626"
jackzhxng Aug 15, 2025
7f7f9c2
Revert "Bisect down to ET 20250628"
jackzhxng Aug 15, 2025
5f8a56f
Revert "Experiment reverting transformers bump"
jackzhxng Aug 15, 2025
92bc2ba
Revert "Bisect down to ET 20250701"
jackzhxng Aug 15, 2025
4abb2ec
Skip mac tests
jackzhxng Aug 15, 2025
ad9b639
Remove unnecessary ET 0.6 guards
jackzhxng Aug 15, 2025
b252038
Ruff format
jackzhxng Aug 15, 2025
e135310
Remove all transformers < 4.54 guards
jackzhxng Aug 15, 2025
671bc06
Merge branch 'main' into jz/bump-transformers
jackzhxng Aug 15, 2025
70338e9
Format
jackzhxng Aug 15, 2025
5 changes: 3 additions & 2 deletions .github/workflows/test_models.yml
@@ -34,9 +34,10 @@ jobs:
fail-fast: false
matrix:
test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }}
executorch-version: ['0.6.0', 'nightly']
executorch-version: ['0.7.0', 'nightly']
python-version: ['3.11']
os: [macos-15, ubuntu-22.04]
# os: [macos-15, ubuntu-22.04] # TODO(#122): Re-enable the mac tests after fixing seg fault.
os: [ubuntu-22.04]

# Custom job name, now shortened and cleaner
name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }})
Expand Down
16 changes: 8 additions & 8 deletions install_dev.py
@@ -5,21 +5,21 @@

def install_torch_nightly_deps():
"""Install torch related dependencies from pinned nightly"""
EXECUTORCH_NIGHTLY_VERSION = "dev20250625"
TORCHAO_NIGHTLY_VERSION = "dev20250620"
EXECUTORCH_NIGHTLY_VERSION = "dev20250730"
TORCHAO_NIGHTLY_VERSION = "dev20250730"
# Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74
TORCH_NIGHTLY_VERSION = "dev20250601"
TORCH_NIGHTLY_VERSION = "dev20250725"
subprocess.check_call(
[
sys.executable,
"-m",
"pip",
"install",
f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}",
f"torch==2.8.0.{TORCH_NIGHTLY_VERSION}",
f"torchvision==0.23.0.{TORCH_NIGHTLY_VERSION}",
f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}",
f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}",
f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}",
f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}",
f"torchao==0.12.0.{TORCHAO_NIGHTLY_VERSION}",
f"torchao==0.13.0.{TORCHAO_NIGHTLY_VERSION}",
"--extra-index-url",
"https://download.pytorch.org/whl/nightly/cpu",
]
@@ -34,7 +34,7 @@ def install_dep_from_source():
"-m",
"pip",
"install",
"git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.53.1
"git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1
]
)
subprocess.check_call(
67 changes: 32 additions & 35 deletions optimum/executorch/attentions/custom_kv_cache.py
@@ -54,12 +54,12 @@ def __init__(

# Create a list of CustomKVCache instances, one per layer
self.kv_cache = torch.nn.ModuleList()
for _ in range(config.num_hidden_layers):
Collaborator: what happened here? like config doesn't exist anymore?
Author (jackzhxng): It still exists; I just feel it's more idiomatic to iterate over the actual layers.
for layer in self.layers:
layer_cache = CustomKVCache(
max_batch_size=self.max_batch_size,
max_context_length=self.max_cache_len,
n_heads=self.num_key_value_heads,
head_dim=self.head_dim,
max_batch_size=layer.max_batch_size,
max_context_length=layer.max_cache_len,
n_heads=layer.num_heads,
head_dim=layer.head_dim,
dtype=dtype,
)
self.kv_cache.append(layer_cache)
@@ -202,32 +202,29 @@ def __init__(
layer_device_map=layer_device_map,
)

# make sure layer_device_map is none
assert layer_device_map is None
assert device is None or device == "cpu", "Device must be None or 'cpu'"

self.cache_position = None
# Create a list of cache instances, one per layer
# Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers
# Create a list of cache instances, one per layer.
# Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers.
self.kv_cache = torch.nn.ModuleList()
for layer_idx in range(config.num_hidden_layers):
# newer version of transfomer has is_sliding defined
# for HybridCache
if self.is_sliding[layer_idx]:
for layer in self.layers:
if layer.is_sliding:
# This is a sliding window layer
layer_cache = CustomRingKVCache(
max_batch_size=self.max_batch_size,
max_context_length=self.sliding_window_len,
n_heads=self.num_key_value_heads,
head_dim=self.head_dim,
max_batch_size=layer.max_batch_size,
max_context_length=layer.max_cache_len,
Collaborator: wait, what is happening here? Is this the same as sliding_window_len?
n_heads=layer.num_heads,
head_dim=layer.head_dim,
dtype=dtype,
)
else:
layer_cache = CustomKVCache(
max_batch_size=self.max_batch_size,
max_context_length=self.max_cache_len,
n_heads=self.num_key_value_heads,
head_dim=self.head_dim,
max_batch_size=layer.max_batch_size,
max_context_length=layer.max_cache_len,
n_heads=layer.num_heads,
head_dim=layer.head_dim,
dtype=dtype,
)
self.kv_cache.append(layer_cache)
@@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:

# For CustomRingKVCache, we need to handle the sequence length differently
layer_cache = self.kv_cache[layer_idx]
if self.is_sliding[layer_idx]:
if self.layers[layer_idx].is_sliding:
# CustomRingKVCache cache_position_manager which
# maintains cache position for each slot in the kv cache
# we return the max position + 1 to indicate max position
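The comment above is the crux of the sliding-window handling: a ring cache overwrites old slots, so the sequence length has to be recovered from the tracked positions rather than from a monotonically growing cache. A minimal, self-contained illustration of that idea (not the CustomRingKVCache API; the window size and names are assumed):

```python
import torch

# Hypothetical window size; real values come from the layer's sliding window.
window = 4
cache_positions = torch.full((window,), -1, dtype=torch.long)  # -1 marks an empty slot

# Write 6 token positions into a 4-slot ring: older slots get overwritten.
for pos in range(6):
    slot = pos % window
    cache_positions[slot] = pos

# "max position + 1" is the effective sequence length, mirroring get_seq_length above.
seq_len = int(cache_positions.max().item()) + 1
print(seq_len)  # 6
```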
@@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int):

def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype):
"""
Replace all KV caches in the module with ETCustomStaticCache.
Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache.
This modifies the model in place.

Args:
@@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
if getattr(module, "replace_cache", None) is not None:
static_cache = ETCustomStaticCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
module.replace_cache(static_cache)
else:
module.static_cache = ETCustomStaticCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
# Dont know why we need to this even though
@@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
if getattr(module, "replace_cache", None) is not None:
hybrid_cache = ETCustomHybridCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
module.replace_cache(hybrid_cache)
else:
module.cache = ETCustomHybridCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
# Register cache attributes for each layer
for i in range(len(module.cache.kv_cache)):
setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache)
setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache)
if module.cache.is_sliding[i]:
if module.cache.layers[i].is_sliding:
# Register cache_positions as buffer for sliding window layers
# This prevents it from being traced as a constant
module.register_buffer(
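Both hunks above swap attribute access on generation_config.cache_config for dict-style .get() calls, which suggests the cache config is treated as a plain mapping after the transformers bump. A small sketch of the difference, with assumed example values:

```python
# Assumed example values; the keys mirror the diff above.
cache_config = {"batch_size": 1, "max_cache_len": 128}

# Attribute access on a dict would raise; .get() degrades gracefully for missing keys.
batch_size = cache_config.get("batch_size")        # 1
max_cache_len = cache_config.get("max_cache_len")  # 128
device = cache_config.get("device")                # None, which the cache constructors above accept
```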
53 changes: 47 additions & 6 deletions optimum/executorch/modeling.py
@@ -16,6 +16,7 @@

import logging
import os
import shutil
from abc import ABC, abstractmethod
from pathlib import Path
from tempfile import TemporaryDirectory
@@ -24,6 +25,7 @@
import torch
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa
from transformers import (
AutoModelForCausalLM,
AutoModelForImageClassification,
@@ -102,6 +104,34 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedCon

self.stats = Stats()

# Initialize cleanup tracking
self._temp_dir = None

def __del__(self):
"""Clean up temporary files when the model instance is destroyed."""
Collaborator: shouldn't this already happen automatically?
Author (jackzhxng, Aug 14, 2025): Yeah, probably, but added just to be extra sure that it's cleaned up between tests.
self._cleanup_temp_resources()

def _cleanup_temp_resources(self):
"""Clean up temporary directory and files."""
if hasattr(self, "_temp_dir") and self._temp_dir is not None:
try:
if hasattr(self._temp_dir, "cleanup"):
# It's a TemporaryDirectory object
logging.info(f"Cleaning up temporary directory: {self._temp_dir.name}")
self._temp_dir.cleanup()
logging.info("Temporary directory cleanup completed")
elif isinstance(self._temp_dir, (str, Path)):
# It's a path
logging.info(f"Cleaning up temporary path: {self._temp_dir}")
shutil.rmtree(self._temp_dir, ignore_errors=True)
logging.info("Temporary path cleanup completed")
except Exception as e:
# Log cleanup errors for debugging
logging.warning(f"Error during temp directory cleanup: {e}")
pass
finally:
self._temp_dir = None
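Per the review thread above, __del__ is a belt-and-braces measure; relying on it alone makes cleanup timing depend on the garbage collector. A hypothetical test fixture (not part of this PR, and the from_pretrained kwargs are only illustrative) that forces deterministic cleanup between tests:

```python
import gc

import pytest

from optimum.executorch import ExecuTorchModelForCausalLM


@pytest.fixture
def exported_model():
    # Model id and kwargs are placeholders; any exportable checkpoint works here.
    model = ExecuTorchModelForCausalLM.from_pretrained(
        "HuggingFaceTB/SmolLM2-135M", export=True, recipe="xnnpack"
    )
    yield model
    model._cleanup_temp_resources()  # explicit, instead of waiting for __del__ / GC
    del model
    gc.collect()
```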

@abstractmethod
def forward(self, *args, **kwargs):
"""
@@ -242,7 +272,7 @@ def _export(
inferred_task = TasksManager.infer_task_from_model(cls.auto_model_class)
logging.info(f"Inferred task from model class: {inferred_task}")

save_dir = TemporaryDirectory()
save_dir = TemporaryDirectory(prefix="executorch_export_")
save_dir_path = Path(save_dir.name)

# Export to ExecuTorch and save the pte file to the temporary directory
@@ -266,7 +296,7 @@
for name, _ in executorch_progs.items():
models.update(cls._from_pretrained(save_dir_path, file_name=f"{name}.pte", config=config))

return models
return models, save_dir

def _save_pretrained(self, save_directory):
"""
@@ -298,6 +328,7 @@ def from_pretrained(
logger.info("Offline mode: setting `local_files_only=True`")
local_files_only = True

# See if model was already exported to ExecuTorch and uploaded to the HuggingFace repo.
_export = export
try:
if local_files_only and not os.path.isdir(model_id):
@@ -324,21 +355,21 @@
if export:
logger.warning(
f"The model {model_id} was already converted to the ExecuTorch IR but got `export=True`, the model will be converted to ExecuTorch once again. "
# "Don't forget to save the resulting model with `.save_pretrained()`"
)
_export = True
else:
logger.warning(
f"No ExecuTorch files were found for {model_id}, setting `export=True` to convert the model to the ExecuTorch IR. "
# "Don't forget to save the resulting model with `.save_pretrained()`"
)
except Exception as exception:
logger.warning(
f"Could not infer whether the model was already converted or not to the ExecuTorch IR, keeping `export={export}`.\n{exception}"
)

temp_dir = None
if _export:
models_dict = cls._export(
logging.info(f"Exporting {model_id} to ExecuTorch program...")
models_dict, temp_dir = cls._export(
model_id=model_id,
config=config,
revision=revision,
@@ -351,6 +382,9 @@
**kwargs,
)
else:
logging.info(
f"Pre-exported `.pte` artifact already exists in HuggingFace repo or provided file path for {model_id}, skipping export."
)
models_dict = {}
for pte_file in pte_files:
models_dict.update(
@@ -368,7 +402,14 @@
)
)

return cls(models_dict, config)
model_instance = cls(models_dict, config)

# Store the TemporaryDirectory reference to prevent GC
if temp_dir is not None:
model_instance._temp_dir = temp_dir
logging.info(f"Stored temp directory reference in model: {temp_dir.name}")

return model_instance


class ExecuTorchModelForSeq2SeqLM(ExecuTorchModelBase):
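The reason for threading save_dir out of _export and parking it on the model as _temp_dir: a TemporaryDirectory object removes its directory as soon as it is garbage-collected, which would pull the exported .pte out from under a live ExecuTorchModule (the segfault/bus error chased in the commit history). A standalone sketch of that behavior, not optimum code:

```python
import gc
from pathlib import Path
from tempfile import TemporaryDirectory

tmp = TemporaryDirectory(prefix="executorch_export_")
pte_path = Path(tmp.name) / "model.pte"      # stand-in for a real exported artifact
pte_path.write_bytes(b"\x00" * 16)

keepalive = tmp   # what model_instance._temp_dir does in from_pretrained above
tmp = None        # drop the local reference
gc.collect()
print(pte_path.exists())   # True: the held reference keeps the directory alive

keepalive.cleanup()        # what _cleanup_temp_resources / __del__ does later
print(pte_path.exists())   # False
```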
5 changes: 5 additions & 0 deletions optimum/exporters/executorch/__main__.py
@@ -15,6 +15,7 @@
"""Entry point to the optimum.exporters.executorch command line."""

import argparse
import logging
import os
import warnings
from pathlib import Path
@@ -130,10 +131,14 @@ def main_export(
kwargs["force_download"] = force_download
kwargs["config"] = config

# 1. Load model, apply source transformations, and torch.export() into a graph (ExportedProgram).
logging.info(f"Loading {model_name_or_path} and exporting to static graph...")
recipe_kwargs = kwargs.pop("recipe_kwargs", {})

model = task_func(model_name_or_path, **kwargs)

# 2. Export to ExecuTorch through ExecuTorch's lowering APIs.
logging.info(f"Lowering {model_name_or_path} to ExecuTorch...")
if not os.path.exists(output_dir):
os.makedirs(output_dir)

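The two numbered log lines describe the overall pipeline: torch.export() the eager model into an ExportedProgram, then lower it through ExecuTorch and serialize a .pte. A generic sketch of that flow on a toy module (the real recipes add source transforms, custom SDPA, quantization, and so on; the module here is illustrative):

```python
import torch
from executorch.exir import to_edge


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)

    def forward(self, x):
        return self.linear(x)


# 1. Export the eager model into a static graph (ExportedProgram).
exported = torch.export.export(TinyModel().eval(), (torch.randn(1, 8),))

# 2. Lower through ExecuTorch's APIs and write out the .pte artifact.
et_program = to_edge(exported).to_executorch()
with open("tiny_model.pte", "wb") as f:
    f.write(et_program.buffer)
```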
9 changes: 3 additions & 6 deletions optimum/exporters/executorch/convert.py
@@ -19,20 +19,17 @@
from pathlib import Path
from typing import Union

from transformers.integrations.executorch import sdpa_mask_without_vmap
from transformers.masking_utils import AttentionMaskInterface
from transformers.modeling_utils import AttentionInterface

from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward
from optimum.utils.import_utils import is_transformers_version

from .recipe_registry import discover_recipes, recipe_registry


AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward)
if is_transformers_version(">=", "4.53.0.dev0"):
from transformers.integrations.executorch import sdpa_mask_without_vmap
from transformers.masking_utils import AttentionMaskInterface

AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap)
AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap)


def export_to_executorch(
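With the version guard removed, both registrations now run unconditionally, so transformers >= 4.54 is assumed throughout. For context, a registered attention implementation is selected by name when loading a model; a hedged usage sketch (the checkpoint is a placeholder, not something this PR pins):

```python
from transformers import AutoModelForCausalLM

# "custom_sdpa" is the key registered on AttentionInterface / AttentionMaskInterface in convert.py above.
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M",        # placeholder model id
    attn_implementation="custom_sdpa",
)
```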