Merged

32 commits:
3a960bb
Bump transformers to 4.54.1
jackzhxng Aug 1, 2025
3d223a2
Bump torch
jackzhxng Aug 1, 2025
207f8b1
Fix no module found error for custom_kv_cache
jackzhxng Aug 4, 2025
bc82841
Try to fix Missing operator: [8] quantized_decomposed::embedding_byte…
jackzhxng Aug 4, 2025
35fc918
Fix quantization requires torchao >= 0.11.0
jackzhxng Aug 4, 2025
6a26464
Fix sliding window, print loaded ops
jackzhxng Aug 5, 2025
4d68263
Bump ET nightly pin, fixes missing quantized ops
jackzhxng Aug 5, 2025
6a3e1d4
Fix no Q_ANNOTATION_KEY
jackzhxng Aug 5, 2025
2b5fe7e
Try to fix segfault/bus error by holding onto temp dir
jackzhxng Aug 8, 2025
bb0089c
Bigger mac runners
jackzhxng Aug 9, 2025
72802e3
Revert "Bigger mac runners"
jackzhxng Aug 10, 2025
9876c7e
Add helpful logs
jackzhxng Aug 10, 2025
19f4d21
Re-enable smollm3 tests for linux
jackzhxng Aug 10, 2025
99805f8
Experiment reverting transformers bump
jackzhxng Aug 10, 2025
108ed17
Revert "Experiment reverting transformers bump"
jackzhxng Aug 13, 2025
59778eb
Formatting and remove logs
jackzhxng Aug 13, 2025
ff8a2a1
Bump ET release from 0.6 -> 0.7
jackzhxng Aug 14, 2025
a3009ca
Bisect down to ET 20250701
jackzhxng Aug 14, 2025
ae488b1
Experiment reverting transformers bump
jackzhxng Aug 10, 2025
b7a2fa1
Clean
jackzhxng Aug 14, 2025
1e0a671
Bisect down to ET 20250628
jackzhxng Aug 14, 2025
896f0da
Bisect down to ET 20250626
jackzhxng Aug 14, 2025
abd641b
Revert "Bisect down to ET 20250626"
jackzhxng Aug 15, 2025
7f7f9c2
Revert "Bisect down to ET 20250628"
jackzhxng Aug 15, 2025
5f8a56f
Revert "Experiment reverting transformers bump"
jackzhxng Aug 15, 2025
92bc2ba
Revert "Bisect down to ET 20250701"
jackzhxng Aug 15, 2025
4abb2ec
Skip mac tests
jackzhxng Aug 15, 2025
ad9b639
Remove unnecessary ET 0.6 guards
jackzhxng Aug 15, 2025
b252038
Ruff format
jackzhxng Aug 15, 2025
e135310
Remove all transformers < 4.54 guards
jackzhxng Aug 15, 2025
671bc06
Merge branch 'main' into jz/bump-transformers
jackzhxng Aug 15, 2025
70338e9
Format
jackzhxng Aug 15, 2025
5 changes: 3 additions & 2 deletions .github/workflows/test_models.yml
@@ -34,9 +34,10 @@ jobs:
fail-fast: false
matrix:
test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }}
executorch-version: ['0.6.0', 'nightly']
executorch-version: ['0.7.0', 'nightly']
python-version: ['3.11']
os: [macos-15, ubuntu-22.04]
# os: [macos-15, ubuntu-22.04] # TODO(#122): Re-enable the mac tests after fixing seg fault.
os: [ubuntu-22.04]

# Custom job name, now shortened and cleaner
name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }})
Expand Down
16 changes: 8 additions & 8 deletions install_dev.py
@@ -5,21 +5,21 @@

def install_torch_nightly_deps():
"""Install torch related dependencies from pinned nightly"""
EXECUTORCH_NIGHTLY_VERSION = "dev20250625"
TORCHAO_NIGHTLY_VERSION = "dev20250620"
EXECUTORCH_NIGHTLY_VERSION = "dev20250730"
TORCHAO_NIGHTLY_VERSION = "dev20250730"
# Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74
TORCH_NIGHTLY_VERSION = "dev20250601"
TORCH_NIGHTLY_VERSION = "dev20250725"
subprocess.check_call(
[
sys.executable,
"-m",
"pip",
"install",
f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}",
f"torch==2.8.0.{TORCH_NIGHTLY_VERSION}",
f"torchvision==0.23.0.{TORCH_NIGHTLY_VERSION}",
f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}",
f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}",
f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}",
f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}",
f"torchao==0.12.0.{TORCHAO_NIGHTLY_VERSION}",
f"torchao==0.13.0.{TORCHAO_NIGHTLY_VERSION}",
"--extra-index-url",
"https://download.pytorch.org/whl/nightly/cpu",
]
@@ -34,7 +34,7 @@ def install_dep_from_source():
"-m",
"pip",
"install",
"git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.53.1
"git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1
]
)
subprocess.check_call(
67 changes: 32 additions & 35 deletions optimum/executorch/attentions/custom_kv_cache.py
@@ -54,12 +54,12 @@ def __init__(

# Create a list of CustomKVCache instances, one per layer
self.kv_cache = torch.nn.ModuleList()
for _ in range(config.num_hidden_layers):
Collaborator: what happened here? like config doesn't exist anymore?
Author (jackzhxng): It still exists; I just feel it's more idiomatic to iterate over the actual layers.
for layer in self.layers:
layer_cache = CustomKVCache(
max_batch_size=self.max_batch_size,
max_context_length=self.max_cache_len,
n_heads=self.num_key_value_heads,
head_dim=self.head_dim,
max_batch_size=layer.max_batch_size,
max_context_length=layer.max_cache_len,
n_heads=layer.num_heads,
head_dim=layer.head_dim,
dtype=dtype,
)
self.kv_cache.append(layer_cache)
@@ -202,32 +202,29 @@ def __init__(
layer_device_map=layer_device_map,
)

# make sure layer_device_map is none
assert layer_device_map is None
assert device is None or device == "cpu", "Device must be None or 'cpu'"

self.cache_position = None
# Create a list of cache instances, one per layer
# Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers
# Create a list of cache instances, one per layer.
# Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers.
self.kv_cache = torch.nn.ModuleList()
for layer_idx in range(config.num_hidden_layers):
# newer version of transfomer has is_sliding defined
# for HybridCache
if self.is_sliding[layer_idx]:
for layer in self.layers:
if layer.is_sliding:
# This is a sliding window layer
layer_cache = CustomRingKVCache(
max_batch_size=self.max_batch_size,
max_context_length=self.sliding_window_len,
n_heads=self.num_key_value_heads,
head_dim=self.head_dim,
max_batch_size=layer.max_batch_size,
max_context_length=layer.max_cache_len,
Collaborator: wait, what is happening here? Is this the same as sliding_window_len?
n_heads=layer.num_heads,
head_dim=layer.head_dim,
dtype=dtype,
)
else:
layer_cache = CustomKVCache(
max_batch_size=self.max_batch_size,
max_context_length=self.max_cache_len,
n_heads=self.num_key_value_heads,
head_dim=self.head_dim,
max_batch_size=layer.max_batch_size,
max_context_length=layer.max_cache_len,
n_heads=layer.num_heads,
head_dim=layer.head_dim,
dtype=dtype,
)
self.kv_cache.append(layer_cache)
@@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:

# For CustomRingKVCache, we need to handle the sequence length differently
layer_cache = self.kv_cache[layer_idx]
if self.is_sliding[layer_idx]:
if self.layers[layer_idx].is_sliding:
# CustomRingKVCache cache_position_manager which
# maintains cache position for each slot in the kv cache
# we return the max position + 1 to indicate max position
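The comment above is the crux of the sliding-window handling: a ring cache overwrites old slots, so the sequence length has to be recovered from the tracked positions rather than from a monotonically growing cache. A minimal, self-contained illustration of that idea (not the CustomRingKVCache API; the window size and names are assumed):

```python
import torch

# Hypothetical window size; real values come from the layer's sliding window.
window = 4
cache_positions = torch.full((window,), -1, dtype=torch.long)  # -1 marks an empty slot

# Write 6 token positions into a 4-slot ring: older slots get overwritten.
for pos in range(6):
    slot = pos % window
    cache_positions[slot] = pos

# "max position + 1" is the effective sequence length, mirroring get_seq_length above.
seq_len = int(cache_positions.max().item()) + 1
print(seq_len)  # 6
```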
@@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int):

def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype):
"""
Replace all KV caches in the module with ETCustomStaticCache.
Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache.
This modifies the model in place.

Args:
@@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
if getattr(module, "replace_cache", None) is not None:
static_cache = ETCustomStaticCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
module.replace_cache(static_cache)
else:
module.static_cache = ETCustomStaticCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
# Dont know why we need to this even though
@@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
if getattr(module, "replace_cache", None) is not None:
hybrid_cache = ETCustomHybridCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
module.replace_cache(hybrid_cache)
else:
module.cache = ETCustomHybridCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
# Register cache attributes for each layer
for i in range(len(module.cache.kv_cache)):
setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache)
setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache)
if module.cache.is_sliding[i]:
if module.cache.layers[i].is_sliding:
# Register cache_positions as buffer for sliding window layers
# This prevents it from being traced as a constant
module.register_buffer(
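Both hunks above swap attribute access on generation_config.cache_config for dict-style .get() calls, which suggests the cache config is treated as a plain mapping after the transformers bump. A small sketch of the difference, with assumed example values:

```python
# Assumed example values; the keys mirror the diff above.
cache_config = {"batch_size": 1, "max_cache_len": 128}

# Attribute access on a dict would raise; .get() degrades gracefully for missing keys.
batch_size = cache_config.get("batch_size")        # 1
max_cache_len = cache_config.get("max_cache_len")  # 128
device = cache_config.get("device")                # None, which the cache constructors above accept
```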
53 changes: 47 additions & 6 deletions optimum/executorch/modeling.py
@@ -16,6 +16,7 @@

import logging
import os
import shutil
from abc import ABC, abstractmethod
from pathlib import Path
from tempfile import TemporaryDirectory
@@ -24,6 +25,7 @@
import torch
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa
from transformers import (
AutoModelForCausalLM,
AutoModelForImageClassification,
@@ -102,6 +104,34 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedCon

self.stats = Stats()

# Initialize cleanup tracking
self._temp_dir = None

def __del__(self):
"""Clean up temporary files when the model instance is destroyed."""
Collaborator: shouldn't this already happen automatically?
Author (jackzhxng, Aug 14, 2025): Yeah, probably, but added just to be extra sure that it's cleaned up between tests.
self._cleanup_temp_resources()

def _cleanup_temp_resources(self):
"""Clean up temporary directory and files."""
if hasattr(self, "_temp_dir") and self._temp_dir is not None:
try:
if hasattr(self._temp_dir, "cleanup"):
# It's a TemporaryDirectory object
logging.info(f"Cleaning up temporary directory: {self._temp_dir.name}")
self._temp_dir.cleanup()
logging.info("Temporary directory cleanup completed")
elif isinstance(self._temp_dir, (str, Path)):
# It's a path
logging.info(f"Cleaning up temporary path: {self._temp_dir}")
shutil.rmtree(self._temp_dir, ignore_errors=True)
logging.info("Temporary path cleanup completed")
except Exception as e:
# Log cleanup errors for debugging
logging.warning(f"Error during temp directory cleanup: {e}")
pass
finally:
self._temp_dir = None
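Per the review thread above, __del__ is a belt-and-braces measure; relying on it alone makes cleanup timing depend on the garbage collector. A hypothetical test fixture (not part of this PR, and the from_pretrained kwargs are only illustrative) that forces deterministic cleanup between tests:

```python
import gc

import pytest

from optimum.executorch import ExecuTorchModelForCausalLM


@pytest.fixture
def exported_model():
    # Model id and kwargs are placeholders; any exportable checkpoint works here.
    model = ExecuTorchModelForCausalLM.from_pretrained(
        "HuggingFaceTB/SmolLM2-135M", export=True, recipe="xnnpack"
    )
    yield model
    model._cleanup_temp_resources()  # explicit, instead of waiting for __del__ / GC
    del model
    gc.collect()
```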

@abstractmethod
def forward(self, *args, **kwargs):
"""
@@ -242,7 +272,7 @@ def _export(
inferred_task = TasksManager.infer_task_from_model(cls.auto_model_class)
logging.info(f"Inferred task from model class: {inferred_task}")

save_dir = TemporaryDirectory()
save_dir = TemporaryDirectory(prefix="executorch_export_")
save_dir_path = Path(save_dir.name)

# Export to ExecuTorch and save the pte file to the temporary directory
@@ -266,7 +296,7 @@
for name, _ in executorch_progs.items():
models.update(cls._from_pretrained(save_dir_path, file_name=f"{name}.pte", config=config))

return models
return models, save_dir

def _save_pretrained(self, save_directory):
"""
@@ -298,6 +328,7 @@ def from_pretrained(
logger.info("Offline mode: setting `local_files_only=True`")
local_files_only = True

# See if model was already exported to ExecuTorch and uploaded to the HuggingFace repo.
_export = export
try:
if local_files_only and not os.path.isdir(model_id):
@@ -324,21 +355,21 @@
if export:
logger.warning(
f"The model {model_id} was already converted to the ExecuTorch IR but got `export=True`, the model will be converted to ExecuTorch once again. "
# "Don't forget to save the resulting model with `.save_pretrained()`"
)
_export = True
else:
logger.warning(
f"No ExecuTorch files were found for {model_id}, setting `export=True` to convert the model to the ExecuTorch IR. "
# "Don't forget to save the resulting model with `.save_pretrained()`"
)
except Exception as exception:
logger.warning(
f"Could not infer whether the model was already converted or not to the ExecuTorch IR, keeping `export={export}`.\n{exception}"
)

temp_dir = None
if _export:
models_dict = cls._export(
logging.info(f"Exporting {model_id} to ExecuTorch program...")
models_dict, temp_dir = cls._export(
model_id=model_id,
config=config,
revision=revision,
@@ -351,6 +382,9 @@
**kwargs,
)
else:
logging.info(
f"Pre-exported `.pte` artifact already exists in HuggingFace repo or provided file path for {model_id}, skipping export."
)
models_dict = {}
for pte_file in pte_files:
models_dict.update(
@@ -368,7 +402,14 @@
)
)

return cls(models_dict, config)
model_instance = cls(models_dict, config)

# Store the TemporaryDirectory reference to prevent GC
if temp_dir is not None:
model_instance._temp_dir = temp_dir
logging.info(f"Stored temp directory reference in model: {temp_dir.name}")

return model_instance


class ExecuTorchModelForSeq2SeqLM(ExecuTorchModelBase):
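The reason for threading save_dir out of _export and parking it on the model as _temp_dir: a TemporaryDirectory object removes its directory as soon as it is garbage-collected, which would pull the exported .pte out from under a live ExecuTorchModule (the segfault/bus error chased in the commit history). A standalone sketch of that behavior, not optimum code:

```python
import gc
from pathlib import Path
from tempfile import TemporaryDirectory

tmp = TemporaryDirectory(prefix="executorch_export_")
pte_path = Path(tmp.name) / "model.pte"      # stand-in for a real exported artifact
pte_path.write_bytes(b"\x00" * 16)

keepalive = tmp   # what model_instance._temp_dir does in from_pretrained above
tmp = None        # drop the local reference
gc.collect()
print(pte_path.exists())   # True: the held reference keeps the directory alive

keepalive.cleanup()        # what _cleanup_temp_resources / __del__ does later
print(pte_path.exists())   # False
```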
5 changes: 5 additions & 0 deletions optimum/exporters/executorch/__main__.py
@@ -15,6 +15,7 @@
"""Entry point to the optimum.exporters.executorch command line."""

import argparse
import logging
import os
import warnings
from pathlib import Path
@@ -130,10 +131,14 @@ def main_export(
kwargs["force_download"] = force_download
kwargs["config"] = config

# 1. Load model, apply source transformations, and torch.export() into a graph (ExportedProgram).
logging.info(f"Loading {model_name_or_path} and exporting to static graph...")
recipe_kwargs = kwargs.pop("recipe_kwargs", {})

model = task_func(model_name_or_path, **kwargs)

# 2. Export to ExecuTorch through ExecuTorch's lowering APIs.
logging.info(f"Lowering {model_name_or_path} to ExecuTorch...")
if not os.path.exists(output_dir):
os.makedirs(output_dir)

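The two numbered log lines describe the overall pipeline: torch.export() the eager model into an ExportedProgram, then lower it through ExecuTorch and serialize a .pte. A generic sketch of that flow on a toy module (the real recipes add source transforms, custom SDPA, quantization, and so on; the module here is illustrative):

```python
import torch
from executorch.exir import to_edge


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)

    def forward(self, x):
        return self.linear(x)


# 1. Export the eager model into a static graph (ExportedProgram).
exported = torch.export.export(TinyModel().eval(), (torch.randn(1, 8),))

# 2. Lower through ExecuTorch's APIs and write out the .pte artifact.
et_program = to_edge(exported).to_executorch()
with open("tiny_model.pte", "wb") as f:
    f.write(et_program.buffer)
```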
9 changes: 3 additions & 6 deletions optimum/exporters/executorch/convert.py
@@ -19,20 +19,17 @@
from pathlib import Path
from typing import Union

from transformers.integrations.executorch import sdpa_mask_without_vmap
from transformers.masking_utils import AttentionMaskInterface
from transformers.modeling_utils import AttentionInterface

from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward
from optimum.utils.import_utils import is_transformers_version

from .recipe_registry import discover_recipes, recipe_registry


AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward)
if is_transformers_version(">=", "4.53.0.dev0"):
from transformers.integrations.executorch import sdpa_mask_without_vmap
from transformers.masking_utils import AttentionMaskInterface

AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap)
AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap)


def export_to_executorch(
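With the version guard removed, both registrations now run unconditionally, so transformers >= 4.54 is assumed throughout. For context, a registered attention implementation is selected by name when loading a model; a hedged usage sketch (the checkpoint is a placeholder, not something this PR pins):

```python
from transformers import AutoModelForCausalLM

# "custom_sdpa" is the key registered on AttentionInterface / AttentionMaskInterface in convert.py above.
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M",        # placeholder model id
    attn_implementation="custom_sdpa",
)
```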