docling-project
diff --git a/‎docling/datamodel/asr_model_specs.py‎
Lines changed: 149 additions & 30 deletions b/‎docling/datamodel/asr_model_specs.py‎
Lines changed: 149 additions & 30 deletions
diff --git a/‎docling/datamodel/pipeline_options_asr_model.py‎
Lines changed: 20 additions & 1 deletion b/‎docling/datamodel/pipeline_options_asr_model.py‎
Lines changed: 20 additions & 1 deletion
diff --git a/‎docling/pipeline/asr_pipeline.py‎
Lines changed: 135 additions & 0 deletions b/‎docling/pipeline/asr_pipeline.py‎
Lines changed: 135 additions & 0 deletions
@@ -11,6 +11,7 @@
     # ApiAsrOptions,
     InferenceAsrFramework,
     InlineAsrNativeWhisperOptions,
+    InlineAsrMlxWhisperOptions,
     TransformersModelType,
 )
 
@@ -27,16 +28,54 @@
     max_time_chunk=30.0,
 )
 
-WHISPER_SMALL = InlineAsrNativeWhisperOptions(
-    repo_id="small",
-    inference_framework=InferenceAsrFramework.WHISPER,
-    verbose=True,
-    timestamps=True,
-    word_timestamps=True,
-    temperature=0.0,
-    max_new_tokens=256,
-    max_time_chunk=30.0,
-)
+def _get_whisper_small_model():
+    """
+    Build the Whisper Small options best suited to the current machine.
+
+    Returns:
+        ``InlineAsrMlxWhisperOptions`` for ``mlx-community/whisper-small-mlx``
+        when PyTorch reports a usable MPS backend *and* the ``mlx_whisper``
+        package is installed; otherwise ``InlineAsrNativeWhisperOptions`` for
+        the native ``small`` model.
+
+    This function is called at module import time to populate
+    ``WHISPER_SMALL``, so the capability probes below must never raise.
+    """
+    import importlib.util
+
+    # Probe for an MPS backend (Apple Silicon). torch may be missing
+    # (ImportError), or be a build without ``torch.backends.mps``
+    # (AttributeError); either way there is no MPS to use.
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except (ImportError, AttributeError):
+        has_mps = False
+
+    # Only check that mlx-whisper is installed. Importing the package here
+    # just to test for its presence would be unnecessary work every time this
+    # spec module is imported; the pipeline imports it when it is needed.
+    try:
+        has_mlx_whisper = importlib.util.find_spec("mlx_whisper") is not None
+    except (ImportError, ValueError):
+        has_mlx_whisper = False
+
+    # Use MLX Whisper only if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-small-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+
+    # Fallback: native Whisper, same settings as before this auto-selection.
+    return InlineAsrNativeWhisperOptions(
+        repo_id="small",
+        inference_framework=InferenceAsrFramework.WHISPER,
+        verbose=True,
+        timestamps=True,
+        word_timestamps=True,
+        temperature=0.0,
+        max_new_tokens=256,
+        max_time_chunk=30.0,
+    )
+
+
+# Resolved once, at import time, for the hardware this process runs on.
+WHISPER_SMALL = _get_whisper_small_model()
 
 WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     repo_id="medium",
@@ -49,16 +88,54 @@
     max_time_chunk=30.0,
 )
 
-WHISPER_BASE = InlineAsrNativeWhisperOptions(
-    repo_id="base",
-    inference_framework=InferenceAsrFramework.WHISPER,
-    verbose=True,
-    timestamps=True,
-    word_timestamps=True,
-    temperature=0.0,
-    max_new_tokens=256,
-    max_time_chunk=30.0,
-)
+def _get_whisper_base_model():
+    """
+    Build the Whisper Base options best suited to the current machine.
+
+    Returns:
+        ``InlineAsrMlxWhisperOptions`` for ``mlx-community/whisper-base-mlx``
+        when PyTorch reports a usable MPS backend *and* the ``mlx_whisper``
+        package is installed; otherwise ``InlineAsrNativeWhisperOptions`` for
+        the native ``base`` model.
+
+    This function is called at module import time to populate
+    ``WHISPER_BASE``, so the capability probes below must never raise.
+    """
+    import importlib.util
+
+    # Probe for an MPS backend (Apple Silicon). torch may be missing
+    # (ImportError), or be a build without ``torch.backends.mps``
+    # (AttributeError); either way there is no MPS to use.
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except (ImportError, AttributeError):
+        has_mps = False
+
+    # Only check that mlx-whisper is installed. Importing the package here
+    # just to test for its presence would be unnecessary work every time this
+    # spec module is imported; the pipeline imports it when it is needed.
+    try:
+        has_mlx_whisper = importlib.util.find_spec("mlx_whisper") is not None
+    except (ImportError, ValueError):
+        has_mlx_whisper = False
+
+    # Use MLX Whisper only if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-base-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+
+    # Fallback: native Whisper, same settings as before this auto-selection.
+    return InlineAsrNativeWhisperOptions(
+        repo_id="base",
+        inference_framework=InferenceAsrFramework.WHISPER,
+        verbose=True,
+        timestamps=True,
+        word_timestamps=True,
+        temperature=0.0,
+        max_new_tokens=256,
+        max_time_chunk=30.0,
+    )
+
+
+# Resolved once, at import time, for the hardware this process runs on.
+WHISPER_BASE = _get_whisper_base_model()
 
 WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     repo_id="large",
@@ -71,16 +148,58 @@
     max_time_chunk=30.0,
 )
 
-WHISPER_TURBO = InlineAsrNativeWhisperOptions(
-    repo_id="turbo",
-    inference_framework=InferenceAsrFramework.WHISPER,
-    verbose=True,
-    timestamps=True,
-    word_timestamps=True,
-    temperature=0.0,
-    max_new_tokens=256,
-    max_time_chunk=30.0,
-)
+def _get_whisper_turbo_model():
+    """
+    Build the Whisper Turbo options best suited to the current machine.
+
+    Returns:
+        ``InlineAsrMlxWhisperOptions`` for ``mlx-community/whisper-turbo``
+        when PyTorch reports a usable MPS backend *and* the ``mlx_whisper``
+        package is installed; otherwise ``InlineAsrNativeWhisperOptions`` for
+        the native ``turbo`` model.
+
+    This function is called at module import time to populate
+    ``WHISPER_TURBO``, so the capability probes below must never raise.
+    """
+    import importlib.util
+
+    # Probe for an MPS backend (Apple Silicon). torch may be missing
+    # (ImportError), or be a build without ``torch.backends.mps``
+    # (AttributeError); either way there is no MPS to use.
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except (ImportError, AttributeError):
+        has_mps = False
+
+    # Only check that mlx-whisper is installed. Importing the package here
+    # just to test for its presence would be unnecessary work every time this
+    # spec module is imported; the pipeline imports it when it is needed.
+    try:
+        has_mlx_whisper = importlib.util.find_spec("mlx_whisper") is not None
+    except (ImportError, ValueError):
+        has_mlx_whisper = False
+
+    # Use MLX Whisper only if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-turbo",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+
+    # Fallback: native Whisper, same settings as before this auto-selection.
+    return InlineAsrNativeWhisperOptions(
+        repo_id="turbo",
+        inference_framework=InferenceAsrFramework.WHISPER,
+        verbose=True,
+        timestamps=True,
+        word_timestamps=True,
+        temperature=0.0,
+        max_new_tokens=256,
+        max_time_chunk=30.0,
+    )
+
+
+# Resolved once, at import time, for the hardware this process runs on.
+WHISPER_TURBO = _get_whisper_turbo_model()
+
+# Note: WHISPER_SMALL, WHISPER_BASE and WHISPER_TURBO automatically resolve to
+# their MLX Whisper variants on Apple Silicon (MPS) systems with mlx-whisper
+# installed, so no separate MLX-specific specs are needed for those three.
+# WHISPER_MEDIUM and WHISPER_LARGE are still defined as native Whisper specs.
 
 
 class AsrModelType(str, Enum):
 
@@ -17,7 +17,7 @@ class BaseAsrOptions(BaseModel):
 
 
 class InferenceAsrFramework(str, Enum):
-    # MLX = "mlx" # disabled for now
+    MLX = "mlx"
     # TRANSFORMERS = "transformers" # disabled for now
     WHISPER = "whisper"
 
@@ -55,3 +55,22 @@ class InlineAsrNativeWhisperOptions(InlineAsrOptions):
         AcceleratorDevice.CUDA,
     ]
     word_timestamps: bool = True
+
+
+class InlineAsrMlxWhisperOptions(InlineAsrOptions):
+    """
+    Options for running Whisper through the mlx-whisper library.
+
+    MLX Whisper targets Apple Silicon devices, so MPS is the only accelerator
+    device declared as supported. The language, task, word-timestamp and
+    threshold fields below are passed as keyword arguments to
+    ``mlx_whisper.transcribe`` by the ASR pipeline.
+    """
+    inference_framework: InferenceAsrFramework = InferenceAsrFramework.MLX
+
+    # Language passed to the transcriber. NOTE(review): this defaults to
+    # English rather than auto-detection - confirm that is intended for
+    # non-English audio.
+    language: str = "en"
+    task: str = "transcribe"  # "transcribe" or "translate"
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.MPS,  # MLX is optimized for Apple Silicon
+    ]
+    # When True, per-word start/end times are collected for each segment.
+    word_timestamps: bool = True
+    # The three thresholds below are forwarded unchanged to mlx-whisper;
+    # presumably they have the same meaning as in OpenAI Whisper - verify
+    # against the mlx-whisper documentation before tuning.
+    no_speech_threshold: float = 0.6  # Threshold for detecting speech
+    logprob_threshold: float = -1.0  # Log probability threshold
+    compression_ratio_threshold: float = 2.4  # Compression ratio threshold
@@ -32,6 +32,7 @@
 )
 from docling.datamodel.pipeline_options_asr_model import (
     InlineAsrNativeWhisperOptions,
+    InlineAsrMlxWhisperOptions,
     # AsrResponseFormat,
     InlineAsrOptions,
 )
@@ -201,6 +202,130 @@ def transcribe(self, fpath: Path) -> list[_ConversationItem]:
         return convo
 
 
+class _MlxWhisperModel:
+    """ASR backend that transcribes audio files with the mlx-whisper library."""
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        asr_options: InlineAsrMlxWhisperOptions,
+    ):
+        """
+        Transcriber using MLX Whisper for Apple Silicon optimization.
+
+        Args:
+            enabled: When False nothing is imported or configured; only
+                ``self.enabled`` is set.
+            artifacts_path: Logged for diagnostics only; it is not used to
+                locate the model.
+            accelerator_options: Its ``device`` is resolved against
+                ``asr_options.supported_devices`` via ``decide_device``.
+            asr_options: Model repo id and transcription settings.
+
+        Raises:
+            ImportError: If ``enabled`` is True and mlx-whisper is not
+                installed.
+        """
+        self.enabled = enabled
+
+        _log.info(f"artifacts-path: {artifacts_path}")
+        _log.info(f"accelerator_options: {accelerator_options}")
+
+        if self.enabled:
+            try:
+                import mlx_whisper  # type: ignore
+            except ImportError as exc:
+                # Chain the original error so the real import failure
+                # (e.g. a missing transitive dependency) stays visible.
+                raise ImportError(
+                    "mlx-whisper is not installed. Please install it via `pip install mlx-whisper` or do `uv sync --extra asr`."
+                ) from exc
+            self.asr_options = asr_options
+            self.mlx_whisper = mlx_whisper
+
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=asr_options.supported_devices,
+            )
+            _log.info(f"Available device for MLX Whisper: {self.device}")
+
+            self.model_name = asr_options.repo_id
+            _log.info(f"loading _MlxWhisperModel({self.model_name})")
+
+            # No model object is created here: the repo id is handed to
+            # mlx_whisper.transcribe() as `path_or_hf_repo` on every call.
+            self.model_path = self.model_name
+
+            # Store MLX-specific options
+            self.language = asr_options.language
+            self.task = asr_options.task
+            self.word_timestamps = asr_options.word_timestamps
+            self.no_speech_threshold = asr_options.no_speech_threshold
+            self.logprob_threshold = asr_options.logprob_threshold
+            self.compression_ratio_threshold = asr_options.compression_ratio_threshold
+
+    def run(self, conv_res: ConversionResult) -> ConversionResult:
+        """
+        Transcribe the input audio file and store the result as a document.
+
+        Args:
+            conv_res: Conversion result whose ``input.file`` is the audio file.
+
+        Returns:
+            The same ``conv_res``. On success it carries a new
+            ``DoclingDocument`` with one text item per transcribed segment
+            and status ``SUCCESS``; on any error the status is ``FAILURE``.
+        """
+        audio_path: Path = Path(conv_res.input.file).resolve()
+
+        try:
+            conversation = self.transcribe(audio_path)
+
+            # Ensure we have a proper DoclingDocument
+            origin = DocumentOrigin(
+                filename=conv_res.input.file.name or "audio.wav",
+                mimetype="audio/x-wav",
+                binary_hash=conv_res.input.document_hash,
+            )
+            conv_res.document = DoclingDocument(
+                name=conv_res.input.file.stem or "audio.wav", origin=origin
+            )
+
+            for citem in conversation:
+                conv_res.document.add_text(
+                    label=DocItemLabel.TEXT, text=citem.to_string()
+                )
+
+            conv_res.status = ConversionStatus.SUCCESS
+            return conv_res
+
+        except Exception as exc:
+            # Deliberately broad: any failure is logged and reported through
+            # the FAILURE status below instead of being raised to the caller.
+            _log.error(f"MLX Audio transcription has an error: {exc}")
+
+        conv_res.status = ConversionStatus.FAILURE
+        return conv_res
+
+    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
+        """
+        Transcribe audio using MLX Whisper.
+
+        Args:
+            fpath: Path to audio file
+
+        Returns:
+            One conversation item per returned segment, each with the
+            segment's start/end times and stripped text. Word-level items
+            are attached only when ``word_timestamps`` is enabled and the
+            segment contains a ``"words"`` entry.
+        """
+        result = self.mlx_whisper.transcribe(
+            str(fpath),
+            path_or_hf_repo=self.model_path,
+            language=self.language,
+            task=self.task,
+            word_timestamps=self.word_timestamps,
+            no_speech_threshold=self.no_speech_threshold,
+            logprob_threshold=self.logprob_threshold,
+            compression_ratio_threshold=self.compression_ratio_threshold,
+        )
+
+        convo: list[_ConversationItem] = []
+
+        # MLX Whisper returns segments similar to native Whisper
+        for segment in result.get("segments", []):
+            # Build the word list once instead of initialising it twice and
+            # appending in a manual loop.
+            words: list[_ConversationWord] = []
+            if self.word_timestamps and "words" in segment:
+                words = [
+                    _ConversationWord(
+                        start_time=word_data.get("start"),
+                        end_time=word_data.get("end"),
+                        text=word_data.get("word", ""),
+                    )
+                    for word_data in segment["words"]
+                ]
+
+            convo.append(
+                _ConversationItem(
+                    start_time=segment.get("start"),
+                    end_time=segment.get("end"),
+                    text=segment.get("text", "").strip(),
+                    words=words,
+                )
+            )
+
+        return convo
+
+
 class AsrPipeline(BasePipeline):
     def __init__(self, pipeline_options: AsrPipelineOptions):
         super().__init__(pipeline_options)
@@ -218,6 +343,16 @@ def __init__(self, pipeline_options: AsrPipelineOptions):
                 accelerator_options=pipeline_options.accelerator_options,
                 asr_options=asr_options,
             )
+        elif isinstance(self.pipeline_options.asr_options, InlineAsrMlxWhisperOptions):
+            asr_options: InlineAsrMlxWhisperOptions = (
+                self.pipeline_options.asr_options
+            )
+            self._model = _MlxWhisperModel(
+                enabled=True,  # must be always enabled for this pipeline to make sense.
+                artifacts_path=self.artifacts_path,
+                accelerator_options=pipeline_options.accelerator_options,
+                asr_options=asr_options,
+            )
         else:
             _log.error(f"No model support for {self.pipeline_options.asr_options}")