Improving modularity and code structure (#212)
* CI fetch depth 0

* VAD refactoring

* Update logo

* Add WhisperKitConfig

* Open whisperkit methods

* add missing @available

---------

Co-authored-by: BlaiseMuhirwa <blaisemuhirwa3@gmail.com>
Co-authored-by: ZachNagengast <znagengast@gmail.com>
3 people authored Oct 2, 2024
1 parent 3cd3ef1 commit c2f1b57
Showing 22 changed files with 371 additions and 263 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/development-tests.yml
@@ -25,10 +25,9 @@ jobs:
       reviews: ${{ steps.reviews.outputs.state }}
     permissions:
       pull-requests: read
+      contents: read
     steps:
       - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
       - name: Check Approvals
         id: reviews
         env:
1 change: 0 additions & 1 deletion .github/workflows/expo-update.yml
@@ -18,7 +18,6 @@ jobs:
       uses: actions/checkout@v4
       with:
         repository: seb-sep/whisper-kit-expo
-        fetch-depth: 0
         token: ${{ secrets.COMMITTER_TOKEN }}
         ref: main
15 changes: 7 additions & 8 deletions Examples/WhisperAX/WhisperAX/Views/ContentView.swift
@@ -990,14 +990,13 @@ struct ContentView: View {
 
         whisperKit = nil
         Task {
-            whisperKit = try await WhisperKit(
-                computeOptions: getComputeOptions(),
-                verbose: true,
-                logLevel: .debug,
-                prewarm: false,
-                load: false,
-                download: false
-            )
+            let config = WhisperKitConfig(computeOptions: getComputeOptions(),
+                                          verbose: true,
+                                          logLevel: .debug,
+                                          prewarm: false,
+                                          load: false,
+                                          download: false)
+            whisperKit = try await WhisperKit(config)
             guard let whisperKit = whisperKit else {
                 return
             }
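For reference, the config-based initialization this PR migrates app code to looks like the following minimal sketch (the option values are illustrative, not required settings):

```swift
import WhisperKit

Task {
    // Gather what used to be loose initializer arguments into a WhisperKitConfig...
    let config = WhisperKitConfig(
        verbose: true,
        logLevel: .debug,
        prewarm: false,
        load: false,
        download: false
    )
    // ...and hand the whole config to the new WhisperKit initializer.
    let whisperKit = try await WhisperKit(config)
}
```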
14 changes: 7 additions & 7 deletions Examples/WhisperAX/WhisperAXWatchApp/WhisperAXExampleView.swift
@@ -340,13 +340,13 @@ struct WhisperAXWatchView: View {
 
         whisperKit = nil
         Task {
-            whisperKit = try await WhisperKit(
-                verbose: true,
-                logLevel: .debug,
-                prewarm: false,
-                load: false,
-                download: false
-            )
+            let config = WhisperKitConfig(verbose: true,
+                                          logLevel: .debug,
+                                          prewarm: false,
+                                          load: false,
+                                          download: false)
+
+            whisperKit = try await WhisperKit(config)
             guard let whisperKit = whisperKit else {
                 return
             }
11 changes: 6 additions & 5 deletions README.md
@@ -2,11 +2,11 @@
 <div align="center">
 
 <a href="https://github.com/argmaxinc/WhisperKit#gh-light-mode-only">
-  <img src="https://github.com/argmaxinc/WhisperKit/assets/1981179/6ac3360b-2f5c-4392-a71a-05c5dda71093" alt="WhisperKit" width="20%" />
+  <img src="https://github.com/user-attachments/assets/f0699c07-c29f-45b6-a9c6-f6d491b8f791" alt="WhisperKit" width="20%" />
 </a>
 
 <a href="https://github.com/argmaxinc/WhisperKit#gh-dark-mode-only">
-  <img src="https://github.com/argmaxinc/WhisperKit/assets/1981179/a682ce21-80e0-4a98-a99f-836663538a4f" alt="WhisperKit" width="20%" />
+  <img src="https://github.com/user-attachments/assets/1be5e31c-de42-40ab-9b85-790cb911ed47" alt="WhisperKit" width="20%" />
 </a>
 
 # WhisperKit
@@ -92,13 +92,13 @@ Task {
 WhisperKit automatically downloads the recommended model for the device if not specified. You can also select a specific model by passing in the model name:
 
 ```swift
-let pipe = try? await WhisperKit(model: "large-v3")
+let pipe = try? await WhisperKit(WhisperKitConfig(model: "large-v3"))
 ```
 
 This method also supports glob search, so you can use wildcards to select a model:
 
 ```swift
-let pipe = try? await WhisperKit(model: "distil*large-v3")
+let pipe = try? await WhisperKit(WhisperKitConfig(model: "distil*large-v3"))
 ```
 
 Note that the model search must return a single model from the source repo, otherwise an error will be thrown.
@@ -110,7 +110,8 @@ For a list of available models, see our [HuggingFace repo](https://huggingface.c
 
 WhisperKit also comes with the supporting repo [`whisperkittools`](https://github.com/argmaxinc/whisperkittools) which lets you create and deploy your own fine tuned versions of Whisper in CoreML format to HuggingFace. Once generated, they can be loaded by simply changing the repo name to the one used to upload the model:
 
 ```swift
-let pipe = try? await WhisperKit(model: "large-v3", modelRepo: "username/your-model-repo")
+let config = WhisperKitConfig(model: "large-v3", modelRepo: "username/your-model-repo")
+let pipe = try? await WhisperKit(config)
 ```
 
 ### Swift CLI
@@ -48,9 +48,9 @@ open class VADAudioChunker: AudioChunking {
     private let windowPadding: Int
     private let vad: VoiceActivityDetector
 
-    init(windowPadding: Int = 16000, vad: VoiceActivityDetector = EnergyVAD()) {
+    public init(windowPadding: Int = 16000, vad: VoiceActivityDetector? = nil) {
         self.windowPadding = windowPadding
-        self.vad = vad
+        self.vad = vad ?? EnergyVAD()
     }
 
     private func splitOnMiddleOfLongestSilence(audioArray: [Float], startIndex: Int, endIndex: Int) -> Int {
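With the initializer now public and the detector parameter optional, client code outside the module can construct a chunker directly. A minimal sketch, assuming `EnergyVAD` is visible to callers:

```swift
// With no argument, the chunker falls back to the built-in EnergyVAD.
let defaultChunker = VADAudioChunker()

// Or inject a specific detector (any VoiceActivityDetector subclass).
let energyChunker = VADAudioChunker(windowPadding: 16000, vad: EnergyVAD())
```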
File renamed without changes.
@@ -6,23 +6,23 @@ import Foundation
 /// A base class for Voice Activity Detection (VAD), used to identify and separate segments of audio that contain human speech from those that do not.
 /// Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality.
 @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
-class VoiceActivityDetector {
+open class VoiceActivityDetector {
     /// The sample rate of the audio signal, in samples per second.
-    var sampleRate: Int
+    public let sampleRate: Int
 
     /// The length of each frame in samples.
-    var frameLengthSamples: Int
+    public let frameLengthSamples: Int
 
-    // The number of samples overlapping between consecutive frames.
-    var frameOverlapSamples: Int
+    /// The number of samples overlapping between consecutive frames.
+    public let frameOverlapSamples: Int
 
     /// Initializes a new `VoiceActivityDetector` instance with the specified parameters.
     /// - Parameters:
     ///   - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000.
     ///   - frameLengthSamples: The length of each frame in samples.
     ///   - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0.
     /// - Note: Subclasses should override the `voiceActivity(in:)` method to provide specific VAD functionality.
-    init(
+    public init(
         sampleRate: Int = 16000,
         frameLengthSamples: Int,
         frameOverlapSamples: Int = 0
@@ -35,14 +35,14 @@ class VoiceActivityDetector {
     /// Analyzes the provided audio waveform to determine which segments contain voice activity.
     /// - Parameter waveform: An array of `Float` values representing the audio waveform.
     /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence.
-    func voiceActivity(in waveform: [Float]) -> [Bool] {
+    open func voiceActivity(in waveform: [Float]) -> [Bool] {
         fatalError("`voiceActivity` must be implemented by subclass")
     }
 
     /// Calculates and returns a list of active audio chunks, each represented by a start and end index.
     /// - Parameter waveform: An array of `Float` values representing the audio waveform.
     /// - Returns: An array of tuples where each tuple contains the start and end indices of an active audio chunk.
-    func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] {
+    public func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] {
         let vad: [Bool] = voiceActivity(in: waveform)
         var result = [(startIndex: Int, endIndex: Int)]()
 
@@ -74,18 +74,18 @@
     /// Converts a voice activity index to the corresponding audio sample index.
     /// - Parameter index: The voice activity index to convert.
     /// - Returns: The corresponding audio sample index.
-    func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
+    public func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
         return index * frameLengthSamples
     }
 
-    func voiceActivityIndexToSeconds(_ index: Int) -> Float {
+    public func voiceActivityIndexToSeconds(_ index: Int) -> Float {
         return Float(voiceActivityIndexToAudioSampleIndex(index)) / Float(sampleRate)
     }
 
     /// Identifies the longest continuous period of silence within the provided voice activity detection results.
     /// - Parameter vadResult: An array of `Bool` values representing voice activity detection results.
     /// - Returns: A tuple containing the start and end indices of the longest silence period, or `nil` if no silence is found.
-    func findLongestSilence(in vadResult: [Bool]) -> (startIndex: Int, endIndex: Int)? {
+    public func findLongestSilence(in vadResult: [Bool]) -> (startIndex: Int, endIndex: Int)? {
         var longestStartIndex: Int?
         var longestEndIndex: Int?
         var longestCount = 0
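Since the class is now `open` with an overridable `voiceActivity(in:)`, third-party detectors become possible. A hedged sketch of a hypothetical amplitude-threshold subclass (`ThresholdVAD`, its threshold, and the frame size are illustrative only):

```swift
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
final class ThresholdVAD: VoiceActivityDetector {
    private let threshold: Float

    init(threshold: Float = 0.02) {
        self.threshold = threshold
        // 16 kHz audio split into 0.1 s frames with no overlap.
        super.init(sampleRate: 16000, frameLengthSamples: 1600)
    }

    // One Bool per frame: voiced when the frame's peak amplitude exceeds the threshold.
    override func voiceActivity(in waveform: [Float]) -> [Bool] {
        stride(from: 0, to: waveform.count, by: frameLengthSamples).map { start in
            let end = min(start + frameLengthSamples, waveform.count)
            return (waveform[start..<end].max() ?? 0) > threshold
        }
    }
}
```

The base class's `calculateActiveChunks(in:)` and index-conversion helpers then work unchanged on top of any such subclass.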
205 changes: 205 additions & 0 deletions Sources/WhisperKit/Core/Configurations.swift
@@ -0,0 +1,205 @@
// For licensing see accompanying LICENSE.md file.
// Copyright © 2024 Argmax, Inc. All rights reserved.

import Foundation

/// Configuration to initialize WhisperKit
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
open class WhisperKitConfig {
    /// Name of the Whisper model to use
    public var model: String?
    /// Base URL for downloading models
    public var downloadBase: URL?
    /// Repository for downloading models
    public var modelRepo: String?

    /// Folder to store models
    public var modelFolder: String?
    /// Folder to store tokenizers
    public var tokenizerFolder: URL?

    /// Model compute options, see `ModelComputeOptions`
    public var computeOptions: ModelComputeOptions?
    /// Audio processor for the model
    public var audioProcessor: (any AudioProcessing)?
    /// Feature extractor for the model
    public var featureExtractor: (any FeatureExtracting)?
    /// Audio encoder for the model
    public var audioEncoder: (any AudioEncoding)?
    /// Text decoder for the model
    public var textDecoder: (any TextDecoding)?
    /// Filters applied to the decoder's logits
    public var logitsFilters: [any LogitsFiltering]?
    /// Seeker used to find segment boundaries
    public var segmentSeeker: (any SegmentSeeking)?

    /// Enable extra verbosity for logging
    public var verbose: Bool
    /// Maximum log level
    public var logLevel: Logging.LogLevel

    /// Enable model prewarming
    public var prewarm: Bool?
    /// Load models if available
    public var load: Bool?
    /// Download models if not available
    public var download: Bool
    /// Use background download session
    public var useBackgroundDownloadSession: Bool

    public init(model: String? = nil,
                downloadBase: URL? = nil,
                modelRepo: String? = nil,
                modelFolder: String? = nil,
                tokenizerFolder: URL? = nil,
                computeOptions: ModelComputeOptions? = nil,
                audioProcessor: (any AudioProcessing)? = nil,
                featureExtractor: (any FeatureExtracting)? = nil,
                audioEncoder: (any AudioEncoding)? = nil,
                textDecoder: (any TextDecoding)? = nil,
                logitsFilters: [any LogitsFiltering]? = nil,
                segmentSeeker: (any SegmentSeeking)? = nil,
                verbose: Bool = true,
                logLevel: Logging.LogLevel = .info,
                prewarm: Bool? = nil,
                load: Bool? = nil,
                download: Bool = true,
                useBackgroundDownloadSession: Bool = false
    ) {
        self.model = model
        self.downloadBase = downloadBase
        self.modelRepo = modelRepo
        self.modelFolder = modelFolder
        self.tokenizerFolder = tokenizerFolder
        self.computeOptions = computeOptions
        self.audioProcessor = audioProcessor
        self.featureExtractor = featureExtractor
        self.audioEncoder = audioEncoder
        self.textDecoder = textDecoder
        self.logitsFilters = logitsFilters
        self.segmentSeeker = segmentSeeker
        self.verbose = verbose
        self.logLevel = logLevel
        self.prewarm = prewarm
        self.load = load
        self.download = download
        self.useBackgroundDownloadSession = useBackgroundDownloadSession
    }
}

/// Options for how to transcribe an audio file using WhisperKit.
///
/// - Parameters:
///   - verbose: Whether to display the text being decoded to the console.
///              If true, displays all details; if false, displays minimal details.
///   - task: Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')
///   - language: Language spoken in the audio
///   - temperature: Temperature to use for sampling.
///   - temperatureIncrementOnFallback: Increment which will be
///     successively added to temperature upon failures according to either `compressionRatioThreshold`
///     or `logProbThreshold`.
///   - temperatureFallbackCount: Number of times to increment temperature on fallback.
///   - sampleLength: The maximum number of tokens to sample.
///   - topK: Number of candidates when sampling with non-zero temperature.
///   - usePrefillPrompt: If true, the prefill tokens will be forced according to task and language settings.
///   - usePrefillCache: If true, the kv cache will be prefilled based on the prefill data mlmodel.
///   - detectLanguage: Use this in conjunction with `usePrefillPrompt: true` to detect the language of the input audio.
///   - skipSpecialTokens: Whether to skip special tokens in the output.
///   - withoutTimestamps: Whether to exclude timestamps from the transcription result.
///   - wordTimestamps: Whether to include word-level timestamps in the transcription result.
///   - maxInitialTimestamp: Maximal initial timestamp.
///   - clipTimestamps: Array of timestamps (in seconds) to split the audio into segments for transcription.
///   - promptTokens: Array of token IDs to use as the conditioning prompt for the decoder. These are prepended to the prefill tokens.
///   - prefixTokens: Array of token IDs to use as the initial prefix for the decoder. These are appended to the prefill tokens.
///   - suppressBlank: If true, blank tokens will be suppressed during decoding.
///   - supressTokens: List of token IDs to suppress during decoding.
///   - compressionRatioThreshold: If the compression ratio of the transcription text is above this value, it is too repetitive and treated as failed.
///   - logProbThreshold: If the average log probability over sampled tokens is below this value, treat as failed.
///   - firstTokenLogProbThreshold: If the log probability over the first sampled token is below this value, treat as failed.
///   - noSpeechThreshold: If the no speech probability is higher than this value AND the average log
///     probability over sampled tokens is below `logProbThreshold`, consider the segment as silent.
///   - concurrentWorkerCount: Number of concurrent workers to use while decoding.
///   - chunkingStrategy: Strategy for chunking audio into smaller segments before transcription.
///   - voiceActivityDetector: Voice activity detector to use when chunking with a VAD strategy.
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
public struct DecodingOptions {
    public var verbose: Bool
    public var task: DecodingTask
    public var language: String?
    public var temperature: Float
    public var temperatureIncrementOnFallback: Float
    public var temperatureFallbackCount: Int
    public var sampleLength: Int
    public var topK: Int
    public var usePrefillPrompt: Bool
    public var usePrefillCache: Bool
    public var detectLanguage: Bool
    public var skipSpecialTokens: Bool
    public var withoutTimestamps: Bool
    public var wordTimestamps: Bool
    public var maxInitialTimestamp: Float?
    public var clipTimestamps: [Float]
    public var promptTokens: [Int]?
    public var prefixTokens: [Int]?
    public var suppressBlank: Bool
    public var supressTokens: [Int]
    public var compressionRatioThreshold: Float?
    public var logProbThreshold: Float?
    public var firstTokenLogProbThreshold: Float?
    public var noSpeechThreshold: Float?
    public var concurrentWorkerCount: Int
    public var chunkingStrategy: ChunkingStrategy?
    public var voiceActivityDetector: VoiceActivityDetector?

    public init(
        verbose: Bool = false,
        task: DecodingTask = .transcribe,
        language: String? = nil,
        temperature: Float = 0.0,
        temperatureIncrementOnFallback: Float = 0.2,
        temperatureFallbackCount: Int = 5,
        sampleLength: Int = Constants.maxTokenContext,
        topK: Int = 5,
        usePrefillPrompt: Bool = true,
        usePrefillCache: Bool = true,
        detectLanguage: Bool? = nil,
        skipSpecialTokens: Bool = false,
        withoutTimestamps: Bool = false,
        wordTimestamps: Bool = false,
        maxInitialTimestamp: Float? = nil,
        clipTimestamps: [Float] = [],
        promptTokens: [Int]? = nil,
        prefixTokens: [Int]? = nil,
        suppressBlank: Bool = false,
        supressTokens: [Int]? = nil,
        compressionRatioThreshold: Float? = 2.4,
        logProbThreshold: Float? = -1.0,
        firstTokenLogProbThreshold: Float? = -1.5,
        noSpeechThreshold: Float? = 0.6,
        concurrentWorkerCount: Int = 16,
        chunkingStrategy: ChunkingStrategy? = nil,
        voiceActivityDetector: VoiceActivityDetector? = nil
    ) {
        self.verbose = verbose
        self.task = task
        self.language = language
        self.temperature = temperature
        self.temperatureIncrementOnFallback = temperatureIncrementOnFallback
        self.temperatureFallbackCount = temperatureFallbackCount
        self.sampleLength = sampleLength
        self.topK = topK
        self.usePrefillPrompt = usePrefillPrompt
        self.usePrefillCache = usePrefillCache
        self.detectLanguage = detectLanguage ?? !usePrefillPrompt // If prefill is false, detect language by default
        self.skipSpecialTokens = skipSpecialTokens
        self.withoutTimestamps = withoutTimestamps
        self.wordTimestamps = wordTimestamps
        self.maxInitialTimestamp = maxInitialTimestamp
        self.clipTimestamps = clipTimestamps
        self.promptTokens = promptTokens
        self.prefixTokens = prefixTokens
        self.suppressBlank = suppressBlank
        self.supressTokens = supressTokens ?? [] // nonSpeechTokens() // TODO: implement these as default
        self.compressionRatioThreshold = compressionRatioThreshold
        self.logProbThreshold = logProbThreshold
        self.firstTokenLogProbThreshold = firstTokenLogProbThreshold
        self.noSpeechThreshold = noSpeechThreshold
        self.concurrentWorkerCount = concurrentWorkerCount
        self.chunkingStrategy = chunkingStrategy
        self.voiceActivityDetector = voiceActivityDetector
    }
}
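Taken together, a plausible end-to-end usage sketch (the model name, audio path, and the `transcribe(audioPath:decodeOptions:)` call shape are assumptions based on the README, not guaranteed by this diff):

```swift
import WhisperKit

Task {
    // Pipeline-level setup lives in WhisperKitConfig...
    let config = WhisperKitConfig(model: "large-v3", verbose: true, logLevel: .info)
    let pipe = try await WhisperKit(config)

    // ...while per-request behavior lives in DecodingOptions.
    let options = DecodingOptions(
        task: .transcribe,
        language: "en",
        temperature: 0.0,
        wordTimestamps: true
    )

    // "path/to/audio.wav" is a placeholder path.
    let results = try await pipe.transcribe(audioPath: "path/to/audio.wav", decodeOptions: options)
    print(results.map(\.text).joined())
}
```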