Fix timestamp rules filter
- Also adds back missing language property from merge
ZachNagengast committed Mar 28, 2024
1 parent: d01bca4 · commit: 9e215f0
Showing 3 changed files with 34 additions and 28 deletions.
26 changes: 15 additions & 11 deletions Sources/WhisperKit/Core/LogitsFilter.swift
@@ -74,9 +74,11 @@ open class TimestampRulesFilter: LogitsFiltering {
}

public func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray {
- guard let sampleBegin = sampleBegin(for: tokens) else {
+ guard let sampleBegin = sampleBegin(for: tokens),
+       sampleBegin > tokens.count else {
return logits
}

// suppress <|notimestamps|> which is handled by `withoutTimestamps`
logits.fill(indexes: [[0, 0, specialTokens.noTimestampsToken as NSNumber]], with: -FloatType.infinity)

@@ -109,15 +111,17 @@ open class TimestampRulesFilter: LogitsFiltering {
}
}

- if tokens.count == sampleBegin {
- // suppress generating non-timestamp tokens at the beginning
- logits.fillLastDimension(indexes: 0..<specialTokens.timeTokenBegin, with: -FloatType.infinity)
- if let maxInitialTimestampIndex {
- // apply the `maxInitialTimestamp` option
- let lastAllowed = specialTokens.timeTokenBegin + maxInitialTimestampIndex + 1
- logits.fillLastDimension(indexes: lastAllowed..<logits.count, with: -FloatType.infinity)
- }
- }
+ // TODO: Allow model to predict initial timestamp
+ // Currently initial timestamp is forced to <|0.00|> every time
+ // if tokens.count == sampleBegin {
+ // // suppress generating non-timestamp tokens at the beginning
+ // logits.fillLastDimension(indexes: 0..<specialTokens.timeTokenBegin, with: -FloatType.infinity)
+ // if let maxInitialTimestampIndex {
+ // // apply the `maxInitialTimestamp` option
+ // let lastAllowed = specialTokens.timeTokenBegin + maxInitialTimestampIndex + 1
+ // logits.fillLastDimension(indexes: lastAllowed..<logits.count, with: -FloatType.infinity)
+ // }
+ // }

// if sum of probability over timestamps is above any other token, sample timestamp
if sumOfProbabilityOverTimestampsIsAboveAnyOtherToken(logits: logits, timeTokenBegin: specialTokens.timeTokenBegin) {
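The check on the last line above applies Whisper's timestamp rule: once the total probability mass on timestamp tokens exceeds the probability of every individual text token, a timestamp must be sampled. A rough sketch of that comparison over a plain logits array (the helper name and the log-sum-exp arithmetic are illustrative, not the WhisperKit implementation; comparing raw logits is enough because the softmax normalizer cancels on both sides):

import Foundation

// Illustrative helper: true when log(sum of timestamp probabilities)
// exceeds the log-probability of the best non-timestamp token.
func timestampsOutweighText(logits: [Float], timeTokenBegin: Int) -> Bool {
    guard timeTokenBegin > 0, timeTokenBegin < logits.count else { return false }

    // log-sum-exp over the timestamp region, stabilized by its max logit
    let timestampLogits = logits[timeTokenBegin...]
    let maxTimestamp = timestampLogits.max() ?? -Float.infinity
    let sumExp = timestampLogits.reduce(0.0) { $0 + exp(Double($1 - maxTimestamp)) }
    let timestampLogSum = Double(maxTimestamp) + log(sumExp)

    // best single text-token logit
    let maxTextLogit = logits[..<timeTokenBegin].max() ?? -Float.infinity
    return timestampLogSum > Double(maxTextLogit)
}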
@@ -242,7 +246,7 @@ open class TimestampRulesFilter: LogitsFiltering {


@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
- public class LanguageLogitsFilter: LogitsFiltering {
+ open class LanguageLogitsFilter: LogitsFiltering {
let allLanguageTokens: Set<Int>
let logitsDim: Int
let sampleBegin: Int
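For reference, the block this commit comments out earlier in this file (the `// TODO: Allow model to predict initial timestamp` section) enforced two rules at the first sampled position: only timestamp tokens were allowed, and the first timestamp could not land later than `maxInitialTimestamp`. A minimal sketch of that constraint over a plain array instead of an MLMultiArray (the function name and signature are illustrative, not WhisperKit API):

// Illustrative sketch of the initial-timestamp constraint.
func maskInitialTimestamps(
    logits: inout [Float],
    timeTokenBegin: Int,
    maxInitialTimestampIndex: Int?
) {
    // At the first sampled position, suppress every non-timestamp token.
    for index in 0..<timeTokenBegin {
        logits[index] = -Float.infinity
    }
    // Optionally cap how late the first timestamp may be: anything beyond
    // timeTokenBegin + maxInitialTimestampIndex is also suppressed.
    if let maxInitialTimestampIndex {
        let lastAllowed = timeTokenBegin + maxInitialTimestampIndex + 1
        if lastAllowed < logits.count {
            for index in lastAllowed..<logits.count {
                logits[index] = -Float.infinity
            }
        }
    }
}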
30 changes: 16 additions & 14 deletions Sources/WhisperKit/Core/Models.swift
@@ -884,7 +884,7 @@ public protocol WhisperTokenizer: Tokenizer {
struct WhisperTokenizerWrapper: WhisperTokenizer {
let tokenizer: any Tokenizer
let specialTokens: SpecialTokens

init(tokenizer: any Tokenizer) {
self.tokenizer = tokenizer
self.specialTokens = SpecialTokens(
@@ -904,37 +904,37 @@ struct WhisperTokenizerWrapper: WhisperTokenizer {
private func splitTokensOnUnicode(tokens: [Int]) -> (words: [String], wordTokens: [[Int]]) {
let decodedFull = tokenizer.decode(tokens: tokens)
let replacementString = "\u{fffd}"

var words: [String] = []
var wordTokens: [[Int]] = []
var currentTokens: [Int] = []
var unicodeOffset = 0

for token in tokens {
currentTokens.append(token)
let decoded = tokenizer.decode(tokens: currentTokens)

var hasUnicodeInFullString = false
if let range = decoded.range(of: replacementString) {
hasUnicodeInFullString = decodedFull[range] == replacementString
}

if !decoded.contains(replacementString) || hasUnicodeInFullString {
words.append(decoded)
wordTokens.append(currentTokens)
currentTokens = []
unicodeOffset += decoded.count
}
}

return (words, wordTokens)
}

private func splitTokensOnSpaces(tokens: [Int]) -> (words: [String], wordTokens: [[Int]]) {
let (subwords, subwordTokensList) = splitTokensOnUnicode(tokens: tokens)
var words: [String] = []
var wordTokens: [[Int]] = []

for (subword, subwordTokens) in zip(subwords, subwordTokensList) {
let special = subwordTokens.first! >= specialTokens.specialTokenBegin
let withSpace = subword.hasPrefix(" ")
@@ -950,10 +950,10 @@ struct WhisperTokenizerWrapper: WhisperTokenizer {
wordTokens[words.count - 1].append(contentsOf: subwordTokens)
}
}

return (words, wordTokens)
}

private func isPunctuation(_ text: String, tokenRange: Range<String.Index>, tag: NLTag?) -> Bool {
let punctuationCharacters = CharacterSet.punctuationCharacters
let token = String(text[tokenRange])
@@ -965,25 +965,27 @@ struct WhisperTokenizerWrapper: WhisperTokenizer {
}
return false
}

/// Decodes token ids into individual words and per-word subtokens
/// - Parameter tokenIds: Array of tokens to decode and then split
/// - Parameter tokenIds: Array of tokens to decode and then split
/// - Returns: Tuple containing and array of the split words and all tokens for each word
func splitToWordTokens(tokenIds: [Int]) -> (words: [String], wordTokens: [[Int]]) {
let decodedWords = tokenizer.decode(tokens: tokenIds.filter { $0 < specialTokens.specialTokenBegin })

// Detect language of input text
let recognizer = NLLanguageRecognizer()
recognizer.processString(decodedWords)
let languageCode = recognizer.dominantLanguage?.rawValue

if ["zh", "ja", "th", "lo", "my", "yue"].contains(languageCode) {
return splitTokensOnUnicode(tokens: tokenIds)
} else {
return splitTokensOnSpaces(tokens: tokenIds)
}
}
}

public extension WhisperTokenizer {
var languages: [String: String] {
[
"english": "en",
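Elsewhere in this file, `splitToWordTokens` picks its splitting strategy from the detected language of the decoded text. A minimal sketch of just that dispatch, working on a plain string rather than token ids (the helper name is made up for illustration):

import NaturalLanguage

// Illustrative helper: true when word boundaries must come from unicode
// segmentation rather than spaces (languages written without inter-word spaces).
func usesUnicodeWordSplitting(for text: String) -> Bool {
    let recognizer = NLLanguageRecognizer()
    recognizer.processString(text)
    let languageCode = recognizer.dominantLanguage?.rawValue
    return ["zh", "ja", "th", "lo", "my", "yue"].contains(languageCode)
}

// Example: "こんにちは世界" is typically detected as "ja" and takes the
// unicode-based split path; "hello world" takes the space-based path.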
6 changes: 3 additions & 3 deletions Sources/WhisperKit/Core/TextDecoder.swift
@@ -424,7 +424,7 @@ open class TextDecoder: TextDecoding, WhisperMLModel {
// Single loop variables
var timings = TranscriptionTimings()
let prefilledIndex = decoderInputs.cacheLength[0].intValue
- let intialPromptIndex = decoderInputs.initialPrompt.count - 1
+ let intialPromptIndex = decoderInputs.initialPrompt.count
var currentTokens: [Int] = decoderInputs.initialPrompt
var nextToken: Int = decoderInputs.initialPrompt.last!
var logProbs: [Float] = Array(repeating: 0, count: prefilledIndex + 1)
@@ -477,8 +477,8 @@ open class TextDecoder: TextDecoding, WhisperMLModel {

// Check if current index is part of the initial prompt
isPrefill = false
- if tokenIndex <= intialPromptIndex {
- isPrefill = tokenIndex < intialPromptIndex // Prefill stops at the last token of the initial prompt
+ if tokenIndex < intialPromptIndex {
+ isPrefill = tokenIndex < intialPromptIndex - 1 // Prefill stops at the last token of the initial prompt
let prefillToken = currentTokens[tokenIndex]
nextToken = prefillToken
Logging.debug("Forcing token \(nextToken) at index \(tokenIndex) from initial prompt")
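To see what the index change above does, a small self-contained sketch of the prefill bookkeeping after this commit, assuming a three-token initial prompt (the token ids are placeholders):

let initialPrompt = [50258, 50259, 50359] // placeholder special-token ids
let intialPromptIndex = initialPrompt.count // after this commit: count, not count - 1

for tokenIndex in 0..<5 {
    var isPrefill = false
    if tokenIndex < intialPromptIndex {
        // Prefill stops at the last token of the initial prompt
        isPrefill = tokenIndex < intialPromptIndex - 1
        let forcedToken = initialPrompt[tokenIndex]
        print("index \(tokenIndex): force token \(forcedToken), isPrefill = \(isPrefill)")
    } else {
        print("index \(tokenIndex): sample from the model, isPrefill = \(isPrefill)")
    }
}
// Indices 0 and 1 force prompt tokens with isPrefill == true,
// index 2 forces the final prompt token with isPrefill == false,
// and indices 3 and up are sampled normally.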
