@@ -38,6 +38,31 @@ open class WhisperKitConfig {
     public var logLevel: Logging.LogLevel
 
     /// Enable model prewarming
+    ///
+    /// What does "prewarm" mean and when should it be enabled?
+    ///
+    /// WhisperKit uses Apple Core ML models that are downloaded as device-agnostic
+    /// model files (*.mlmodelc). These models need to be "specialized" to a user's
+    /// device chip before they can be used. Core ML "specializes" a model automatically
+    /// the first time it is loaded. The resulting "specialized" model files are cached
+    /// on-disk by Core ML (not by Argmax) outside the app bundle. This cache is
+    /// maintained by Apple and is evicted after every OS update, as well as when the
+    /// models are not used for extended periods of time. Unfortunately, Apple does not
+    /// yet provide a third-party API to check whether the cache will be hit or has been
+    /// evicted. Hence, Argmax built a defensive "prewarm" option that loads each model
+    /// sequentially and unloads it immediately, triggering specialization if necessary.
+    ///
+    /// **Trade-offs**
+    /// - **Pro**: Peak memory usage during compilation is reduced because only one
+    ///   model is kept in memory at any given time. Otherwise, peak memory grows to
+    ///   all model weights combined plus the peak compilation memory (which is higher
+    ///   than the model weights alone).
+    /// - **Con**: Load time roughly doubles (usually adding <1s) when the specialized
+    ///   model cache is actually hit and prewarming does not need to trigger
+    ///   specialization, because of the load-unload-load pattern.
+    ///
+    /// Enable `prewarm` to minimize peak memory impact throughout your app's lifecycle.
+    /// Disable `prewarm` if you cannot accept a roughly 2x increase in load time.
     public var prewarm: Bool?
     /// Load models if available
     public var load: Bool?
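
For context, a minimal usage sketch of the `prewarm` flag is shown below. The exact `WhisperKitConfig` initializer parameters and the model identifier are assumptions for illustration, not taken from this diff.

```swift
import WhisperKit

// Minimal sketch (parameter names assumed): enable defensive prewarming so each
// Core ML model is loaded and unloaded once up front, triggering on-device
// specialization while keeping only one model in memory at a time.
let config = WhisperKitConfig(
    model: "base",   // hypothetical model identifier
    prewarm: true,   // trade ~2x load time for lower peak memory
    load: true       // load the (now specialized) models for immediate use
)
let whisperKit = try await WhisperKit(config)
```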