AuralKit is a simple, lightweight Swift wrapper for speech-to-text transcription using iOS 26's SpeechTranscriber and SpeechAnalyzer APIs, handling microphone capture, buffer conversion, model downloads, and cancellation on your behalf.
Public API: SpeechSession
- A clean, session-based interface for speech transcription.
This project would not have been possible without Apple's excellent sample code. The implementation is heavily inspired by "Bringing advanced speech-to-text capabilities to your app", which shows how to add live speech-to-text transcription with SpeechAnalyzer.
import AuralKit
// Create a speech session with your preferred locale
let session = SpeechSession(locale: .current)
let streamTask = Task {
do {
// Start the async stream
for try await result in session.startTranscribing() {
if result.isFinal {
print("Final: \(result.text)")
} else {
print("Partial: \(result.text)")
}
}
} catch {
print("Transcription failed: \(error)")
}
}
// Later, when you want to stop capturing audio
Task {
await session.stopTranscribing()
await streamTask.value
}
Add AuralKit to your project through Xcode:
- File → Add Package Dependencies
- Enter: https://github.com/rryam/AuralKit
- Click Add Package
Or add it to your Package.swift:
dependencies: [
.package(url: "https://github.com/rryam/AuralKit", from: "1.0.0")
]
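If your Package.swift also declares targets, list the product in the relevant target's dependencies as well. A minimal sketch, assuming the product is named AuralKit (matching the package) and using a placeholder target name:
.target(
    name: "MyApp", // your target name
    dependencies: [
        .product(name: "AuralKit", package: "AuralKit")
    ]
)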
import AuralKit
// Create with default locale
let session = SpeechSession()
// Or specify a locale
let session = SpeechSession(locale: Locale(identifier: "es-ES"))
let streamTask = Task {
do {
// Start transcribing
for try await result in session.startTranscribing() {
// Access the plain text
let plainText = String(result.text.characters)
print(plainText)
// Access timing metadata for each word/phrase
for run in result.text.runs {
if let timeRange = run.audioTimeRange {
print("Text: \(run.text), Start: \(timeRange.start.seconds)s")
}
}
}
} catch {
print("Transcription failed: \(error.localizedDescription)")
}
}
// Stop when needed
Task {
await session.stopTranscribing()
await streamTask.value
}
Check out the included Aural demo app to see AuralKit in action! The demo showcases:
- Live Transcription: Real-time speech-to-text with visual feedback
- Language Selection: Switch between multiple locales
- History Tracking: View past transcriptions
- Export & Share: Share transcriptions via standard iOS share sheet
- Open Aural.xcodeproj in the Aural directory
- Build and run on your iOS 26+ device or simulator
- Grant microphone and speech recognition permissions
- Start transcribing!
import SwiftUI
import AuralKit
struct ContentView: View {
@State private var session = SpeechSession()
@State private var transcript: AttributedString = ""
@State private var isTranscribing = false
var body: some View {
VStack(spacing: 20) {
Text(transcript)
.frame(minHeight: 100)
.padding()
Button(isTranscribing ? "Stop" : "Start") {
if isTranscribing {
Task {
await session.stopTranscribing()
isTranscribing = false
}
} else {
isTranscribing = true
Task {
for try await result in session.startTranscribing() {
if result.isFinal {
transcript += result.text
}
}
isTranscribing = false
}
}
}
}
.padding()
}
}
Add one more state variable to show real-time partial transcription:
struct ContentView: View {
@State private var session = SpeechSession()
@State private var finalText: AttributedString = ""
@State private var partialText: AttributedString = ""
@State private var isTranscribing = false
var body: some View {
VStack(spacing: 20) {
Text(finalText + partialText)
.frame(minHeight: 100)
.padding()
Button(isTranscribing ? "Stop" : "Start") {
if isTranscribing {
Task {
await session.stopTranscribing()
isTranscribing = false
}
} else {
isTranscribing = true
Task {
for try await result in session.startTranscribing() {
if result.isFinal {
finalText += result.text
partialText = ""
} else {
partialText = result.text
}
}
isTranscribing = false
}
}
}
}
.padding()
}
}
The TranscriptionManager in the demo app adds language selection, history tracking, and export.
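As a rough illustration of that pattern (a sketch, not the demo's actual TranscriptionManager; the type and method names here are hypothetical), a small recorder could wrap SpeechSession and keep finished transcripts:
import Foundation
import Observation
import AuralKit

@Observable
final class TranscriptionRecorder {
    private let session: SpeechSession
    private(set) var history: [AttributedString] = []

    init(locale: Locale = .current) {
        session = SpeechSession(locale: locale)
    }

    /// Runs one transcription pass and appends the finished transcript to history.
    func record() async throws {
        var transcript = AttributedString()
        for try await result in session.startTranscribing() {
            if result.isFinal {
                transcript += result.text
            }
        }
        history.append(transcript)
    }

    func stop() async {
        await session.stopTranscribing()
    }
}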
When a locale has not been installed yet, AuralKit automatically downloads the appropriate speech model. You can observe download progress through the modelDownloadProgress property:
let session = SpeechSession(locale: Locale(identifier: "ja-JP"))
if let progress = session.modelDownloadProgress {
print("Downloading model: \(progress.fractionCompleted * 100)%")
}
You can bind this progress to a ProgressView for visual feedback.
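A minimal SwiftUI sketch of that binding (the view name is hypothetical, and it assumes the body is re-evaluated while the download is in flight):
import SwiftUI
import AuralKit

struct ModelDownloadStatusView: View {
    let session: SpeechSession

    var body: some View {
        if let progress = session.modelDownloadProgress {
            // ProgressView(_:) tracks a Foundation Progress object directly
            ProgressView(progress)
        } else {
            Text("Model ready")
        }
    }
}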
AuralKit surfaces detailed SpeechSessionError values so you can present actionable messaging:
do {
for try await segment in session.startTranscribing() {
// Use the transcription
}
} catch let error as SpeechSessionError {
switch error {
case .modelDownloadNoInternet:
// Prompt the user to reconnect before retrying
break
case .modelDownloadFailed(let underlying):
// Inspect `underlying` for more detail
break
default:
break
}
} catch {
// Handle unexpected errors
}
public final class SpeechSession: @unchecked Sendable {
/// Initialize with a locale
public init(locale: Locale = .current)
/// Current speech model download progress, if any
public var modelDownloadProgress: Progress? { get }
/// Start transcribing - returns stream of SpeechTranscriber.Result
public func startTranscribing() -> AsyncThrowingStream<SpeechTranscriber.Result, Error>
/// Stop transcribing
public func stopTranscribing() async
}
AuralKit returns SpeechTranscriber.Result directly from the Speech framework, which provides:
public struct Result {
/// The most likely transcription with timing and confidence metadata
public var text: AttributedString
/// Alternative interpretations in descending order of likelihood
public let alternatives: [AttributedString]
/// Whether this result is final or volatile (partial)
public var isFinal: Bool
/// The audio time range this result applies to
public let range: CMTimeRange
/// Time up to which results are finalized
public let resultsFinalizationTime: CMTime
}
Access transcription text, timing, confidence scores, and alternatives:
for try await result in session.startTranscribing() {
// Get plain text
let plainText = String(result.text.characters)
// Access timing information
for run in result.text.runs {
if let audioRange = run.audioTimeRange {
let startTime = audioRange.start.seconds
let endTime = audioRange.end.seconds
print("\(run.text): \(startTime)s - \(endTime)s")
}
// Access confidence scores (0.0 to 1.0)
if let confidence = run.transcriptionConfidence {
print("Confidence: \(confidence)")
}
}
// Access alternative transcriptions
for (index, alternative) in result.alternatives.enumerated() {
print("Alternative \(index): \(String(alternative.characters))")
}
}
Add to your Info.plist:
<key>NSMicrophoneUsageDescription</key>
<string>This app needs microphone access to transcribe speech.</string>
<key>NSSpeechRecognitionUsageDescription</key>
<string>This app needs speech recognition to convert your speech to text.</string>
- iOS 26.0+ / macOS 26.0+
- Swift 6.2+
- Microphone and speech recognition permissions
Contributions are welcome! Please feel free to submit a Pull Request.
AuralKit is available under the MIT License. See the LICENSE file for more info.