Skip to content

Commit 7c0436c

Browse files
authored
Add sorting and filtering of TTS voices (#663)
1 parent 1e6e895 commit 7c0436c

File tree

8 files changed

+614
-38
lines changed

8 files changed

+614
-38
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ All notable changes to this project will be documented in this file. Take a look
1010

1111
* Added `VisualNavigatorDelegate.navigatorContentInset(_:)` to customize the content and safe-area insets used by the navigator.
1212
* By default, the navigator uses the window's `safeAreaInsets`, which can cause content to shift when the status bar is shown or hidden (since those insets change). To avoid this, implement `navigatorContentInset(_:)` and return insets that remain stable across status bar visibility changes — for example, a top inset large enough to accommodate the maximum expected status bar height.
13+
* Added `[TTSVoice].filterByLanguage(_:)` to filter TTS voices by language and region.
14+
* Added `[TTSVoice].sorted()` to sort TTS voices by language, region, quality, gender, and name.
1315

1416
#### LCP
1517

@@ -24,6 +26,7 @@ All notable changes to this project will be documented in this file. Take a look
2426
* `EPUBNavigatorViewController.Configuration.contentInset` now expects values that already include the safe area insets.
2527
* If you previously supplied content-only margins, update them to add the safe-area values to preserve the same visible layout.
2628
* Alternatively, implement `VisualNavigatorDelegate.navigatorContentInset(_:)` to compute and return the full insets (content + safe area), helping avoid layout shifts when system UI (e.g., the status bar) appears or disappears.
29+
* Eloquence, novelty, personal, and legacy ("classic") TTS voices are no longer exposed by the `PublicationSpeechSynthesizer` API, as they are not a good fit for reading publications.
2730

2831
#### LCP
2932

Sources/Navigator/TTS/AVTTSEngine.swift

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,20 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
4949

5050
/// Voices of the system speech synthesizer which are suitable to read
/// publications, converted to `TTSVoice`.
///
/// The list is built once, on first access.
public lazy var availableVoices: [TTSVoice] =
    AVSpeechSynthesisVoice.speechVoices()
        .filter { candidate in
            // Voice traits only exist from iOS 17. When available, novelty
            // and personal voices are discarded.
            if #available(iOS 17.0, *) {
                let traits = candidate.voiceTraits
                if traits.contains(.isNoveltyVoice) || traits.contains(.isPersonalVoice) {
                    return false
                }
            }

            // Eloquence voices and the "classic" voices (legacy
            // `com.apple.speech.synthesis.voice.` prefix) are recognized
            // from their identifier, as they are not a good modern fit to
            // read publications.
            let identifier = candidate.identifier
            let isEloquence = identifier.contains(".eloquence.")
            let isClassic = identifier.starts(with: "com.apple.speech.synthesis.voice.")
            return !(isEloquence || isClassic)
        }
        .map { TTSVoice(voice: $0) }
5367

5468
public func voiceWithIdentifier(_ id: String) -> TTSVoice? {
@@ -360,12 +374,18 @@ private extension TTSVoice.Quality {
360374
init?(voice: AVSpeechSynthesisVoice) {
361375
switch voice.quality {
362376
case .default:
363-
self = .medium
377+
if voice.identifier.contains(".compact.") {
378+
self = .low
379+
} else if voice.identifier.contains(".super-compact.") {
380+
self = .lower
381+
} else {
382+
self = .medium
383+
}
364384
case .enhanced:
365385
self = .high
366386
#if swift(>=5.7)
367387
case .premium:
368-
self = .high
388+
self = .higher
369389
#endif
370390
@unknown default:
371391
return nil

Sources/Navigator/TTS/TTSEngine.swift

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -62,38 +62,3 @@ public struct TTSUtterance {
6262
}
6363
}
6464
}
65-
66-
/// Represents a voice provided by the TTS engine which can speak an utterance.
67-
public struct TTSVoice: Hashable {
68-
public enum Gender: Hashable {
69-
case female, male, unspecified
70-
}
71-
72-
public enum Quality: Hashable {
73-
case low, medium, high
74-
}
75-
76-
/// Unique and stable identifier for this voice. Can be used to store and retrieve the voice from the user
77-
/// preferences.
78-
public let identifier: String
79-
80-
/// Human-friendly name for this voice, when available.
81-
public let name: String?
82-
83-
/// Language (and region) this voice belongs to.
84-
public let language: Language
85-
86-
/// Voice gender.
87-
public let gender: Gender
88-
89-
/// Voice quality.
90-
public let quality: Quality?
91-
92-
public init(identifier: String, language: Language, name: String, gender: Gender, quality: Quality?) {
93-
self.identifier = identifier
94-
self.language = language
95-
self.name = name
96-
self.gender = gender
97-
self.quality = quality
98-
}
99-
}
Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
//
2+
// Copyright 2025 Readium Foundation. All rights reserved.
3+
// Use of this source code is governed by the BSD-style license
4+
// available in the top-level LICENSE file of the project.
5+
//
6+
7+
import AVFoundation
8+
import Foundation
9+
import ReadiumShared
10+
11+
/// A voice offered by a TTS engine to speak utterances.
public struct TTSVoice: Hashable {
    /// Gender of a voice.
    public enum Gender: Hashable {
        case female
        case male
        case unspecified
    }

    /// Quality tier of a voice, declared here from the lowest to the highest.
    public enum Quality: Hashable {
        case lower
        case low
        case medium
        case high
        case higher
    }

    /// Unique and stable identifier of the voice. It can be persisted in the
    /// user preferences to retrieve the voice later.
    public let identifier: String

    /// Human-friendly name of the voice.
    public let name: String

    /// Language of the voice, possibly including a region.
    public let language: Language

    /// Gender of the voice.
    public let gender: Gender

    /// Quality of the voice, if known.
    public let quality: Quality?

    public init(identifier: String, language: Language, name: String, gender: Gender, quality: Quality?) {
        self.identifier = identifier
        self.language = language
        self.name = name
        self.gender = gender
        self.quality = quality
    }
}
51+
52+
public extension [TTSVoice] {
    /// Returns the voices matching the given `language`.
    ///
    /// - If `language` includes a region (e.g. "en-US"), only the voices with
    ///   that exact language and region combination are returned.
    /// - If `language` has no region (e.g. "en"), all the voices sharing this
    ///   base language are returned, regardless of their region.
    ///
    /// - Parameter language: Language to look for.
    /// - Returns: The matching voices, in their original order.
    func filterByLanguage(_ language: Language) -> [TTSVoice] {
        guard language.region != nil else {
            // Base language match: the region of each voice is ignored.
            return filter { $0.language.removingRegion() == language }
        }
        // Exact match: the language code and the region must both match.
        return filter { $0.language == language }
    }

    /// Sorts the voices for display, using the following criteria in order:
    ///
    /// 1. Language, ordered by its name localized in `displayLocale`.
    /// 2. Region, within a given language:
    ///    - the `preferredRegions` come first, in the given order;
    ///    - then the default region of the language, if any;
    ///    - then the remaining regions, ordered by localized name;
    ///    - voices without a region come last.
    /// 3. Quality, from the highest to the lowest. Voices without a quality
    ///    come last.
    /// 4. Gender: female, then male, then unspecified.
    /// 5. Name, alphabetically.
    ///
    /// All the display strings (language, region and voice names) are
    /// compared case-insensitively with the collation rules of
    /// `displayLocale`, so that accented names are not pushed after "Z".
    ///
    /// - Parameters:
    ///   - preferredRegions: Regions to list first, by decreasing priority.
    ///     When `nil`, `Locale.preferredRegions` is used.
    ///   - displayLocale: Locale used to localize and to compare the names.
    /// - Returns: A sorted copy of the voices.
    func sorted(
        preferredRegions: [Language.Region]? = nil,
        displayLocale: Locale = .current
    ) -> [TTSVoice] {
        // Locale-aware comparison of two user-facing strings.
        func collate(_ lhs: String, _ rhs: String) -> ComparisonResult {
            lhs.compare(rhs, options: .caseInsensitive, range: nil, locale: displayLocale)
        }

        let preferredRegions = preferredRegions ?? Locale.preferredRegions

        // Regions found among the voices, for each base language. A language
        // whose voices have no region is mapped to an empty set.
        let languagesAndRegions: [(language: Language, region: Language.Region?)] =
            map { ($0.language.removingRegion(), $0.language.region) }
        let regionsByLanguage: [Language: Set<Language.Region>] =
            Dictionary(grouping: languagesAndRegions, by: \.language)
                .mapValues { Set($0.compactMap(\.region)) }

        // Priority of each region for a given base language: the lower the
        // value, the earlier the region is listed.
        let regionPrioritiesByLanguage: [Language: [Language.Region: Int]] =
            regionsByLanguage.reduce(into: [:]) { result, entry in
                let (language, regions) = (entry.key, entry.value)

                // 1. Preferred regions which are actually available.
                var ordered = preferredRegions.filter { regions.contains($0) }

                // 2. Default region for the language, if any.
                if let defaultRegion = defaultRegionByLanguage[language.code] {
                    ordered.append(defaultRegion)
                }

                // 3. Every available region, ordered by localized name. The
                //    names are resolved once, outside of the comparator.
                let byName = regions
                    .map { (region: $0, name: $0.localizedName(in: displayLocale) ?? $0.code) }
                    .sorted { collate($0.name, $1.name) == .orderedAscending }
                    .map(\.region)
                ordered.append(contentsOf: byName)

                // The same region may have been added by several steps.
                ordered = ordered.removingDuplicates()

                result[language] = Dictionary(
                    uniqueKeysWithValues: ordered.enumerated().map { index, region in (region, index) }
                )
            }

        // Precomputed values compared for each voice. `.max` pushes a voice
        // to the end of its group when a criterion is unknown.
        func sortKey(for voice: TTSVoice) -> (
            language: String,
            region: Int,
            quality: Int,
            gender: Int,
            name: String
        ) {
            let language = voice.language.removingRegion()
            let regionPriority = voice.language.region
                .flatMap { regionPrioritiesByLanguage[language]?[$0] }

            return (
                language: language.localizedLanguage(in: displayLocale) ?? voice.language.code.bcp47,
                region: regionPriority ?? .max,
                quality: voice.quality.flatMap { qualityPriorities[$0] } ?? .max,
                gender: genderPriorities[voice.gender] ?? .max,
                name: voice.name
            )
        }

        return map { (voice: $0, key: sortKey(for: $0)) }
            .sorted { lhs, rhs in
                let (a, b) = (lhs.key, rhs.key)

                let languageOrder = collate(a.language, b.language)
                if languageOrder != .orderedSame {
                    return languageOrder == .orderedAscending
                }
                if a.region != b.region {
                    return a.region < b.region
                }
                if a.quality != b.quality {
                    return a.quality < b.quality
                }
                if a.gender != b.gender {
                    return a.gender < b.gender
                }

                return collate(a.name, b.name) == .orderedAscending
            }
            .map(\.voice)
    }
}
175+
176+
private extension Locale {
    /// Regions of the user's preferred languages, in the same order as
    /// `preferredLanguages`. Languages without a region are skipped.
    static var preferredRegions: [ReadiumShared.Language.Region] {
        preferredLanguages.compactMap { identifier in
            let language = ReadiumShared.Language(code: .bcp47(identifier))
            return language.region
        }
    }
}
182+
183+
// Default region per base language.
//
// Keys are language codes without region. When sorting voices, the default
// region of a language is listed right after the preferred regions and before
// the remaining regions.
184+
// Source: https://github.com/HadrienGardeur/web-speech-recommended-voices
185+
private let defaultRegionByLanguage: [Language.Code: Language.Region] = [
186+
.bcp47("ar"): "SA",
187+
.bcp47("bg"): "BG",
188+
.bcp47("bho"): "IN",
189+
.bcp47("bn"): "IN",
190+
.bcp47("ca"): "ES",
191+
.bcp47("cmn"): "CN",
192+
.bcp47("cs"): "CZ",
193+
.bcp47("da"): "DK",
194+
.bcp47("de"): "DE",
195+
.bcp47("el"): "GR",
196+
.bcp47("en"): "US",
197+
.bcp47("es"): "ES",
198+
.bcp47("eu"): "ES",
199+
.bcp47("fa"): "IR",
200+
.bcp47("fi"): "FI",
201+
.bcp47("fr"): "FR",
202+
.bcp47("gl"): "ES",
203+
.bcp47("he"): "IL",
204+
.bcp47("hi"): "IN",
205+
.bcp47("hr"): "HR",
206+
.bcp47("hu"): "HU",
207+
.bcp47("id"): "ID",
208+
.bcp47("it"): "IT",
209+
.bcp47("ja"): "JP",
210+
.bcp47("kn"): "IN",
211+
.bcp47("ko"): "KR",
212+
.bcp47("mr"): "IN",
213+
.bcp47("ms"): "MY",
214+
.bcp47("nb"): "NO",
215+
.bcp47("nl"): "NL",
216+
.bcp47("pl"): "PL",
217+
.bcp47("pt"): "BR",
218+
.bcp47("ro"): "RO",
219+
.bcp47("ru"): "RU",
220+
.bcp47("sk"): "SK",
221+
.bcp47("sl"): "SI",
222+
.bcp47("sv"): "SE",
223+
.bcp47("ta"): "IN",
224+
.bcp47("te"): "IN",
225+
.bcp47("th"): "TH",
226+
.bcp47("tr"): "TR",
227+
.bcp47("uk"): "UA",
228+
.bcp47("vi"): "VN",
229+
.bcp47("wuu"): "CN",
230+
.bcp47("yue"): "HK",
231+
]
232+
233+
// Sort priority of each voice quality. The qualities are listed from the
// highest to the lowest, and the position in the list is the priority: the
// lower the value, the earlier the voice is sorted.
private let qualityPriorities: [TTSVoice.Quality: Int] = {
    let highestToLowest: [TTSVoice.Quality] = [.higher, .high, .medium, .low, .lower]
    return Dictionary(uniqueKeysWithValues: zip(highestToLowest, highestToLowest.indices))
}()
241+
242+
// Sort priority of each voice gender: female > male > unspecified. The
// position in the list is the priority: the lower the value, the earlier the
// voice is sorted.
private let genderPriorities: [TTSVoice.Gender: Int] = {
    let firstToLast: [TTSVoice.Gender] = [.female, .male, .unspecified]
    return Dictionary(uniqueKeysWithValues: zip(firstToLast, firstToLast.indices))
}()

Sources/Shared/Toolkit/Language.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,26 @@ public struct Language: Hashable, Sendable {
3333

3434
public let code: Code
3535

36+
/// Region of a language, identified by its region code (e.g. `US` for the
/// language `en-US`).
public struct Region: Hashable, Sendable, ExpressibleByStringLiteral {
    /// Region code. Two regions are equal when their codes are identical,
    /// including the letter case.
    public let code: String

    /// Creates a region from its `code`.
    public init(code: String) {
        self.code = code
    }

    /// Creates a region from a string literal holding its code.
    public init(stringLiteral value: StringLiteralType) {
        code = value
    }

    /// Returns the name of this region localized in `targetLocale`, as
    /// provided by `Locale.localizedString(forRegionCode:)`.
    public func localizedName(in targetLocale: Locale = Locale.current) -> String? {
        let localized = targetLocale.localizedString(forRegionCode: code)
        return localized
    }
}
51+
52+
/// Region of this language, read from the region code of `locale`, or
/// `nil` when the locale has none.
public var region: Region? {
    guard let regionCode = locale.regionCode else {
        return nil
    }
    return Region(code: regionCode)
}
55+
3656
public var locale: Locale { Locale(identifier: code.bcp47) }
3757

3858
public func localizedDescription(in locale: Locale = Locale.current) -> String {

Support/Carthage/.xcodegen

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,6 @@
468468
../../Sources/Navigator/EPUB/Preferences/EPUBPreferencesEditor.swift
469469
../../Sources/Navigator/EPUB/Preferences/EPUBSettings.swift
470470
../../Sources/Navigator/EPUB/Scripts
471-
../../Sources/Navigator/EPUB/Scripts/.DS_Store
472471
../../Sources/Navigator/EPUB/Scripts/.eslintrc.json
473472
../../Sources/Navigator/EPUB/Scripts/.gitignore
474473
../../Sources/Navigator/EPUB/Scripts/.prettierignore
@@ -577,6 +576,7 @@
577576
../../Sources/Navigator/TTS/AVTTSEngine.swift
578577
../../Sources/Navigator/TTS/PublicationSpeechSynthesizer.swift
579578
../../Sources/Navigator/TTS/TTSEngine.swift
579+
../../Sources/Navigator/TTS/TTSVoice.swift
580580
../../Sources/Navigator/VisualNavigator.swift
581581
../../Sources/OPDS
582582
../../Sources/OPDS/OPDS1Parser.swift

0 commit comments

Comments
 (0)