
Commit 9121ecd

feat: text to speech (#95)
* wip
* feat: support latest beta
* feat: working example
* chore: update readme
* chore: temporary fix for the crash
1 parent 3a9984b commit 9121ecd


8 files changed: +410 -33 lines changed


apps/example-apple/package.json

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@
     "@ungap/structured-clone": "^1.3.0",
     "ai": "^5.0.0-beta.25",
     "expo": "^53.0.0",
+    "expo-clipboard": "~7.1.5",
     "expo-document-picker": "~13.1.6",
     "expo-status-bar": "2.2.3",
     "react": "19.0.0",

apps/example-apple/src/schema-demos.ts

Lines changed: 18 additions & 1 deletion
@@ -1,5 +1,12 @@
 import { createAppleProvider } from '@react-native-ai/apple'
-import { experimental_transcribe, generateObject, streamText, tool } from 'ai'
+import {
+  experimental_generateSpeech,
+  experimental_transcribe,
+  generateObject,
+  streamText,
+  tool,
+} from 'ai'
+import * as Clipboard from 'expo-clipboard'
 import { z } from 'zod'
 
 const getWeather = tool({
@@ -128,6 +135,15 @@ export async function basicTranscribeDemo() {
   return response.text
 }
 
+export async function basicSpeechDemo() {
+  const response = await experimental_generateSpeech({
+    model: apple.speechModel(),
+    text: 'What is the weather in Wroclaw?',
+  })
+  await Clipboard.setStringAsync(response.audio.base64)
+  return 'Speech copied to clipboard. Go to https://base64.guru/converter/decode/audio to play.'
+}
+
 export const schemaDemos = {
   basicString: { name: 'String', func: basicStringDemo },
   basicStringStreaming: {
@@ -140,6 +156,7 @@ export const schemaDemos = {
   basicObject: { name: 'Object', func: basicObjectDemo },
   basicArray: { name: 'Array', func: basicArrayDemo },
   basicTranscribe: { name: 'Transcribe', func: basicTranscribeDemo },
+  basicSpeech: { name: 'Speech', func: basicSpeechDemo },
 }
 
 export type DemoKey = keyof typeof schemaDemos
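
The demo copies the base64-encoded WAV to the clipboard so it can be decoded manually. A minimal sketch of playing the result in-app instead, assuming expo-av and expo-file-system were added as dependencies (neither is part of this commit); `apple` is the provider instance already defined in schema-demos.ts:

import { experimental_generateSpeech } from 'ai'
import { Audio } from 'expo-av'
import * as FileSystem from 'expo-file-system'

// Hypothetical demo: synthesize speech and play it instead of copying to the clipboard.
export async function playSpeechDemo() {
  const response = await experimental_generateSpeech({
    model: apple.speechModel(),
    text: 'What is the weather in Wroclaw?',
  })

  // Persist the WAV produced by the native module, then hand the file URI to expo-av.
  const uri = `${FileSystem.cacheDirectory}speech-demo.wav`
  await FileSystem.writeAsStringAsync(uri, response.audio.base64, {
    encoding: FileSystem.EncodingType.Base64,
  })

  const { sound } = await Audio.Sound.createAsync({ uri })
  await sound.playAsync()
  return 'Playing generated speech.'
}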

bun.lock

Lines changed: 4 additions & 1 deletion
@@ -68,6 +68,7 @@
       "@ungap/structured-clone": "^1.3.0",
       "ai": "^5.0.0-beta.25",
       "expo": "^53.0.0",
+      "expo-clipboard": "~7.1.5",
       "expo-document-picker": "~13.1.6",
       "expo-status-bar": "2.2.3",
       "react": "19.0.0",
@@ -84,7 +85,7 @@
     },
     "packages/apple-llm": {
       "name": "@react-native-ai/apple",
-      "version": "0.2.0",
+      "version": "0.4.0",
       "dependencies": {
         "@ai-sdk/provider": "^2.0.0-beta.1",
         "@ai-sdk/provider-utils": "^3.0.0-beta.5",
@@ -1554,6 +1555,8 @@
 
     "expo-asset": ["expo-asset@11.1.7", "", { "dependencies": { "@expo/image-utils": "^0.7.6", "expo-constants": "~17.1.7" }, "peerDependencies": { "expo": "*", "react": "*", "react-native": "*" } }, "sha512-b5P8GpjUh08fRCf6m5XPVAh7ra42cQrHBIMgH2UXP+xsj4Wufl6pLy6jRF5w6U7DranUMbsXm8TOyq4EHy7ADg=="],
 
+    "expo-clipboard": ["expo-clipboard@7.1.5", "", { "peerDependencies": { "expo": "*", "react": "*", "react-native": "*" } }, "sha512-TCANUGOxouoJXxKBW5ASJl2WlmQLGpuZGemDCL2fO5ZMl57DGTypUmagb0CVUFxDl0yAtFIcESd78UsF9o64aw=="],
+
     "expo-constants": ["expo-constants@17.1.7", "", { "dependencies": { "@expo/config": "~11.0.12", "@expo/env": "~1.0.7" }, "peerDependencies": { "expo": "*", "react-native": "*" } }, "sha512-byBjGsJ6T6FrLlhOBxw4EaiMXrZEn/MlUYIj/JAd+FS7ll5X/S4qVRbIimSJtdW47hXMq0zxPfJX6njtA56hHA=="],
 
     "expo-document-picker": ["expo-document-picker@13.1.6", "", { "peerDependencies": { "expo": "*" } }, "sha512-8FTQPDOkyCvFN/i4xyqzH7ELW4AsB6B3XBZQjn1FEdqpozo6rpNJRr7sWFU/93WrLgA9FJEKpKbyr6XxczK6BA=="],

packages/apple-llm/ios/speech/AppleSpeech.mm

Lines changed: 98 additions & 11 deletions
@@ -25,6 +25,7 @@ @interface AppleSpeech : NativeAppleSpeechSpecBase <NativeAppleSpeechSpec, RCTCa
 
 using namespace facebook;
 using namespace JS::NativeAppleLLM;
+using namespace react;
 
 @implementation AppleSpeech
 
@@ -42,21 +43,107 @@ + (NSString *)moduleName {
   return @"NativeAppleSpeech";
 }
 
-- (std::shared_ptr<react::TurboModule>)getTurboModule:(const react::ObjCTurboModule::InitParams &)params {
-  return std::make_shared<react::NativeAppleSpeechSpecJSI>(params);
+- (void)installGenerateFunc:(std::shared_ptr<facebook::react::CallInvoker>)jsInvoker {
+  AppleSpeechImpl *speechModule = _speech;
+
+  auto runOnJS = [jsInvoker](std::function<void()>&& f) {
+    jsInvoker->invokeAsync(std::move(f));
+  };
+
+  jsInvoker->invokeAsync([speechModule, runOnJS](jsi::Runtime& rt) {
+    @try {
+      auto global = rt.global();
+
+      auto generateAudioFunc = jsi::Function::createFromHostFunction(
+        rt,
+        jsi::PropNameID::forAscii(rt, "generateAudio"),
+        2,
+        [speechModule, runOnJS](jsi::Runtime& rt, const jsi::Value& thisVal, const jsi::Value* args, size_t count) -> jsi::Value {
+          if (count < 1 || !args[0].isString()) {
+            throw jsi::JSError(rt, "First argument must be a string (text)");
+          }
+
+          auto textStr = args[0].asString(rt).utf8(rt);
+          NSString *text = [NSString stringWithUTF8String:textStr.c_str()];
+
+          auto *options = [NSMutableDictionary new];
+          if (count > 1 && args[1].isObject()) {
+            auto opts = args[1].asObject(rt);
+
+            if (opts.hasProperty(rt, "language")) {
+              auto langProp = opts.getProperty(rt, "language");
+              if (langProp.isString()) {
+                auto langStr = langProp.asString(rt).utf8(rt);
+                options[@"language"] = [NSString stringWithUTF8String:langStr.c_str()];
+              }
+            }
+
+            if (opts.hasProperty(rt, "voice")) {
+              auto voiceProp = opts.getProperty(rt, "voice");
+              if (voiceProp.isString()) {
+                auto voiceStr = voiceProp.asString(rt).utf8(rt);
+                options[@"voice"] = [NSString stringWithUTF8String:voiceStr.c_str()];
+              }
+            }
+          }
+
+          auto Promise = rt.global().getPropertyAsFunction(rt, "Promise");
+
+          return Promise.callAsConstructor(rt, jsi::Function::createFromHostFunction(
+            rt,
+            jsi::PropNameID::forAscii(rt, "executor"),
+            2,
+            [speechModule, text, options, runOnJS](jsi::Runtime& rt, const jsi::Value& thisVal, const jsi::Value* args, size_t count) -> jsi::Value {
+              auto resolve = std::make_shared<jsi::Function>(args[0].asObject(rt).asFunction(rt));
+              auto reject = std::make_shared<jsi::Function>(args[1].asObject(rt).asFunction(rt));
+
+              [speechModule generateAudio:text options:options resolve:^(NSData *audioData) {
+                runOnJS([resolve, audioData, &rt]() {
+                  class NSDataMutableBuffer : public facebook::jsi::MutableBuffer {
+                  public:
+                    NSDataMutableBuffer(uint8_t* data, size_t size) : _data(data), _size(size) {}
+                    uint8_t* data() override { return _data; }
+                    size_t size() const override { return _size; }
+                  private:
+                    uint8_t* _data;
+                    size_t _size;
+                  };
+
+                  uint8_t* data = (uint8_t*)[audioData bytes];
+                  size_t size = [audioData length];
+
+                  auto mutableBuffer = std::make_shared<NSDataMutableBuffer>(data, size);
+                  auto arrayBuffer = jsi::ArrayBuffer(rt, mutableBuffer);
+
+                  resolve->call(rt, std::move(arrayBuffer));
+                });
+              } reject:^(NSString *code, NSString *message, NSError *error) {
+                runOnJS([reject, message, &rt]() {
+                  auto jsError = jsi::String::createFromUtf8(rt, [message UTF8String]);
+                  reject->call(rt, jsError);
+                });
+              }];
+
+              return jsi::Value::undefined();
+            }
+          ));
+        }
+      );
+
+      global.setProperty(rt, "__apple__llm__generate_audio__", generateAudioFunc);
+    } @catch (NSException *exception) {
+      throw jsi::JSError(rt, [[NSString stringWithFormat:@"Failed to install generateAudio handler: %@", exception.reason] UTF8String]);
+    }
+  });
 }
 
-- (nonnull NSNumber *)isAvailable {
-  return @([_speech isAvailable]);
+- (std::shared_ptr<react::TurboModule>)getTurboModule:(const react::ObjCTurboModule::InitParams &)params {
+  [self installGenerateFunc:params.jsInvoker];
+  return std::make_shared<react::NativeAppleSpeechSpecJSI>(params);
 }
 
-- (void)generate:(nonnull NSString *)text options:(JS::NativeAppleLLM::SpeechOptions &)options resolve:(nonnull RCTPromiseResolveBlock)resolve reject:(nonnull RCTPromiseRejectBlock)reject {
-  NSDictionary *opts = @{
-    @"language": options.language().has_value() ? @(options.language().value().c_str()) : [NSNull null],
-    @"voice": options.voice().has_value() ? @(options.voice().value().c_str()) : [NSNull null]
-  };
-
-  [_speech generate:text options:opts resolve:resolve reject:reject];
+- (void)getVoices:(nonnull RCTPromiseResolveBlock)resolve reject:(nonnull RCTPromiseRejectBlock)reject {
+  [_speech getVoices:resolve reject:reject];
 }
 
 @end
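
For reference, the host function installed above is reachable from JavaScript under the global name it registers. A rough TypeScript sketch of the contract it exposes; the typing and the `generateWavBytes` helper are illustrative only and not part of the package's public API, which presumably wraps this call inside `apple.speechModel()`:

// Shape of the JSI host function installed as "__apple__llm__generate_audio__".
// It resolves with a jsi::ArrayBuffer holding WAV bytes and rejects with a plain string message.
declare global {
  var __apple__llm__generate_audio__: (
    text: string,
    options?: { language?: string; voice?: string }
  ) => Promise<ArrayBuffer>
}

// Hypothetical helper showing how a caller could consume the raw bytes.
export async function generateWavBytes(text: string, voice?: string): Promise<Uint8Array> {
  const buffer = await global.__apple__llm__generate_audio__(text, voice ? { voice } : undefined)
  return new Uint8Array(buffer)
}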

packages/apple-llm/ios/speech/AppleSpeechImpl.swift

Lines changed: 145 additions & 5 deletions
@@ -10,14 +10,154 @@ import AVFoundation
 
 @objc
 public class AppleSpeechImpl: NSObject {
+  private let speechSynthesizer = AVSpeechSynthesizer()
+
   @objc
-  public func isAvailable() -> Bool {
-    return true // AVSpeechSynthesizer is available on all iOS versions we support
+  public func getVoices(_ resolve: @escaping ([Any]) -> Void, reject: @escaping (String, String, Error?) -> Void) {
+    Task {
+      if #available(iOS 17.0, *) {
+        await withCheckedContinuation { continuation in
+          AVSpeechSynthesizer.requestPersonalVoiceAuthorization { _ in
+            continuation.resume()
+          }
+        }
+      }
+
+      let allVoices = AVSpeechSynthesisVoice.speechVoices()
+      let voiceInfos = allVoices.map { $0.toDictionary() }
+      resolve(voiceInfos)
+    }
   }
 
   @objc
-  public func generate(_ text: String, options: [String: Any]?, resolve: @escaping (Any?) -> Void, reject: @escaping (String, String, Error?) -> Void) {
-    // TODO: Implement text-to-speech functionality
-    resolve(nil)
+  public func generateAudio(_ text: String, options: [String: Any], resolve: @escaping (Data) -> Void, reject: @escaping (String, String, Error?) -> Void) {
+    let utterance = AVSpeechUtterance(string: text)
+
+    utterance.voice = if let voiceId = options["voice"] as? String {
+      AVSpeechSynthesisVoice(identifier: voiceId)
+    } else if let language = options["language"] as? String {
+      AVSpeechSynthesisVoice(language: language)
+    } else {
+      nil
+    }
+
+    var collectedBuffers: [AVAudioPCMBuffer] = []
+
+    var resolveCallback: ((Data) -> Void)? = resolve
+    var rejectCallback: ((String, String, Error?) -> Void)? = reject
+
+    speechSynthesizer.write(utterance) { buffer in
+      guard let pcm = buffer as? AVAudioPCMBuffer else { return }
+
+      if pcm.frameLength == 0 {
+        guard let resolve = resolveCallback, let reject = rejectCallback else { return }
+
+        do {
+          let data = try AppleSpeechImpl.wavData(from: collectedBuffers)
+          resolve(data)
+        } catch {
+          reject("AppleSpeech", "Error generating WAV data", error)
+        }
+
+        resolveCallback = nil
+        rejectCallback = nil
+        return
+      }
+
+      collectedBuffers.append(pcm)
+    }
+  }
+}
+
+extension AppleSpeechImpl {
+  /// Build a single WAV file by generating the header using the first buffer's
+  /// format and then concatenating the raw PCM payloads of all subsequent buffers.
+  /// Assumes all buffers share the same format and are WAV-compatible.
+  static func wavData(from buffers: [AVAudioPCMBuffer]) throws -> Data {
+    guard let first = buffers.first else {
+      throw NSError(domain: "WAV", code: -2,
+                    userInfo: [NSLocalizedDescriptionKey: "No audio buffers collected"])
+    }
+
+    let channels = Int(first.format.channelCount)
+    let sampleRate = Int(first.format.sampleRate)
+    let isFloat32 = (first.format.commonFormat == .pcmFormatFloat32)
+    let bitsPerSample = isFloat32 ? 32 : 16
+    let byteRate = sampleRate * channels * bitsPerSample / 8
+    let blockAlign = channels * bitsPerSample / 8
+
+    // Helper: little-endian encoders
+    func le16(_ v: Int) -> [UInt8] { [UInt8(v & 0xff), UInt8((v >> 8) & 0xff)] }
+    func le32(_ v: Int) -> [UInt8] {
+      [UInt8(v & 0xff), UInt8((v >> 8) & 0xff),
+       UInt8((v >> 16) & 0xff), UInt8((v >> 24) & 0xff)]
+    }
+
+    // Estimate capacity from actual valid bytes in each buffer
+    let estimatedCapacity = buffers.reduce(0) { acc, buf in
+      let audioBuffer = buf.audioBufferList.pointee.mBuffers
+      return acc + Int(audioBuffer.mDataByteSize)
+    }
+
+    var payload = Data()
+    payload.reserveCapacity(estimatedCapacity)
+
+    // Concatenate payloads using mDataByteSize, which is kept in sync with frameLength
+    for buf in buffers {
+      let m = buf.audioBufferList.pointee.mBuffers
+      let byteCount = Int(m.mDataByteSize)
+      if let p = m.mData {
+        payload.append(contentsOf: UnsafeRawBufferPointer(start: p, count: byteCount))
+      }
+    }
+
+    let dataChunkSize = payload.count
+    let fmtChunkSize = 16
+    let riffChunkSize = 4 + (8 + fmtChunkSize) + (8 + dataChunkSize)
+
+    var header = Data()
+    header.append(contentsOf: Array("RIFF".utf8))
+    header.append(contentsOf: le32(riffChunkSize))
+    header.append(contentsOf: Array("WAVE".utf8))
+
+    // fmt chunk
+    header.append(contentsOf: Array("fmt ".utf8))
+    header.append(contentsOf: le32(fmtChunkSize))
+    header.append(contentsOf: le16(isFloat32 ? 3 : 1)) // 3 = IEEE float, 1 = PCM
+    header.append(contentsOf: le16(channels))
+    header.append(contentsOf: le32(sampleRate))
+    header.append(contentsOf: le32(byteRate))
+    header.append(contentsOf: le16(blockAlign))
+    header.append(contentsOf: le16(bitsPerSample))
+
+    // data chunk
+    header.append(contentsOf: Array("data".utf8))
+    header.append(contentsOf: le32(dataChunkSize))
+
+    var out = Data(capacity: header.count + payload.count)
+    out.append(header)
+    out.append(payload)
+
+    return out
+  }
+}
+
+extension AVSpeechSynthesisVoice {
+  func toDictionary() -> [String: Any] {
+    var data = [
+      "identifier": self.identifier,
+      "name": self.name,
+      "language": self.language,
+      "quality": quality,
+      "isPersonalVoice": false,
+      "isNoveltyVoice": false
+    ] as [String : Any]
+
+    if #available(iOS 17.0, *) {
+      data["isPersonalVoice"] = self.voiceTraits.contains(.isPersonalVoice)
+      data["isNoveltyVoice"] = self.voiceTraits.contains(.isNoveltyVoice)
+    }
+
+    return data
   }
 }
