Skip to content

Commit eb3310f

Browse files
committed
Merge pull request GoogleCloudPlatform#249 from GoogleCloudPlatform/gcs-audio-input
added gcs read for audio file
2 parents e2a9208 + babbf11 commit eb3310f

File tree

5 files changed

+190
-30
lines changed

5 files changed

+190
-30
lines changed

speech/grpc/pom.xml

+6
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,12 @@ limitations under the License.
111111

112112
<!-- // [START dependency] -->
113113
<dependencies>
114+
<dependency>
115+
<groupId>junit</groupId>
116+
<artifactId>junit</artifactId>
117+
<version>4.12</version>
118+
<scope>test</scope>
119+
</dependency>
114120
<dependency>
115121
<groupId>commons-cli</groupId>
116122
<artifactId>commons-cli</artifactId>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* Copyright 2016 Google Inc. All Rights Reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
18+
package com.google.cloud.speech.grpc.demos;
19+
20+
import com.google.cloud.speech.v1.AudioRequest;
21+
import com.google.protobuf.ByteString;
22+
23+
import java.io.IOException;
24+
import java.net.URI;
25+
import java.nio.file.Files;
26+
import java.nio.file.Path;
27+
import java.nio.file.Paths;
28+
29+
/*
30+
* AudioRequestFactory takes a URI as an input and creates an AudioRequest. The URI can point to a
31+
* local file or a file on Google Cloud Storage.
32+
*/
33+
public class AudioRequestFactory {
34+
35+
private static final String FILE_SCHEME = "file";
36+
private static final String GS_SCHEME = "gs";
37+
38+
/**
39+
* Takes an input URI of form $scheme:// and converts to audio request.
40+
*
41+
* @param uri input uri
42+
* @return AudioRequest audio request
43+
*/
44+
public static AudioRequest createRequest(URI uri)
45+
throws IOException {
46+
if (uri.getScheme() == null || uri.getScheme().equals(FILE_SCHEME)) {
47+
Path path = Paths.get(uri);
48+
return audioFromBytes(Files.readAllBytes(path));
49+
} else if (uri.getScheme().equals(GS_SCHEME)) {
50+
return AudioRequest.newBuilder().setUri(uri.toString()).build();
51+
}
52+
throw new RuntimeException("scheme not supported " + uri.getScheme());
53+
}
54+
55+
/**
56+
* Convert bytes to AudioRequest.
57+
*
58+
* @param bytes input bytes
59+
* @return AudioRequest audio request
60+
*/
61+
private static AudioRequest audioFromBytes(byte[] bytes) {
62+
return AudioRequest.newBuilder()
63+
.setContent(ByteString.copyFrom(bytes))
64+
.build();
65+
}
66+
}

speech/grpc/src/main/java/com/google/cloud/speech/grpc/demos/NonStreamingRecognizeClient.java

+13-19
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
import com.google.cloud.speech.v1.NonStreamingRecognizeResponse;
3333
import com.google.cloud.speech.v1.RecognizeRequest;
3434
import com.google.cloud.speech.v1.SpeechGrpc;
35-
import com.google.protobuf.ByteString;
3635
import com.google.protobuf.TextFormat;
3736

3837
import io.grpc.ManagedChannel;
@@ -49,9 +48,7 @@
4948
import org.apache.commons.cli.ParseException;
5049

5150
import java.io.IOException;
52-
import java.nio.file.Files;
53-
import java.nio.file.Path;
54-
import java.nio.file.Paths;
51+
import java.net.URI;
5552
import java.util.Arrays;
5653
import java.util.List;
5754
import java.util.concurrent.Executors;
@@ -72,7 +69,7 @@ public class NonStreamingRecognizeClient {
7269

7370
private final String host;
7471
private final int port;
75-
private final String file;
72+
private final URI input;
7673
private final int samplingRate;
7774

7875
private final ManagedChannel channel;
@@ -81,11 +78,11 @@ public class NonStreamingRecognizeClient {
8178
/**
8279
* Construct client connecting to Cloud Speech server at {@code host:port}.
8380
*/
84-
public NonStreamingRecognizeClient(String host, int port, String file, int samplingRate)
81+
public NonStreamingRecognizeClient(String host, int port, URI input, int samplingRate)
8582
throws IOException {
8683
this.host = host;
8784
this.port = port;
88-
this.file = file;
85+
this.input = input;
8986
this.samplingRate = samplingRate;
9087

9188
GoogleCredentials creds = GoogleCredentials.getApplicationDefault();
@@ -99,10 +96,7 @@ public NonStreamingRecognizeClient(String host, int port, String file, int sampl
9996
}
10097

10198
private AudioRequest createAudioRequest() throws IOException {
102-
Path path = Paths.get(file);
103-
return AudioRequest.newBuilder()
104-
.setContent(ByteString.copyFrom(Files.readAllBytes(path)))
105-
.build();
99+
return AudioRequestFactory.createRequest(this.input);
106100
}
107101

108102
public void shutdown() throws InterruptedException {
@@ -115,10 +109,10 @@ public void recognize() {
115109
try {
116110
audio = createAudioRequest();
117111
} catch (IOException e) {
118-
logger.log(Level.WARNING, "Failed to read audio file: " + file);
112+
logger.log(Level.WARNING, "Failed to read audio uri input: " + input);
119113
return;
120114
}
121-
logger.info("Sending " + audio.getContent().size() + " bytes from audio file: " + file);
115+
logger.info("Sending " + audio.getContent().size() + " bytes from audio uri input: " + input);
122116
InitialRecognizeRequest initial = InitialRecognizeRequest.newBuilder()
123117
.setEncoding(AudioEncoding.LINEAR16)
124118
.setSampleRate(samplingRate)
@@ -147,8 +141,8 @@ public static void main(String[] args) throws Exception {
147141
CommandLineParser parser = new DefaultParser();
148142

149143
Options options = new Options();
150-
options.addOption(OptionBuilder.withLongOpt("file")
151-
.withDescription("path to audio file")
144+
options.addOption(OptionBuilder.withLongOpt("uri")
145+
.withDescription("path to audio uri")
152146
.hasArg()
153147
.withArgName("FILE_PATH")
154148
.create());
@@ -170,10 +164,10 @@ public static void main(String[] args) throws Exception {
170164

171165
try {
172166
CommandLine line = parser.parse(options, args);
173-
if (line.hasOption("file")) {
174-
audioFile = line.getOptionValue("file");
167+
if (line.hasOption("uri")) {
168+
audioFile = line.getOptionValue("uri");
175169
} else {
176-
System.err.println("An Audio file path must be specified (e.g. /foo/baz.raw).");
170+
System.err.println("An Audio uri must be specified (e.g. file:///foo/baz.raw).");
177171
System.exit(1);
178172
}
179173

@@ -203,7 +197,7 @@ public static void main(String[] args) throws Exception {
203197
}
204198

205199
NonStreamingRecognizeClient client =
206-
new NonStreamingRecognizeClient(host, port, audioFile, sampling);
200+
new NonStreamingRecognizeClient(host, port, URI.create(audioFile), sampling);
207201
try {
208202
client.recognize();
209203
} finally {

speech/grpc/src/main/proto/google/speech/v1/cloud-speech.proto

+42-11
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,15 @@ option java_multiple_files = true;
2323
option java_outer_classname = "SpeechProto";
2424
option java_package = "com.google.cloud.speech.v1";
2525

26+
2627
// Service that implements Google Cloud Speech API.
2728
service Speech {
2829
// Perform bidirectional streaming speech recognition on audio using gRPC.
2930
rpc Recognize(stream RecognizeRequest) returns (stream RecognizeResponse);
3031

3132
// Perform non-streaming speech recognition on audio using HTTPS.
3233
rpc NonStreamingRecognize(RecognizeRequest) returns (NonStreamingRecognizeResponse) {
33-
option (.google.api.http) = { post: "/v1/speech:recognize" body: "*" };
34+
option (google.api.http) = { post: "/v1/speech:recognize" body: "*" };
3435
}
3536
}
3637

@@ -54,7 +55,7 @@ message RecognizeRequest {
5455

5556
// The audio data to be recognized. For `NonStreamingRecognize`, all the
5657
// audio data must be contained in the first (and only) `RecognizeRequest`
57-
// message. For streaming `Recognize`, sequential chunks of audio data are
58+
// message. For streaming `Recognize`, sequential chunks of audio data are
5859
// sent in sequential `RecognizeRequest` messages.
5960
AudioRequest audio_request = 2;
6061
}
@@ -64,7 +65,7 @@ message RecognizeRequest {
6465
message InitialRecognizeRequest {
6566
// Audio encoding of the data sent in the audio message.
6667
enum AudioEncoding {
67-
// Not specified. Will return result `INVALID_ARGUMENT`.
68+
// Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
6869
ENCODING_UNSPECIFIED = 0;
6970

7071
// Uncompressed 16-bit signed little-endian samples.
@@ -118,8 +119,6 @@ message InitialRecognizeRequest {
118119
// profanities, replacing all but the initial character in each filtered word
119120
// with asterisks, e.g. "f***". If set to `false` or omitted, profanities
120121
// won't be filtered out.
121-
// Note that profanity filtering is not implemented for all languages.
122-
// If the language is not supported, this setting has no effect.
123122
bool profanity_filter = 5;
124123

125124
// [Optional] If `false` or omitted, the recognizer will detect a single
@@ -146,13 +145,38 @@ message InitialRecognizeRequest {
146145
// as they become available.
147146
// If `false` or omitted, no `EndpointerEvents` are returned.
148147
bool enable_endpointer_events = 8;
148+
149+
// [Optional] URI that points to a file where the recognition result should
150+
// be stored in JSON format. If omitted or empty string, the recognition
151+
// result is returned in the response. Should be specified only for
152+
// `NonStreamingRecognize`. If specified in a `Recognize` request,
153+
// `Recognize` returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
154+
// If specified in a `NonStreamingRecognize` request,
155+
// `NonStreamingRecognize` returns immediately, and the output file
156+
// is created asynchronously once the audio processing completes.
157+
// Currently, only Google Cloud Storage URIs are supported, which must be
158+
// specified in the following format: `gs://bucket_name/object_name`
159+
// (other URI formats return [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
160+
// more information, see [Request URIs](/storage/docs/reference-uris).
161+
string output_uri = 9;
149162
}
150163

151164
// Contains audio data in the format specified in the `InitialRecognizeRequest`.
165+
// Either `content` or `uri` must be supplied. Supplying both or neither
166+
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
152167
message AudioRequest {
153-
// [Required] The audio data bytes encoded as specified in
154-
// `InitialRecognizeRequest`.
168+
// The audio data bytes encoded as specified in
169+
// `InitialRecognizeRequest`. Note: as with all bytes fields, protocol buffers
170+
// use a pure binary representation, whereas JSON representations use base64.
155171
bytes content = 1;
172+
173+
// URI that points to a file that contains audio data bytes as specified in
174+
// `InitialRecognizeRequest`. Currently, only Google Cloud Storage URIs are
175+
// supported, which must be specified in the following format:
176+
// `gs://bucket_name/object_name` (other URI formats return
177+
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
178+
// [Request URIs](/storage/docs/reference-uris).
179+
string uri = 2;
156180
}
157181

158182
// `NonStreamingRecognizeResponse` is the only message returned to the client by
@@ -191,10 +215,14 @@ message RecognizeResponse {
191215

192216
// [Output-only] If set, returns a [google.rpc.Status][] message that
193217
// specifies the error for the operation.
194-
.google.rpc.Status error = 1;
195-
196-
// [Output-only] May contain zero or one `is_final=true` result (the newly
197-
// settled portion). May also contain zero or more `is_final=false` results.
218+
google.rpc.Status error = 1;
219+
220+
// [Output-only] For `continuous=false`, this repeated list contains zero or
221+
// one result that corresponds to all of the audio processed so far. For
222+
// `continuous=true`, this repeated list contains zero or more results that
223+
// correspond to consecutive portions of the audio being processed.
224+
// In both cases, contains zero or one `is_final=true` result (the newly
225+
// settled portion), followed by zero or more `is_final=false` results.
198226
repeated SpeechRecognitionResult results = 2;
199227

200228
// [Output-only] Indicates the lowest index in the `results` array that has
@@ -206,7 +234,10 @@ message RecognizeResponse {
206234
EndpointerEvent endpoint = 4;
207235
}
208236

237+
// A speech recognition result corresponding to a portion of the audio.
209238
message SpeechRecognitionResult {
239+
// [Output-only] May contain one or more recognition hypotheses (up to the
240+
// maximum specified in `max_alternatives`).
210241
repeated SpeechRecognitionAlternative alternatives = 1;
211242

212243
// [Output-only] Set `true` if this is the final time the speech service will
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Copyright 2016 Google Inc. All Rights Reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.cloud.speech.grpc.demos;
18+
19+
import static org.junit.Assert.assertEquals;
20+
21+
import com.google.cloud.speech.v1.AudioRequest;
22+
23+
import org.junit.Test;
24+
import org.junit.runner.RunWith;
25+
import org.junit.runners.JUnit4;
26+
27+
import java.io.File;
28+
import java.io.IOException;
29+
import java.net.URI;
30+
31+
/**
32+
* Unit tests for {@link AudioRequestFactory}.
33+
*/
34+
@RunWith(JUnit4.class)
35+
public class AudioRequestFactoryTest {
36+
37+
@Test
38+
public void verifyBytesInSizeFromLocalFile() throws IOException {
39+
URI uri = new File("resources/audio.raw").toURI();
40+
AudioRequest audio = AudioRequestFactory.createRequest(uri);
41+
42+
int numBytes = audio.getContent().toByteArray().length;
43+
44+
//assert the number of bytes in the audio as 57958
45+
assertEquals(57958, numBytes);
46+
}
47+
48+
@Test
49+
public void verifyBytesInSizeFromGoogleStorageFile() throws IOException {
50+
String audioUri = "gs://cloud-samples-tests/speech/audio.raw";
51+
52+
URI uri = URI.create(audioUri);
53+
AudioRequest audio = AudioRequestFactory.createRequest(uri);
54+
55+
int numBytes = audio.getContent().toByteArray().length;
56+
57+
//assert the number of bytes in the audio as 0
58+
assertEquals(0, numBytes);
59+
60+
//assert the uri
61+
assertEquals(audioUri, audio.getUri());
62+
}
63+
}

0 commit comments

Comments
 (0)