Skip to content

Commit 5d09621

Browse files
Speech v1 (#3266)
This updates our manual client library to the Speech v1 API. This entails several **backwards incompatible changes**:

* The `language_code` parameter is no longer optional anywhere. It must be explicitly specified, and does _not_ default to `'en-US'`.
* The `sync_recognize` method has been renamed to `recognize` on every class where it appears.
* The `async_recognize` method has been renamed to `long_running_recognize` on every class where it appears.
* The `sample_rate` parameter and property have been renamed to `sample_rate_hertz` everywhere they appear.

Additionally, the backend API contains a backwards incompatible change which does not require a code change in the client library, but will likely require one downstream: The `START_OF_SPEECH`, `END_OF_SPEECH`, and `END_OF_AUDIO` events have been removed.
1 parent de75dea commit 5d09621

File tree

17 files changed

+346
-297
lines changed

17 files changed

+346
-297
lines changed

google-cloud-speech/google/cloud/speech/_gax.py

Lines changed: 75 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,15 @@
1414

1515
"""GAX/GAPIC module for managing Speech API requests."""
1616

17-
18-
from google.cloud.gapic.speech.v1beta1.speech_client import SpeechClient
19-
from google.cloud.proto.speech.v1beta1.cloud_speech_pb2 import RecognitionAudio
20-
from google.cloud.proto.speech.v1beta1.cloud_speech_pb2 import (
17+
from google.cloud.gapic.speech.v1.speech_client import SpeechClient
18+
from google.cloud.proto.speech.v1.cloud_speech_pb2 import RecognitionAudio
19+
from google.cloud.proto.speech.v1.cloud_speech_pb2 import (
2120
RecognitionConfig)
22-
from google.cloud.proto.speech.v1beta1.cloud_speech_pb2 import (
21+
from google.cloud.proto.speech.v1.cloud_speech_pb2 import (
2322
SpeechContext)
24-
from google.cloud.proto.speech.v1beta1.cloud_speech_pb2 import (
23+
from google.cloud.proto.speech.v1.cloud_speech_pb2 import (
2524
StreamingRecognitionConfig)
26-
from google.cloud.proto.speech.v1beta1.cloud_speech_pb2 import (
25+
from google.cloud.proto.speech.v1.cloud_speech_pb2 import (
2726
StreamingRecognizeRequest)
2827
from google.longrunning import operations_grpc
2928

@@ -62,23 +61,22 @@ def __init__(self, client=None):
6261
OPERATIONS_API_HOST,
6362
)
6463

65-
def async_recognize(self, sample, language_code=None,
66-
max_alternatives=None, profanity_filter=None,
67-
speech_context=None):
68-
"""Asychronous Recognize request to Google Speech API.
64+
def long_running_recognize(self, sample, language_code,
65+
max_alternatives=None, profanity_filter=None,
66+
speech_contexts=()):
67+
"""Long-running Recognize request to Google Speech API.
6968
70-
.. _async_recognize: https://cloud.google.com/speech/reference/\
71-
rest/v1beta1/speech/asyncrecognize
69+
.. _long_running_recognize: https://cloud.google.com/speech/reference/\
70+
rest/v1/speech/longrunningrecognize
7271
73-
See `async_recognize`_.
72+
See `long_running_recognize`_.
7473
7574
:type sample: :class:`~google.cloud.speech.sample.Sample`
7675
:param sample: Instance of ``Sample`` containing audio information.
7776
7877
:type language_code: str
79-
:param language_code: (Optional) The language of the supplied audio as
80-
BCP-47 language tag. Example: ``'en-GB'``.
81-
If omitted, defaults to ``'en-US'``.
78+
:param language_code: The language of the supplied audio as
79+
BCP-47 language tag. Example: ``'en-US'``.
8280
8381
:type max_alternatives: int
8482
:param max_alternatives: (Optional) Maximum number of recognition
@@ -94,8 +92,8 @@ def async_recognize(self, sample, language_code=None,
9492
asterisks, e.g. ``'f***'``. If False or
9593
omitted, profanities won't be filtered out.
9694
97-
:type speech_context: list
98-
:param speech_context: A list of strings (max 50) containing words and
95+
:type speech_contexts: list
96+
:param speech_contexts: A list of strings (max 50) containing words and
9997
phrases "hints" so that the speech recognition
10098
is more likely to recognize them. This can be
10199
used to improve the accuracy for specific words
@@ -106,21 +104,27 @@ def async_recognize(self, sample, language_code=None,
106104
:returns: Instance of ``Operation`` to poll for results.
107105
"""
108106
config = RecognitionConfig(
109-
encoding=sample.encoding, sample_rate=sample.sample_rate,
110-
language_code=language_code, max_alternatives=max_alternatives,
107+
encoding=sample.encoding,
108+
language_code=language_code,
109+
max_alternatives=max_alternatives,
111110
profanity_filter=profanity_filter,
112-
speech_context=SpeechContext(phrases=speech_context))
111+
sample_rate_hertz=sample.sample_rate_hertz,
112+
speech_contexts=[SpeechContext(phrases=speech_contexts)],
113+
)
113114

114115
audio = RecognitionAudio(content=sample.content,
115116
uri=sample.source_uri)
116117
api = self._gapic_api
117-
operation_future = api.async_recognize(config=config, audio=audio)
118+
operation_future = api.long_running_recognize(
119+
audio=audio,
120+
config=config,
121+
)
118122

119123
return Operation.from_pb(operation_future.last_operation_data(), self)
120124

121-
def streaming_recognize(self, sample, language_code=None,
125+
def streaming_recognize(self, sample, language_code,
122126
max_alternatives=None, profanity_filter=None,
123-
speech_context=None, single_utterance=False,
127+
speech_contexts=(), single_utterance=False,
124128
interim_results=False):
125129
"""Streaming speech recognition.
126130
@@ -136,9 +140,8 @@ def streaming_recognize(self, sample, language_code=None,
136140
:param sample: Instance of ``Sample`` containing audio information.
137141
138142
:type language_code: str
139-
:param language_code: (Optional) The language of the supplied audio as
140-
BCP-47 language tag. Example: ``'en-GB'``.
141-
If omitted, defaults to ``'en-US'``.
143+
:param language_code: The language of the supplied audio as
144+
BCP-47 language tag. Example: ``'en-US'``.
142145
143146
:type max_alternatives: int
144147
:param max_alternatives: (Optional) Maximum number of recognition
@@ -154,8 +157,8 @@ def streaming_recognize(self, sample, language_code=None,
154157
asterisks, e.g. ``'f***'``. If False or
155158
omitted, profanities won't be filtered out.
156159
157-
:type speech_context: list
158-
:param speech_context: A list of strings (max 50) containing words and
160+
:type speech_contexts: list
161+
:param speech_contexts: A list of strings (max 50) containing words and
159162
phrases "hints" so that the speech recognition
160163
is more likely to recognize them. This can be
161164
used to improve the accuracy for specific words
@@ -190,7 +193,7 @@ def streaming_recognize(self, sample, language_code=None,
190193
:raises: :class:`ValueError` if sample.content is not a file-like
191194
object. :class:`ValueError` if stream has closed.
192195
193-
:rtype: :class:`~google.cloud.grpc.speech.v1beta1\
196+
:rtype: :class:`~google.cloud.grpc.speech.v1\
194197
.cloud_speech_pb2.StreamingRecognizeResponse`
195198
:returns: ``StreamingRecognizeResponse`` instances.
196199
"""
@@ -200,29 +203,28 @@ def streaming_recognize(self, sample, language_code=None,
200203
requests = _stream_requests(sample, language_code=language_code,
201204
max_alternatives=max_alternatives,
202205
profanity_filter=profanity_filter,
203-
speech_context=speech_context,
206+
speech_contexts=speech_contexts,
204207
single_utterance=single_utterance,
205208
interim_results=interim_results)
206209
api = self._gapic_api
207210
responses = api.streaming_recognize(requests)
208211
return responses
209212

210-
def sync_recognize(self, sample, language_code=None, max_alternatives=None,
211-
profanity_filter=None, speech_context=None):
213+
def recognize(self, sample, language_code, max_alternatives=None,
214+
profanity_filter=None, speech_contexts=()):
212215
"""Synchronous Speech Recognition.
213216
214-
.. _sync_recognize: https://cloud.google.com/speech/reference/\
215-
rest/v1beta1/speech/syncrecognize
217+
.. _recognize: https://cloud.google.com/speech/reference/\
218+
rest/v1/speech/recognize
216219
217-
See `sync_recognize`_.
220+
See `recognize`_.
218221
219222
:type sample: :class:`~google.cloud.speech.sample.Sample`
220223
:param sample: Instance of ``Sample`` containing audio information.
221224
222225
:type language_code: str
223-
:param language_code: (Optional) The language of the supplied audio as
224-
BCP-47 language tag. Example: ``'en-GB'``.
225-
If omitted, defaults to ``'en-US'``.
226+
:param language_code: The language of the supplied audio as
227+
BCP-47 language tag. Example: ``'en-US'``.
226228
227229
:type max_alternatives: int
228230
:param max_alternatives: (Optional) Maximum number of recognition
@@ -238,8 +240,8 @@ def sync_recognize(self, sample, language_code=None, max_alternatives=None,
238240
asterisks, e.g. ``'f***'``. If False or
239241
omitted, profanities won't be filtered out.
240242
241-
:type speech_context: list
242-
:param speech_context: A list of strings (max 50) containing words and
243+
:type speech_contexts: list
244+
:param speech_contexts: A list of strings (max 50) containing words and
243245
phrases "hints" so that the speech recognition
244246
is more likely to recognize them. This can be
245247
used to improve the accuracy for specific words
@@ -252,14 +254,17 @@ def sync_recognize(self, sample, language_code=None, max_alternatives=None,
252254
:raises: ValueError if there are no results.
253255
"""
254256
config = RecognitionConfig(
255-
encoding=sample.encoding, sample_rate=sample.sample_rate,
256-
language_code=language_code, max_alternatives=max_alternatives,
257+
encoding=sample.encoding,
258+
language_code=language_code,
259+
max_alternatives=max_alternatives,
257260
profanity_filter=profanity_filter,
258-
speech_context=SpeechContext(phrases=speech_context))
261+
sample_rate_hertz=sample.sample_rate_hertz,
262+
speech_contexts=[SpeechContext(phrases=speech_contexts)],
263+
)
259264
audio = RecognitionAudio(content=sample.content,
260265
uri=sample.source_uri)
261266
api = self._gapic_api
262-
api_response = api.sync_recognize(config=config, audio=audio)
267+
api_response = api.recognize(config=config, audio=audio)
263268

264269
# Sanity check: If we got no results back, raise an error.
265270
if len(api_response.results) == 0:
@@ -269,18 +274,17 @@ def sync_recognize(self, sample, language_code=None, max_alternatives=None,
269274
return [Result.from_pb(result) for result in api_response.results]
270275

271276

272-
def _stream_requests(sample, language_code=None, max_alternatives=None,
273-
profanity_filter=None, speech_context=None,
277+
def _stream_requests(sample, language_code, max_alternatives=None,
278+
profanity_filter=None, speech_contexts=(),
274279
single_utterance=None, interim_results=None):
275280
"""Generate stream of requests from sample.
276281
277282
:type sample: :class:`~google.cloud.speech.sample.Sample`
278283
:param sample: Instance of ``Sample`` containing audio information.
279284
280285
:type language_code: str
281-
:param language_code: (Optional) The language of the supplied audio as
282-
BCP-47 language tag. Example: ``'en-GB'``.
283-
If omitted, defaults to ``'en-US'``.
286+
:param language_code: The language of the supplied audio as
287+
BCP-47 language tag. Example: ``'en-US'``.
284288
285289
:type max_alternatives: int
286290
:param max_alternatives: (Optional) Maximum number of recognition
@@ -296,13 +300,14 @@ def _stream_requests(sample, language_code=None, max_alternatives=None,
296300
asterisks, e.g. ``'f***'``. If False or
297301
omitted, profanities won't be filtered out.
298302
299-
:type speech_context: list
300-
:param speech_context: (Optional) A list of strings (max 50) containing
301-
words and phrases "hints" so that the speech
302-
recognition is more likely to recognize them.
303-
This can be used to improve the accuracy for
304-
specific words and phrases. This can also be used to
305-
add new words to the vocabulary of the recognizer.
303+
:type speech_contexts: list
304+
:param speech_contexts: (Optional) A list of strings (max 50) containing
305+
words and phrases "hints" so that the speech
306+
recognition is more likely to recognize them.
307+
This can be used to improve the accuracy for
308+
specific words and phrases. This can also be used
309+
to add new words to the vocabulary of the
310+
recognizer.
306311
307312
:type single_utterance: bool
308313
:param single_utterance: (Optional) If false or omitted, the recognizer
@@ -333,7 +338,7 @@ def _stream_requests(sample, language_code=None, max_alternatives=None,
333338
config_request = _make_streaming_request(
334339
sample, language_code=language_code, max_alternatives=max_alternatives,
335340
profanity_filter=profanity_filter,
336-
speech_context=SpeechContext(phrases=speech_context),
341+
speech_contexts=[SpeechContext(phrases=speech_contexts)],
337342
single_utterance=single_utterance, interim_results=interim_results)
338343

339344
# The config request MUST go first and not contain any audio data.
@@ -348,7 +353,7 @@ def _stream_requests(sample, language_code=None, max_alternatives=None,
348353

349354
def _make_streaming_request(sample, language_code,
350355
max_alternatives, profanity_filter,
351-
speech_context, single_utterance,
356+
speech_contexts, single_utterance,
352357
interim_results):
353358
"""Build streaming request.
354359
@@ -374,8 +379,8 @@ def _make_streaming_request(sample, language_code,
374379
asterisks, e.g. ``'f***'``. If False or
375380
omitted, profanities won't be filtered out.
376381
377-
:type speech_context: list
378-
:param speech_context: A list of strings (max 50) containing words and
382+
:type speech_contexts: list
383+
:param speech_contexts: A list of strings (max 50) containing words and
379384
phrases "hints" so that the speech recognition
380385
is more likely to recognize them. This can be
381386
used to improve the accuracy for specific words
@@ -409,13 +414,17 @@ def _make_streaming_request(sample, language_code,
409414
returned.
410415
411416
:rtype:
412-
:class:`~grpc.speech.v1beta1.cloud_speech_pb2.StreamingRecognizeRequest`
417+
:class:`~grpc.speech.v1.cloud_speech_pb2.StreamingRecognizeRequest`
413418
:returns: Instance of ``StreamingRecognizeRequest``.
414419
"""
415420
config = RecognitionConfig(
416-
encoding=sample.encoding, sample_rate=sample.sample_rate,
417-
language_code=language_code, max_alternatives=max_alternatives,
418-
profanity_filter=profanity_filter, speech_context=speech_context)
421+
encoding=sample.encoding,
422+
language_code=language_code,
423+
max_alternatives=max_alternatives,
424+
profanity_filter=profanity_filter,
425+
sample_rate_hertz=sample.sample_rate_hertz,
426+
speech_contexts=speech_contexts,
427+
)
419428

420429
streaming_config = StreamingRecognitionConfig(
421430
config=config, single_utterance=single_utterance,

0 commit comments

Comments
 (0)