
Commit 49229af

Merge pull request #2680 from daspecster/add-speech-streaming-gapic
Add speech streaming recognition.
2 parents: 6eebe67 + e0ce461

4 files changed: +476 −25 lines

packages/google-cloud-speech/google/cloud/speech/_gax.py

+91
@@ -106,6 +106,97 @@ def async_recognize(self, sample, language_code=None,
 
         return Operation.from_pb(response, self)
 
+    def streaming_recognize(self, sample, language_code=None,
+                            max_alternatives=None, profanity_filter=None,
+                            speech_context=None, single_utterance=False,
+                            interim_results=False):
+        """Streaming speech recognition.
+
+        .. note::
+
+            Streaming recognition requests are limited to 1 minute of audio.
+            See: https://cloud.google.com/speech/limits#content
+
+        Yields :class:`~streaming_response.StreamingSpeechResponse` containing
+        results and metadata from the streaming request.
+
+        :type sample: :class:`~google.cloud.speech.sample.Sample`
+        :param sample: Instance of ``Sample`` containing audio information.
+
+        :type language_code: str
+        :param language_code: (Optional) The language of the supplied audio as
+                              BCP-47 language tag. Example: ``'en-GB'``.
+                              If omitted, defaults to ``'en-US'``.
+
+        :type max_alternatives: int
+        :param max_alternatives: (Optional) Maximum number of recognition
+                                 hypotheses to be returned. The server may
+                                 return fewer than maxAlternatives.
+                                 Valid values are 0-30. A value of 0 or 1
+                                 will return a maximum of 1. Defaults to 1
+
+        :type profanity_filter: bool
+        :param profanity_filter: If True, the server will attempt to filter
+                                 out profanities, replacing all but the
+                                 initial character in each filtered word with
+                                 asterisks, e.g. ``'f***'``. If False or
+                                 omitted, profanities won't be filtered out.
+
+        :type speech_context: list
+        :param speech_context: A list of strings (max 50) containing words and
+                               phrases "hints" so that the speech recognition
+                               is more likely to recognize them. This can be
+                               used to improve the accuracy for specific words
+                               and phrases. This can also be used to add new
+                               words to the vocabulary of the recognizer.
+
+        :type single_utterance: bool
+        :param single_utterance: (Optional) If false or omitted, the recognizer
+                                 will perform continuous recognition
+                                 (continuing to process audio even if the user
+                                 pauses speaking) until the client closes the
+                                 output stream (gRPC API) or when the maximum
+                                 time limit has been reached. Multiple
+                                 SpeechRecognitionResults with the is_final
+                                 flag set to true may be returned.
+                                 If true, the recognizer will detect a single
+                                 spoken utterance. When it detects that the
+                                 user has paused or stopped speaking, it will
+                                 return an END_OF_UTTERANCE event and cease
+                                 recognition. It will return no more than one
+                                 SpeechRecognitionResult with the is_final flag
+                                 set to true.
+
+        :type interim_results: bool
+        :param interim_results: (Optional) If true, interim results (tentative
+                                hypotheses) may be returned as they become
+                                available (these interim results are indicated
+                                with the is_final=false flag). If false or
+                                omitted, only is_final=true result(s) are
+                                returned.
+
+        :raises: :class:`ValueError` if sample.content is not a file-like
+                 object. :class:`ValueError` if stream has closed.
+
+        :rtype: :class:`~google.cloud.grpc.speech.v1beta1\
+            .cloud_speech_pb2.StreamingRecognizeResponse`
+        :returns: ``StreamingRecognizeResponse`` instances.
+        """
+        if getattr(sample.content, 'closed', None) is None:
+            raise ValueError('Please use file-like object for data stream.')
+        if sample.content.closed:
+            raise ValueError('Stream is closed.')
+
+        requests = _stream_requests(sample, language_code=language_code,
+                                    max_alternatives=max_alternatives,
+                                    profanity_filter=profanity_filter,
+                                    speech_context=speech_context,
+                                    single_utterance=single_utterance,
+                                    interim_results=interim_results)
+        api = self._gapic_api
+        responses = api.streaming_recognize(requests)
+        return responses
+
     def sync_recognize(self, sample, language_code=None, max_alternatives=None,
                        profanity_filter=None, speech_context=None):
         """Synchronous Speech Recognition.

packages/google-cloud-speech/google/cloud/speech/client.py

+86
@@ -159,6 +159,92 @@ def speech_api(self):
                 self._speech_api = _JSONSpeechAPI(self)
         return self._speech_api
 
+    def streaming_recognize(self, sample, language_code=None,
+                            max_alternatives=None, profanity_filter=None,
+                            speech_context=None, single_utterance=False,
+                            interim_results=False):
+        """Streaming speech recognition.
+
+        .. note::
+
+            Streaming recognition requests are limited to 1 minute of audio.
+            See: https://cloud.google.com/speech/limits#content
+
+        Yields: list of :class:`~google.cloud.speech.alternative.Alternatives`
+        containing results and metadata from the streaming request.
+
+        :type sample: :class:`~google.cloud.speech.sample.Sample`
+        :param sample: Instance of ``Sample`` containing audio information.
+
+        :type language_code: str
+        :param language_code: (Optional) The language of the supplied audio as
+                              BCP-47 language tag. Example: ``'en-GB'``.
+                              If omitted, defaults to ``'en-US'``.
+
+        :type max_alternatives: int
+        :param max_alternatives: (Optional) Maximum number of recognition
+                                 hypotheses to be returned. The server may
+                                 return fewer than maxAlternatives.
+                                 Valid values are 0-30. A value of 0 or 1
+                                 will return a maximum of 1. Defaults to 1
+
+        :type profanity_filter: bool
+        :param profanity_filter: If True, the server will attempt to filter
+                                 out profanities, replacing all but the
+                                 initial character in each filtered word with
+                                 asterisks, e.g. ``'f***'``. If False or
+                                 omitted, profanities won't be filtered out.
+
+        :type speech_context: list
+        :param speech_context: A list of strings (max 50) containing words and
+                               phrases "hints" so that the speech recognition
+                               is more likely to recognize them. This can be
+                               used to improve the accuracy for specific words
+                               and phrases. This can also be used to add new
+                               words to the vocabulary of the recognizer.
+
+        :type single_utterance: bool
+        :param single_utterance: (Optional) If false or omitted, the recognizer
+                                 will perform continuous recognition
+                                 (continuing to process audio even if the user
+                                 pauses speaking) until the client closes the
+                                 output stream (gRPC API) or when the maximum
+                                 time limit has been reached. Multiple
+                                 SpeechRecognitionResults with the is_final
+                                 flag set to true may be returned.
+                                 If true, the recognizer will detect a single
+                                 spoken utterance. When it detects that the
+                                 user has paused or stopped speaking, it will
+                                 return an END_OF_UTTERANCE event and cease
+                                 recognition. It will return no more than one
+                                 SpeechRecognitionResult with the is_final flag
+                                 set to true.
+
+        :type interim_results: bool
+        :param interim_results: (Optional) If true, interim results (tentative
+                                hypotheses) may be returned as they become
+                                available (these interim results are indicated
+                                with the ``is_final=False`` flag). If false or
+                                omitted, only is_final=true result(s) are
+                                returned.
+
+        :raises: EnvironmentError if gRPC is not available.
+        """
+        if not self._use_gax:
+            raise EnvironmentError('gRPC is required to use this API.')
+
+        responses = self.speech_api.streaming_recognize(sample, language_code,
+                                                         max_alternatives,
+                                                         profanity_filter,
+                                                         speech_context,
+                                                         single_utterance,
+                                                         interim_results)
+        for response in responses:
+            for result in response.results:
+                if result.is_final or interim_results:
+                    yield [Alternative.from_pb(alternative)
+                           for alternative in result.alternatives]
+
     def sync_recognize(self, sample, language_code=None,
                        max_alternatives=None, profanity_filter=None,
                        speech_context=None):
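A minimal usage sketch for the generator added above (not part of this commit; the audio file name, sample rate, and ambient credentials are assumptions, and ``Alternative`` is assumed to expose a ``transcript`` property as it does for the synchronous API):

    from google.cloud import speech
    from google.cloud.speech.sample import Sample

    client = speech.Client(use_gax=True)
    with open('hello.flac', 'rb') as stream:
        sample = Sample(content=stream, encoding=speech.Encoding.FLAC,
                        sample_rate=16000)
        # Each yielded item is a list of Alternative objects for one result;
        # only is_final results come back unless interim_results=True is passed.
        for alternatives in client.streaming_recognize(sample):
            for alternative in alternatives:
                print(alternative.transcript)

Because streaming requests are capped at one minute of audio (per the docstring note), the supplied stream should be short-lived.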

packages/google-cloud-speech/unit_tests/test__gax.py

+36
@@ -15,6 +15,35 @@
 import unittest
 
 
+class TestGAPICSpeechAPI(unittest.TestCase):
+    SAMPLE_RATE = 16000
+
+    def _getTargetClass(self):
+        from google.cloud.speech._gax import GAPICSpeechAPI
+
+        return GAPICSpeechAPI
+
+    def _makeOne(self, *args, **kw):
+        return self._getTargetClass()(*args, **kw)
+
+    def test_use_bytes_instead_of_file_like_object(self):
+        from google.cloud import speech
+        from google.cloud.speech.sample import Sample
+
+        credentials = {}
+        client = speech.Client(credentials=credentials, use_gax=True)
+        client.connection = _Connection()
+        client.connection.credentials = credentials
+
+        sample = Sample(content=b'', encoding=speech.Encoding.FLAC,
+                        sample_rate=self.SAMPLE_RATE)
+
+        api = self._makeOne(client)
+        with self.assertRaises(ValueError):
+            api.streaming_recognize(sample)
+        self.assertEqual(client.connection._requested, [])
+
+
 class TestSpeechGAXMakeRequests(unittest.TestCase):
     SAMPLE_RATE = 16000
     HINTS = ['hi']
@@ -137,3 +166,10 @@ def test_stream_requests(self):
         self.assertEqual(streaming_request.audio_content, self.AUDIO_CONTENT)
         self.assertIsInstance(config_request.streaming_config,
                               StreamingRecognitionConfig)
+
+
+class _Connection(object):
+
+    def __init__(self, *responses):
+        self._responses = responses
+        self._requested = []
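The new test covers the bytes-content guard; a companion method on ``TestGAPICSpeechAPI`` could exercise the other ``ValueError`` branch, the closed-stream check. A hypothetical sketch (not part of this commit):

    def test_streaming_with_closed_stream(self):
        import io

        from google.cloud import speech
        from google.cloud.speech.sample import Sample

        credentials = {}
        client = speech.Client(credentials=credentials, use_gax=True)
        client.connection = _Connection()
        client.connection.credentials = credentials

        # A closed file-like object passes the 'closed'-attribute check but
        # fails the second guard, so no request should ever be issued.
        stream = io.BytesIO(b'')
        stream.close()
        sample = Sample(content=stream, encoding=speech.Encoding.FLAC,
                        sample_rate=self.SAMPLE_RATE)

        api = self._makeOne(client)
        with self.assertRaises(ValueError):
            api.streaming_recognize(sample)
        self.assertEqual(client.connection._requested, [])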
