Skip to content

Commit

Permalink
Merge pull request #2680 from daspecster/add-speech-streaming-gapic
Browse files Browse the repository at this point in the history
Add speech streaming recognition.
  • Loading branch information
daspecster authored Nov 8, 2016
2 parents 8a01e38 + 9f47e5d commit 1dd7032
Show file tree
Hide file tree
Showing 6 changed files with 603 additions and 25 deletions.
80 changes: 80 additions & 0 deletions docs/speech-usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,5 +151,85 @@ words to the vocabulary of the recognizer.
transcript: Hello, this is a test
confidence: 0.81
Streaming Recognition
---------------------

The :meth:`~google.cloud.speech.Client.streaming_recognize` method converts
speech data to possible text alternatives on the fly.

.. note::

    Streaming recognition requests are limited to 1 minute of audio.

    See: https://cloud.google.com/speech/limits#content

.. code-block:: python

    >>> from google.cloud import speech
    >>> client = speech.Client()
    >>> with open('./hello.wav', 'rb') as stream:
    ...     sample = client.sample(content=stream,
    ...                            encoding=speech.Encoding.LINEAR16,
    ...                            sample_rate=16000)
    ...     # Each item is the list of alternatives for one result.
    ...     results = list(client.streaming_recognize(sample))
    >>> print(results[0][0].transcript)
    hello
    >>> print(results[0][0].confidence)
    0.973458576

By default the API will perform continuous recognition
(continuing to process audio even if the speaker in the audio pauses speaking)
until the client closes the output stream or until the maximum time limit has
been reached.

If you only want to recognize a single utterance you can set
``single_utterance`` to :data:`True` and only one result will be returned.

See: `Single Utterance`_

.. code-block:: python

    >>> with open('./hello_pause_goodbye.wav', 'rb') as stream:
    ...     sample = client.sample(content=stream,
    ...                            encoding=speech.Encoding.LINEAR16,
    ...                            sample_rate=16000)
    ...     responses = client.streaming_recognize(sample,
    ...                                            single_utterance=True)
    ...     # Each item is the list of alternatives for one result.
    ...     results = list(responses)
    >>> print(results[0][0].transcript)
    hello
    >>> print(results[0][0].confidence)
    0.96523453546

If ``interim_results`` is set to :data:`True`, interim results
(tentative hypotheses) may be returned as they become available.

.. code-block:: python

    >>> from google.cloud import speech
    >>> client = speech.Client()
    >>> with open('./hello.wav', 'rb') as stream:
    ...     sample = client.sample(content=stream,
    ...                            encoding=speech.Encoding.LINEAR16,
    ...                            sample_rate=16000)
    ...     for alternatives in client.streaming_recognize(sample,
    ...                                                    interim_results=True):
    ...         print('=' * 20)
    ...         print(alternatives[0].transcript)
    ...         print(alternatives[0].confidence)
    ====================
    he
    None
    ====================
    hell
    None
    ====================
    hello
    0.973458576

.. _Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig
.. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize
.. _Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize
91 changes: 91 additions & 0 deletions speech/google/cloud/speech/_gax.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,97 @@ def async_recognize(self, sample, language_code=None,

return Operation.from_pb(response, self)

def streaming_recognize(self, sample, language_code=None,
                        max_alternatives=None, profanity_filter=None,
                        speech_context=None, single_utterance=False,
                        interim_results=False):
    """Start a streaming recognition call through the GAPIC client.

    The audio stream is validated first, then a request iterator is built
    with ``_stream_requests`` and handed to the GAPIC
    ``streaming_recognize`` call, whose return value is passed back
    unchanged.

    .. note::

        Streaming recognition requests are limited to 1 minute of audio.

        See: https://cloud.google.com/speech/limits#content

    :type sample: :class:`~google.cloud.speech.sample.Sample`
    :param sample: Instance of ``Sample`` containing audio information.
                   ``sample.content`` must be an open file-like object.

    :type language_code: str
    :param language_code: (Optional) The language of the supplied audio as
                          BCP-47 language tag. Example: ``'en-GB'``.
                          If omitted, defaults to ``'en-US'``.

    :type max_alternatives: int
    :param max_alternatives: (Optional) Maximum number of recognition
                             hypotheses to be returned. The server may
                             return fewer than maxAlternatives.
                             Valid values are 0-30. A value of 0 or 1
                             will return a maximum of 1. Defaults to 1

    :type profanity_filter: bool
    :param profanity_filter: If True, the server will attempt to filter
                             out profanities, replacing all but the
                             initial character in each filtered word with
                             asterisks, e.g. ``'f***'``. If False or
                             omitted, profanities won't be filtered out.

    :type speech_context: list
    :param speech_context: A list of strings (max 50) containing words and
                           phrases "hints" so that the speech recognition
                           is more likely to recognize them. This can be
                           used to improve the accuracy for specific words
                           and phrases. This can also be used to add new
                           words to the vocabulary of the recognizer.

    :type single_utterance: bool
    :param single_utterance: (Optional) If false or omitted, the recognizer
                             performs continuous recognition (it keeps
                             processing audio even if the user pauses
                             speaking) until the client closes the output
                             stream (gRPC API) or the maximum time limit
                             has been reached. Multiple
                             SpeechRecognitionResults with the is_final
                             flag set to true may be returned.
                             If true, the recognizer detects a single
                             spoken utterance. When it detects that the
                             user has paused or stopped speaking, it
                             returns an END_OF_UTTERANCE event and ceases
                             recognition. It returns no more than one
                             SpeechRecognitionResult with the is_final
                             flag set to true.

    :type interim_results: bool
    :param interim_results: (Optional) If true, interim results (tentative
                            hypotheses) may be returned as they become
                            available (these interim results are indicated
                            with the is_final=false flag). If false or
                            omitted, only is_final=true result(s) are
                            returned.

    :raises: :class:`ValueError` if ``sample.content`` is not a file-like
             object (it has no ``closed`` attribute).
             :class:`ValueError` if the stream has already been closed.

    :rtype: :class:`~google.cloud.grpc.speech.v1beta1\
            .cloud_speech_pb2.StreamingRecognizeResponse`
    :returns: Whatever the GAPIC ``streaming_recognize`` call returns;
              documented as ``StreamingRecognizeResponse`` instances.
    """
    # A missing ``closed`` attribute (e.g. raw bytes) is treated as
    # "not a file-like object".
    closed = getattr(sample.content, 'closed', None)
    if closed is None:
        raise ValueError('Please use file-like object for data stream.')
    if closed:
        raise ValueError('Stream is closed.')

    request_options = {
        'language_code': language_code,
        'max_alternatives': max_alternatives,
        'profanity_filter': profanity_filter,
        'speech_context': speech_context,
        'single_utterance': single_utterance,
        'interim_results': interim_results,
    }
    request_iterator = _stream_requests(sample, **request_options)
    return self._gapic_api.streaming_recognize(request_iterator)

def sync_recognize(self, sample, language_code=None, max_alternatives=None,
profanity_filter=None, speech_context=None):
"""Synchronous Speech Recognition.
Expand Down
86 changes: 86 additions & 0 deletions speech/google/cloud/speech/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,92 @@ def speech_api(self):
self._speech_api = _JSONSpeechAPI(self)
return self._speech_api

def streaming_recognize(self, sample, language_code=None,
                        max_alternatives=None, profanity_filter=None,
                        speech_context=None, single_utterance=False,
                        interim_results=False):
    """Streaming speech recognition.

    The gRPC requirement is checked when this method is called, so a
    missing gRPC transport is reported immediately rather than on the
    first iteration of the result. The request itself is still issued
    lazily, when the returned generator is first iterated.

    .. note::

        Streaming recognition requests are limited to 1 minute of audio.

        See: https://cloud.google.com/speech/limits#content

    Yields: list of :class:`~google.cloud.speech.alternative.Alternatives`
            containing results and metadata from the streaming request.

    :type sample: :class:`~google.cloud.speech.sample.Sample`
    :param sample: Instance of ``Sample`` containing audio information.

    :type language_code: str
    :param language_code: (Optional) The language of the supplied audio as
                          BCP-47 language tag. Example: ``'en-GB'``.
                          If omitted, defaults to ``'en-US'``.

    :type max_alternatives: int
    :param max_alternatives: (Optional) Maximum number of recognition
                             hypotheses to be returned. The server may
                             return fewer than maxAlternatives.
                             Valid values are 0-30. A value of 0 or 1
                             will return a maximum of 1. Defaults to 1

    :type profanity_filter: bool
    :param profanity_filter: If True, the server will attempt to filter
                             out profanities, replacing all but the
                             initial character in each filtered word with
                             asterisks, e.g. ``'f***'``. If False or
                             omitted, profanities won't be filtered out.

    :type speech_context: list
    :param speech_context: A list of strings (max 50) containing words and
                           phrases "hints" so that the speech recognition
                           is more likely to recognize them. This can be
                           used to improve the accuracy for specific words
                           and phrases. This can also be used to add new
                           words to the vocabulary of the recognizer.

    :type single_utterance: bool
    :param single_utterance: (Optional) If false or omitted, the recognizer
                             will perform continuous recognition
                             (continuing to process audio even if the user
                             pauses speaking) until the client closes the
                             output stream (gRPC API) or when the maximum
                             time limit has been reached. Multiple
                             SpeechRecognitionResults with the is_final
                             flag set to true may be returned.
                             If true, the recognizer will detect a single
                             spoken utterance. When it detects that the
                             user has paused or stopped speaking, it will
                             return an END_OF_UTTERANCE event and cease
                             recognition. It will return no more than one
                             SpeechRecognitionResult with the is_final flag
                             set to true.

    :type interim_results: bool
    :param interim_results: (Optional) If true, interim results (tentative
                            hypotheses) may be returned as they become
                            available (these interim results are indicated
                            with the ``is_final=False`` flag). If false or
                            omitted, only ``is_final=True`` result(s) are
                            returned.

    :raises: EnvironmentError if gRPC is not available.

    :rtype: generator
    :returns: Generator yielding, for each accepted result, a list of
              alternatives built with ``Alternative.from_pb``.
    """
    # Validate eagerly: inside a generator body this check would not run
    # until the caller started iterating.
    if not self._use_gax:
        raise EnvironmentError('gRPC is required to use this API.')

    def _iter_alternatives():
        # The API call stays inside the generator so that the request is
        # only issued once the caller begins consuming results.
        responses = self.speech_api.streaming_recognize(sample,
                                                        language_code,
                                                        max_alternatives,
                                                        profanity_filter,
                                                        speech_context,
                                                        single_utterance,
                                                        interim_results)
        for response in responses:
            for result in response.results:
                # Non-final results are only surfaced when the caller
                # asked for interim results.
                if result.is_final or interim_results:
                    yield [Alternative.from_pb(alternative)
                           for alternative in result.alternatives]

    return _iter_alternatives()

def sync_recognize(self, sample, language_code=None,
max_alternatives=None, profanity_filter=None,
speech_context=None):
Expand Down
36 changes: 36 additions & 0 deletions speech/unit_tests/test__gax.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,35 @@
import unittest


# Unit tests for ``GAPICSpeechAPI`` from ``google.cloud.speech._gax``.
class TestGAPICSpeechAPI(unittest.TestCase):
# Sample rate handed to the ``Sample`` built in the test below.
SAMPLE_RATE = 16000

def _getTargetClass(self):
# Return the class under test; the import happens inside the helper,
# not at module import time.
from google.cloud.speech._gax import GAPICSpeechAPI

return GAPICSpeechAPI

def _makeOne(self, *args, **kw):
# Build an instance of the class under test, forwarding all arguments.
return self._getTargetClass()(*args, **kw)

def test_use_bytes_instead_of_file_like_object(self):
# ``streaming_recognize`` is expected to raise ``ValueError`` when the
# sample content is a bytes object rather than a file-like stream.
from google.cloud import speech
from google.cloud.speech.sample import Sample

credentials = {}
client = speech.Client(credentials=credentials, use_gax=True)
# Swap in the local stub connection so requests can be inspected.
client.connection = _Connection()
client.connection.credentials = credentials

# Empty bytes content: not a file-like object.
sample = Sample(content=b'', encoding=speech.Encoding.FLAC,
sample_rate=self.SAMPLE_RATE)

api = self._makeOne(client)
with self.assertRaises(ValueError):
api.streaming_recognize(sample)
# NOTE(review): indentation is lost in this view, so it is unclear whether
# the assertion below sits inside the ``assertRaises`` block. If it does,
# it never executes because the call above raises first -- confirm.
self.assertEqual(client.connection._requested, [])


class TestSpeechGAXMakeRequests(unittest.TestCase):
SAMPLE_RATE = 16000
HINTS = ['hi']
Expand Down Expand Up @@ -137,3 +166,10 @@ def test_stream_requests(self):
self.assertEqual(streaming_request.audio_content, self.AUDIO_CONTENT)
self.assertIsInstance(config_request.streaming_config,
StreamingRecognitionConfig)


class _Connection(object):

def __init__(self, *responses):
self._responses = responses
self._requested = []
Loading

0 comments on commit 1dd7032

Please sign in to comment.