
Commit 49229af

Merge pull request #2680 from daspecster/add-speech-streaming-gapic
Add speech streaming recognition.
2 parents: 6eebe67 + e0ce461

4 files changed: +476 −25 lines

packages/google-cloud-speech/google/cloud/speech/_gax.py

+91
@@ -106,6 +106,97 @@ def async_recognize(self, sample, language_code=None,
 
         return Operation.from_pb(response, self)
 
+    def streaming_recognize(self, sample, language_code=None,
+                            max_alternatives=None, profanity_filter=None,
+                            speech_context=None, single_utterance=False,
+                            interim_results=False):
+        """Streaming speech recognition.
+
+        .. note::
+
+            Streaming recognition requests are limited to 1 minute of audio.
+            See: https://cloud.google.com/speech/limits#content
+
+        Yields :class:`~streaming_response.StreamingSpeechResponse` containing
+        results and metadata from the streaming request.
+
+        :type sample: :class:`~google.cloud.speech.sample.Sample`
+        :param sample: Instance of ``Sample`` containing audio information.
+
+        :type language_code: str
+        :param language_code: (Optional) The language of the supplied audio as
+                              BCP-47 language tag. Example: ``'en-GB'``.
+                              If omitted, defaults to ``'en-US'``.
+
+        :type max_alternatives: int
+        :param max_alternatives: (Optional) Maximum number of recognition
+                                 hypotheses to be returned. The server may
+                                 return fewer than maxAlternatives.
+                                 Valid values are 0-30. A value of 0 or 1
+                                 will return a maximum of 1. Defaults to 1
+
+        :type profanity_filter: bool
+        :param profanity_filter: If True, the server will attempt to filter
+                                 out profanities, replacing all but the
+                                 initial character in each filtered word with
+                                 asterisks, e.g. ``'f***'``. If False or
+                                 omitted, profanities won't be filtered out.
+
+        :type speech_context: list
+        :param speech_context: A list of strings (max 50) containing words and
+                               phrases "hints" so that the speech recognition
+                               is more likely to recognize them. This can be
+                               used to improve the accuracy for specific words
+                               and phrases. This can also be used to add new
+                               words to the vocabulary of the recognizer.
+
+        :type single_utterance: bool
+        :param single_utterance: (Optional) If false or omitted, the recognizer
+                                 will perform continuous recognition
+                                 (continuing to process audio even if the user
+                                 pauses speaking) until the client closes the
+                                 output stream (gRPC API) or when the maximum
+                                 time limit has been reached. Multiple
+                                 SpeechRecognitionResults with the is_final
+                                 flag set to true may be returned.
+                                 If true, the recognizer will detect a single
+                                 spoken utterance. When it detects that the
+                                 user has paused or stopped speaking, it will
+                                 return an END_OF_UTTERANCE event and cease
+                                 recognition. It will return no more than one
+                                 SpeechRecognitionResult with the is_final flag
+                                 set to true.
+
+        :type interim_results: bool
+        :param interim_results: (Optional) If true, interim results (tentative
+                                hypotheses) may be returned as they become
+                                available (these interim results are indicated
+                                with the is_final=false flag). If false or
+                                omitted, only is_final=true result(s) are
+                                returned.
+
+        :raises: :class:`ValueError` if sample.content is not a file-like
+                 object. :class:`ValueError` if stream has closed.
+
+        :rtype: :class:`~google.cloud.grpc.speech.v1beta1\
+            .cloud_speech_pb2.StreamingRecognizeResponse`
+        :returns: ``StreamingRecognizeResponse`` instances.
+        """
+        if getattr(sample.content, 'closed', None) is None:
+            raise ValueError('Please use file-like object for data stream.')
+        if sample.content.closed:
+            raise ValueError('Stream is closed.')
+
+        requests = _stream_requests(sample, language_code=language_code,
+                                    max_alternatives=max_alternatives,
+                                    profanity_filter=profanity_filter,
+                                    speech_context=speech_context,
+                                    single_utterance=single_utterance,
+                                    interim_results=interim_results)
+        api = self._gapic_api
+        responses = api.streaming_recognize(requests)
+        return responses
+
     def sync_recognize(self, sample, language_code=None, max_alternatives=None,
                        profanity_filter=None, speech_context=None):
         """Synchronous Speech Recognition.

packages/google-cloud-speech/google/cloud/speech/client.py

+86
@@ -159,6 +159,92 @@ def speech_api(self):
                 self._speech_api = _JSONSpeechAPI(self)
         return self._speech_api
 
+    def streaming_recognize(self, sample, language_code=None,
+                            max_alternatives=None, profanity_filter=None,
+                            speech_context=None, single_utterance=False,
+                            interim_results=False):
+        """Streaming speech recognition.
+
+        .. note::
+
+            Streaming recognition requests are limited to 1 minute of audio.
+            See: https://cloud.google.com/speech/limits#content
+
+        Yields: list of :class:`~google.cloud.speech.alternative.Alternatives`
+        containing results and metadata from the streaming request.
+
+        :type sample: :class:`~google.cloud.speech.sample.Sample`
+        :param sample: Instance of ``Sample`` containing audio information.
+
+        :type language_code: str
+        :param language_code: (Optional) The language of the supplied audio as
+                              BCP-47 language tag. Example: ``'en-GB'``.
+                              If omitted, defaults to ``'en-US'``.
+
+        :type max_alternatives: int
+        :param max_alternatives: (Optional) Maximum number of recognition
+                                 hypotheses to be returned. The server may
+                                 return fewer than maxAlternatives.
+                                 Valid values are 0-30. A value of 0 or 1
+                                 will return a maximum of 1. Defaults to 1
+
+        :type profanity_filter: bool
+        :param profanity_filter: If True, the server will attempt to filter
+                                 out profanities, replacing all but the
+                                 initial character in each filtered word with
+                                 asterisks, e.g. ``'f***'``. If False or
+                                 omitted, profanities won't be filtered out.
+
+        :type speech_context: list
+        :param speech_context: A list of strings (max 50) containing words and
+                               phrases "hints" so that the speech recognition
+                               is more likely to recognize them. This can be
+                               used to improve the accuracy for specific words
+                               and phrases. This can also be used to add new
+                               words to the vocabulary of the recognizer.
+
+        :type single_utterance: bool
+        :param single_utterance: (Optional) If false or omitted, the recognizer
+                                 will perform continuous recognition
+                                 (continuing to process audio even if the user
+                                 pauses speaking) until the client closes the
+                                 output stream (gRPC API) or when the maximum
+                                 time limit has been reached. Multiple
+                                 SpeechRecognitionResults with the is_final
+                                 flag set to true may be returned.
+                                 If true, the recognizer will detect a single
+                                 spoken utterance. When it detects that the
+                                 user has paused or stopped speaking, it will
+                                 return an END_OF_UTTERANCE event and cease
+                                 recognition. It will return no more than one
+                                 SpeechRecognitionResult with the is_final flag
+                                 set to true.
+
+        :type interim_results: bool
+        :param interim_results: (Optional) If true, interim results (tentative
+                                hypotheses) may be returned as they become
+                                available (these interim results are indicated
+                                with the ``is_final=False`` flag). If false or
+                                omitted, only is_final=true result(s) are
+                                returned.
+
+        :raises: EnvironmentError if gRPC is not available.
+        """
+        if not self._use_gax:
+            raise EnvironmentError('gRPC is required to use this API.')
+
+        responses = self.speech_api.streaming_recognize(sample, language_code,
+                                                         max_alternatives,
+                                                         profanity_filter,
+                                                         speech_context,
+                                                         single_utterance,
+                                                         interim_results)
+        for response in responses:
+            for result in response.results:
+                if result.is_final or interim_results:
+                    yield [Alternative.from_pb(alternative)
+                           for alternative in result.alternatives]
+
     def sync_recognize(self, sample, language_code=None,
                        max_alternatives=None, profanity_filter=None,
                        speech_context=None):
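A minimal usage sketch for the generator added above (not part of this commit; the audio file name, sample rate, and ambient credentials are assumptions, and ``Alternative`` is assumed to expose a ``transcript`` property as it does for the synchronous API):

    from google.cloud import speech
    from google.cloud.speech.sample import Sample

    client = speech.Client(use_gax=True)
    with open('hello.flac', 'rb') as stream:
        sample = Sample(content=stream, encoding=speech.Encoding.FLAC,
                        sample_rate=16000)
        # Each yielded item is a list of Alternative objects for one result;
        # only is_final results come back unless interim_results=True is passed.
        for alternatives in client.streaming_recognize(sample):
            for alternative in alternatives:
                print(alternative.transcript)

Because streaming requests are capped at one minute of audio (per the docstring note), the supplied stream should be short-lived.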

packages/google-cloud-speech/unit_tests/test__gax.py

+36
@@ -15,6 +15,35 @@
 import unittest
 
 
+class TestGAPICSpeechAPI(unittest.TestCase):
+    SAMPLE_RATE = 16000
+
+    def _getTargetClass(self):
+        from google.cloud.speech._gax import GAPICSpeechAPI
+
+        return GAPICSpeechAPI
+
+    def _makeOne(self, *args, **kw):
+        return self._getTargetClass()(*args, **kw)
+
+    def test_use_bytes_instead_of_file_like_object(self):
+        from google.cloud import speech
+        from google.cloud.speech.sample import Sample
+
+        credentials = {}
+        client = speech.Client(credentials=credentials, use_gax=True)
+        client.connection = _Connection()
+        client.connection.credentials = credentials
+
+        sample = Sample(content=b'', encoding=speech.Encoding.FLAC,
+                        sample_rate=self.SAMPLE_RATE)
+
+        api = self._makeOne(client)
+        with self.assertRaises(ValueError):
+            api.streaming_recognize(sample)
+        self.assertEqual(client.connection._requested, [])
+
+
 class TestSpeechGAXMakeRequests(unittest.TestCase):
     SAMPLE_RATE = 16000
     HINTS = ['hi']
@@ -137,3 +166,10 @@ def test_stream_requests(self):
         self.assertEqual(streaming_request.audio_content, self.AUDIO_CONTENT)
         self.assertIsInstance(config_request.streaming_config,
                               StreamingRecognitionConfig)
+
+
+class _Connection(object):
+
+    def __init__(self, *responses):
+        self._responses = responses
+        self._requested = []
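The new test covers the bytes-content guard; a companion method on ``TestGAPICSpeechAPI`` could exercise the other ``ValueError`` branch, the closed-stream check. A hypothetical sketch (not part of this commit):

    def test_streaming_with_closed_stream(self):
        import io

        from google.cloud import speech
        from google.cloud.speech.sample import Sample

        credentials = {}
        client = speech.Client(credentials=credentials, use_gax=True)
        client.connection = _Connection()
        client.connection.credentials = credentials

        # A closed file-like object passes the 'closed'-attribute check but
        # fails the second guard, so no request should ever be issued.
        stream = io.BytesIO(b'')
        stream.close()
        sample = Sample(content=stream, encoding=speech.Encoding.FLAC,
                        sample_rate=self.SAMPLE_RATE)

        api = self._makeOne(client)
        with self.assertRaises(ValueError):
            api.streaming_recognize(sample)
        self.assertEqual(client.connection._requested, [])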
