From 8ed152d67b21dba86cf34b7f573cce90c2b492d3 Mon Sep 17 00:00:00 2001
From: Yu-Han Liu
Date: Fri, 14 Jul 2017 16:16:07 -0700
Subject: [PATCH] Speech gapic client library
 [(#1012)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1012)

* Migrate quickstart to GAPIC client library

* Migrate transcribe to GAPIC client library

* Migrate transcribe_async to GAPIC client library

* Migrate transcribe_streaming to GAPIC client library

* clean up

* clean up

* Import from google.cloud.speech

* update transcribe samples

* import in alphabetic order

* remove unused variable

* use strings instead of enums

* restructure code

* comment on streaming requests

* import style

* flake

* correct indent

* migrate transcribe_streaming_mic to gapic

* update google-cloud-speech version requirement

* addressing review comments

* at the end of the audio stream, put None to signal to the generator

* flake

* addressing github review comments

* add region tags for migration guide

* update README

* rst format

* bullet

* addressing PR review comments

* use enums

* remove a word
---
 samples/snippets/README.rst                  |   4 +
 samples/snippets/README.rst.in               |   6 ++
 samples/snippets/quickstart.py               |  22 ++--
 samples/snippets/requirements.txt            |   2 +-
 samples/snippets/transcribe.py               |  45 +++++---
 samples/snippets/transcribe_async.py         |  54 +++++-----
 samples/snippets/transcribe_streaming.py     |  43 +++++---
 samples/snippets/transcribe_streaming_mic.py | 103 ++++++++++++-------
 8 files changed, 181 insertions(+), 98 deletions(-)

diff --git a/samples/snippets/README.rst b/samples/snippets/README.rst
index 70fdd92b..761ebdf6 100644
--- a/samples/snippets/README.rst
+++ b/samples/snippets/README.rst
@@ -5,6 +5,10 @@ Google Cloud Speech API Python Samples
 
 This directory contains samples for Google Cloud Speech API. The `Google Cloud Speech API`_ enables easy integration of Google speech recognition technologies into developer applications. Send audio and receive a text transcription from the Cloud Speech API service.
 
+- See the `migration guide`_ for information about migrating to Python client library v0.27.
+
+.. _migration guide: https://cloud.google.com/speech/docs/python-client-migration
+
 
diff --git a/samples/snippets/README.rst.in b/samples/snippets/README.rst.in
index 70259318..9b671369 100644
--- a/samples/snippets/README.rst.in
+++ b/samples/snippets/README.rst.in
@@ -9,6 +9,12 @@ product:
   recognition technologies into developer applications. Send audio and
   receive a text transcription from the Cloud Speech API service.
 
+
+  - See the `migration guide`_ for information about migrating to Python client library v0.27.
+
+
+  .. _migration guide: https://cloud.google.com/speech/docs/python-client-migration
+
 setup:
 - auth
 - install_deps
diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py
index 81966cf8..388e7ffc 100644
--- a/samples/snippets/quickstart.py
+++ b/samples/snippets/quickstart.py
@@ -21,10 +21,16 @@ def run_quickstart():
     import os
 
     # Imports the Google Cloud client library
+    # [START migration_import]
     from google.cloud import speech
+    from google.cloud.speech import enums
+    from google.cloud.speech import types
+    # [END migration_import]
 
     # Instantiates a client
-    speech_client = speech.Client()
+    # [START migration_client]
+    client = speech.SpeechClient()
+    # [END migration_client]
 
     # The name of the audio file to transcribe
     file_name = os.path.join(
@@ -35,14 +41,16 @@ def run_quickstart():
     # Loads the audio into memory
     with io.open(file_name, 'rb') as audio_file:
         content = audio_file.read()
-        sample = speech_client.sample(
-            content,
-            source_uri=None,
-            encoding='LINEAR16',
-            sample_rate_hertz=16000)
+        audio = types.RecognitionAudio(content=content)
+
+    config = types.RecognitionConfig(
+        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=16000,
+        language_code='en-US')
 
     # Detects speech in the audio file
-    alternatives = sample.recognize('en-US')
+    response = client.recognize(config, audio)
+    alternatives = response.results[0].alternatives
 
     for alternative in alternatives:
         print('Transcript: {}'.format(alternative.transcript))
diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt
index 7e574b31..92970530 100644
--- a/samples/snippets/requirements.txt
+++ b/samples/snippets/requirements.txt
@@ -1 +1 @@
-google-cloud-speech==0.26.0
+google-cloud-speech==0.27.0
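For readers skimming the diff, the pieces of the migrated quickstart assemble into the following minimal flow. This is a sketch only: the calls are exactly the ones added above, but the 16 kHz LINEAR16 file path is an assumption:

    # Sketch assembled from the quickstart diff above; 'audio.raw' is an
    # assumed 16 kHz, LINEAR16-encoded sample file.
    import io

    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    # Read the local file and wrap it in a RecognitionAudio message.
    with io.open('audio.raw', 'rb') as audio_file:
        audio = types.RecognitionAudio(content=audio_file.read())

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')

    response = client.recognize(config, audio)
    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))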
diff --git a/samples/snippets/transcribe.py b/samples/snippets/transcribe.py
index 7c138ec9..6bd74354 100644
--- a/samples/snippets/transcribe.py
+++ b/samples/snippets/transcribe.py
@@ -31,33 +31,50 @@
 def transcribe_file(speech_file):
     """Transcribe the given audio file."""
     from google.cloud import speech
-    speech_client = speech.Client()
+    from google.cloud.speech import enums
+    from google.cloud.speech import types
+    client = speech.SpeechClient()
 
+    # [START migration_sync_request]
+    # [START migration_audio_config_file]
     with io.open(speech_file, 'rb') as audio_file:
         content = audio_file.read()
 
-    audio_sample = speech_client.sample(
-        content=content,
-        source_uri=None,
-        encoding='LINEAR16',
-        sample_rate_hertz=16000)
-
-    alternatives = audio_sample.recognize('en-US')
+    audio = types.RecognitionAudio(content=content)
+    config = types.RecognitionConfig(
+        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=16000,
+        language_code='en-US')
+    # [END migration_audio_config_file]
+
+    # [START migration_sync_response]
+    response = client.recognize(config, audio)
+    # [END migration_sync_request]
+    alternatives = response.results[0].alternatives
+
     for alternative in alternatives:
         print('Transcript: {}'.format(alternative.transcript))
+    # [END migration_sync_response]
 
 
 def transcribe_gcs(gcs_uri):
     """Transcribes the audio file specified by the gcs_uri."""
     from google.cloud import speech
-    speech_client = speech.Client()
+    from google.cloud.speech import enums
+    from google.cloud.speech import types
+    client = speech.SpeechClient()
+
+    # [START migration_audio_config_gcs]
+    audio = types.RecognitionAudio(uri=gcs_uri)
+    config = types.RecognitionConfig(
+        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
+        sample_rate_hertz=16000,
+        language_code='en-US')
+    # [END migration_audio_config_gcs]
 
-    audio_sample = speech_client.sample(
-        content=None,
-        source_uri=gcs_uri,
-        encoding='FLAC',
-        sample_rate_hertz=16000)
+    response = client.recognize(config, audio)
+    alternatives = response.results[0].alternatives
 
-    alternatives = audio_sample.recognize('en-US')
     for alternative in alternatives:
         print('Transcript: {}'.format(alternative.transcript))
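Note that these samples read only `response.results[0]`. A response can carry several results for longer audio (the streaming docstring later in this patch makes the same point), so a slightly more defensive loop looks like this sketch:

    # Sketch: iterate every result rather than only results[0]. The
    # 'response' argument stands for the value returned by
    # client.recognize(config, audio) above.
    def print_all_results(response):
        for i, result in enumerate(response.results):
            # Alternatives are ordered from most to least likely.
            top = result.alternatives[0]
            print('Result {}: {} (confidence {})'.format(
                i, top.transcript, top.confidence))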
diff --git a/samples/snippets/transcribe_async.py b/samples/snippets/transcribe_async.py
index fd0a0340..65215e90 100644
--- a/samples/snippets/transcribe_async.py
+++ b/samples/snippets/transcribe_async.py
@@ -30,63 +30,69 @@
 def transcribe_file(speech_file):
     """Transcribe the given audio file asynchronously."""
     from google.cloud import speech
-    speech_client = speech.Client()
+    from google.cloud.speech import enums
+    from google.cloud.speech import types
+    client = speech.SpeechClient()
 
+    # [START migration_async_request]
     with io.open(speech_file, 'rb') as audio_file:
         content = audio_file.read()
 
-    audio_sample = speech_client.sample(
-        content,
-        source_uri=None,
-        encoding='LINEAR16',
-        sample_rate_hertz=16000)
+    audio = types.RecognitionAudio(content=content)
+    config = types.RecognitionConfig(
+        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=16000,
+        language_code='en-US')
 
-    operation = audio_sample.long_running_recognize('en-US')
+    # [START migration_async_response]
+    operation = client.long_running_recognize(config, audio)
+    # [END migration_async_request]
 
+    # Sleep and poll operation.done()
     retry_count = 100
-    while retry_count > 0 and not operation.complete:
+    while retry_count > 0 and not operation.done():
         retry_count -= 1
         time.sleep(2)
-        operation.poll()
 
-    if not operation.complete:
+    if not operation.done():
         print('Operation not complete and retry limit reached.')
         return
 
-    alternatives = operation.results
+    alternatives = operation.result().results[0].alternatives
     for alternative in alternatives:
         print('Transcript: {}'.format(alternative.transcript))
         print('Confidence: {}'.format(alternative.confidence))
-    # [END send_request]
+    # [END migration_async_response]
 
 
 def transcribe_gcs(gcs_uri):
     """Asynchronously transcribes the audio file specified by the gcs_uri."""
     from google.cloud import speech
-    speech_client = speech.Client()
+    from google.cloud.speech import enums
+    from google.cloud.speech import types
+    client = speech.SpeechClient()
 
-    audio_sample = speech_client.sample(
-        content=None,
-        source_uri=gcs_uri,
-        encoding='FLAC',
-        sample_rate_hertz=16000)
+    audio = types.RecognitionAudio(uri=gcs_uri)
+    config = types.RecognitionConfig(
+        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
+        sample_rate_hertz=16000,
+        language_code='en-US')
 
-    operation = audio_sample.long_running_recognize('en-US')
+    operation = client.long_running_recognize(config, audio)
 
     retry_count = 100
-    while retry_count > 0 and not operation.complete:
+    while retry_count > 0 and not operation.done():
         retry_count -= 1
         time.sleep(2)
-        operation.poll()
 
-    if not operation.complete:
+    if not operation.done():
         print('Operation not complete and retry limit reached.')
         return
 
-    alternatives = operation.results
+    alternatives = operation.result().results[0].alternatives
     for alternative in alternatives:
         print('Transcript: {}'.format(alternative.transcript))
         print('Confidence: {}'.format(alternative.confidence))
-    # [END send_request_gcs]
 
 
 if __name__ == '__main__':
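The sleep-and-poll loop above is deliberately explicit. If the returned operation future follows the `concurrent.futures`-style interface, where `result()` blocks and accepts an optional timeout, the same wait collapses to a single call. This is a sketch under that assumption, with an arbitrary timeout value that is not part of the patch:

    # Sketch: block until the long-running operation finishes instead of
    # polling manually. Assumes operation.result() accepts a timeout in
    # seconds; both the interface and the value 200 are assumptions here.
    result = operation.result(timeout=200)
    for r in result.results:
        print('Transcript: {}'.format(r.alternatives[0].transcript))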
diff --git a/samples/snippets/transcribe_streaming.py b/samples/snippets/transcribe_streaming.py
index 429db791..455a470f 100644
--- a/samples/snippets/transcribe_streaming.py
+++ b/samples/snippets/transcribe_streaming.py
@@ -29,20 +29,39 @@
 def transcribe_streaming(stream_file):
     """Streams transcription of the given audio file."""
     from google.cloud import speech
-    speech_client = speech.Client()
+    from google.cloud.speech import enums
+    from google.cloud.speech import types
+    client = speech.SpeechClient()
 
+    # [START migration_streaming_request]
     with io.open(stream_file, 'rb') as audio_file:
-        audio_sample = speech_client.sample(
-            stream=audio_file,
-            encoding=speech.encoding.Encoding.LINEAR16,
-            sample_rate_hertz=16000)
-        alternatives = audio_sample.streaming_recognize('en-US')
-
-        for alternative in alternatives:
-            print('Finished: {}'.format(alternative.is_final))
-            print('Stability: {}'.format(alternative.stability))
-            print('Confidence: {}'.format(alternative.confidence))
-            print('Transcript: {}'.format(alternative.transcript))
+        content = audio_file.read()
+
+    # In practice, stream should be a generator yielding chunks of audio data.
+    stream = [content]
+    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
+                for chunk in stream)
+
+    config = types.RecognitionConfig(
+        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=16000,
+        language_code='en-US')
+    streaming_config = types.StreamingRecognitionConfig(config=config)
+
+    # streaming_recognize returns a generator.
+    # [START migration_streaming_response]
+    responses = client.streaming_recognize(streaming_config, requests)
+    # [END migration_streaming_request]
+
+    for response in responses:
+        for result in response.results:
+            print('Finished: {}'.format(result.is_final))
+            print('Stability: {}'.format(result.stability))
+            alternatives = result.alternatives
+            for alternative in alternatives:
+                print('Confidence: {}'.format(alternative.confidence))
+                print('Transcript: {}'.format(alternative.transcript))
+    # [END migration_streaming_response]
 
 
 if __name__ == '__main__':
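The sample wraps the whole file in a single-element list for brevity. Per the "stream should be a generator" comment in the diff, a real pipeline would feed fixed-size chunks, roughly like this sketch, where the chunk size is an assumption:

    # Sketch: yield StreamingRecognizeRequest messages chunk by chunk
    # instead of sending the whole file in one request.
    import io

    from google.cloud.speech import types

    def request_stream(stream_file, chunk_size=32 * 1024):
        with io.open(stream_file, 'rb') as audio_file:
            while True:
                chunk = audio_file.read(chunk_size)
                if not chunk:
                    return  # end of file ends the request stream
                yield types.StreamingRecognizeRequest(audio_content=chunk)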
diff --git a/samples/snippets/transcribe_streaming_mic.py b/samples/snippets/transcribe_streaming_mic.py
index 3edd7588..bde8b30f 100644
--- a/samples/snippets/transcribe_streaming_mic.py
+++ b/samples/snippets/transcribe_streaming_mic.py
@@ -32,6 +32,8 @@
 import sys
 
 from google.cloud import speech
+from google.cloud.speech import enums
+from google.cloud.speech import types
 import pyaudio
 from six.moves import queue
 # [END import_libraries]
@@ -41,8 +43,8 @@
 CHUNK = int(RATE / 10)  # 100ms
 
 
-class MicAsFile(object):
-    """Opens a recording stream as a file-like object."""
+class MicrophoneStream(object):
+    """Opens a recording stream as a generator yielding the audio chunks."""
     def __init__(self, rate, chunk):
         self._rate = rate
         self._chunk = chunk
@@ -73,7 +75,8 @@ def __exit__(self, type, value, traceback):
         self._audio_stream.stop_stream()
         self._audio_stream.close()
         self.closed = True
-        # Flush out the read, just in case
+        # Signal the generator to terminate so that the client's
+        # streaming_recognize method will not block the process termination.
         self._buff.put(None)
         self._audio_interface.terminate()
 
@@ -82,31 +85,39 @@ def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
         self._buff.put(in_data)
         return None, pyaudio.paContinue
 
-    def read(self, chunk_size):
-        if self.closed:
-            return
-
-        # Use a blocking get() to ensure there's at least one chunk of data.
-        data = [self._buff.get()]
-
-        # Now consume whatever other data's still buffered.
-        while True:
-            try:
-                data.append(self._buff.get(block=False))
-            except queue.Empty:
-                break
-
-        if self.closed:
-            return
-        return b''.join(data)
+    def generator(self):
+        while not self.closed:
+            # Use a blocking get() to ensure there's at least one chunk of
+            # data, and stop iteration if the chunk is None, indicating the
+            # end of the audio stream.
+            chunk = self._buff.get()
+            if chunk is None:
+                return
+            data = [chunk]
+
+            # Now consume whatever other data's still buffered.
+            while True:
+                try:
+                    chunk = self._buff.get(block=False)
+                    if chunk is None:
+                        return
+                    data.append(chunk)
+                except queue.Empty:
+                    break
+
+            yield b''.join(data)
 # [END audio_stream]
 
 
-def listen_print_loop(results_gen):
+def listen_print_loop(responses):
     """Iterates through server responses and prints them.
 
-    The results_gen passed is a generator that will block until a response
-    is provided by the server. When the transcription response comes, print it.
+    The responses passed is a generator that will block until a response
+    is provided by the server.
+
+    Each response may contain multiple results, and each result may contain
+    multiple alternatives; for details, see https://goo.gl/tjCPAU.  Here we
+    print only the transcription for the top alternative of the top result.
 
     In this case, responses are provided for interim results as well. If the
     response is an interim one, print a line feed at the end of it, to allow
@@ -114,19 +125,24 @@ def listen_print_loop(results_gen):
     final one, print a newline to preserve the finalized transcription.
     """
     num_chars_printed = 0
-    for result in results_gen:
+    for response in responses:
+        if not response.results:
+            continue
+
+        # There could be multiple results in each response.
+        result = response.results[0]
         if not result.alternatives:
             continue
 
-        # Display the top transcription
-        transcript = result.transcript
+        # Display the transcription of the top alternative.
+        transcript = result.alternatives[0].transcript
 
         # Display interim results, but with a carriage return at the end of the
         # line, so subsequent lines will overwrite them.
         #
         # If the previous result was longer than this one, we need to print
         # some extra spaces to overwrite the previous result
-        overwrite_chars = ' ' * max(0, num_chars_printed - len(transcript))
+        overwrite_chars = ' ' * (num_chars_printed - len(transcript))
 
         if not result.is_final:
             sys.stdout.write(transcript + overwrite_chars + '\r')
@@ -147,21 +163,28 @@ def listen_print_loop(responses):
 
 
 def main():
-    speech_client = speech.Client()
-
-    with MicAsFile(RATE, CHUNK) as stream:
-        audio_sample = speech_client.sample(
-            stream=stream,
-            encoding=speech.encoding.Encoding.LINEAR16,
-            sample_rate_hertz=RATE)
-        # See http://g.co/cloud/speech/docs/languages
-        # for a list of supported languages.
-        language_code = 'en-US'  # a BCP-47 language tag
-        results_gen = audio_sample.streaming_recognize(
-            language_code=language_code, interim_results=True)
+    # See http://g.co/cloud/speech/docs/languages
+    # for a list of supported languages.
+    language_code = 'en-US'  # a BCP-47 language tag
+
+    client = speech.SpeechClient()
+    config = types.RecognitionConfig(
+        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=RATE,
+        language_code=language_code)
+    streaming_config = types.StreamingRecognitionConfig(
+        config=config,
+        interim_results=True)
+
+    with MicrophoneStream(RATE, CHUNK) as stream:
+        audio_generator = stream.generator()
+        requests = (types.StreamingRecognizeRequest(audio_content=content)
+                    for content in audio_generator)
+
+        responses = client.streaming_recognize(streaming_config, requests)
 
         # Now, put the transcription responses to use.
-        listen_print_loop(results_gen)
+        listen_print_loop(responses)
 
 
 if __name__ == '__main__':
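The None sentinel described in the commit message ("put None to signal to the generator") is what lets streaming_recognize unblock at shutdown: __exit__ enqueues None, and generator() stops yielding when it sees it, which ends the request iterator. The pattern in isolation, as a runnable sketch independent of PyAudio:

    # Sketch of the queue/sentinel pattern MicrophoneStream.generator uses:
    # the producer enqueues audio blocks and finally None; the consumer
    # yields blocks until it sees the sentinel, ending the request iterator.
    from six.moves import queue

    buff = queue.Queue()
    for block in (b'chunk-1', b'chunk-2', None):  # None marks end of audio
        buff.put(block)

    def audio_chunks(buff):
        while True:
            chunk = buff.get()
            if chunk is None:  # sentinel: stop iteration cleanly
                return
            yield chunk

    print(list(audio_chunks(buff)))  # [b'chunk-1', b'chunk-2']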