 and recognition metadata.

 Example usage:
-    python beta_snippets.py enhanced-model resources/commercial_mono.wav
-    python beta_snippets.py metadata resources/commercial_mono.wav
-    python beta_snippets.py punctuation resources/commercial_mono.wav
-    python beta_snippets.py diarization resources/commercial_mono.wav
-    python beta_snippets.py multi-channel resources/commercial_mono.wav
-    python beta_snippets.py multi-language resources/multi.wav en-US es
-    python beta_snippets.py word-level-conf resources/commercial_mono.wav
+    python beta_snippets.py enhanced-model
+    python beta_snippets.py metadata
+    python beta_snippets.py punctuation
+    python beta_snippets.py diarization
+    python beta_snippets.py multi-channel
+    python beta_snippets.py multi-language
+    python beta_snippets.py word-level-conf
 """

 import argparse
 import io


-def transcribe_file_with_enhanced_model(speech_file):
+def transcribe_file_with_enhanced_model():
     """Transcribe the given audio file using an enhanced model."""
     # [START speech_transcribe_file_with_enhanced_model]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

-    # TODO(developer): Uncomment and set to a path to your audio file.
-    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/commercial_mono.wav'

     with io.open(speech_file, 'rb') as audio_file:
         content = audio_file.read()
@@ -64,14 +63,13 @@ def transcribe_file_with_enhanced_model(speech_file):
     # [END speech_transcribe_file_with_enhanced_model]


-def transcribe_file_with_metadata(speech_file):
+def transcribe_file_with_metadata():
     """Send a request that includes recognition metadata."""
     # [START speech_transcribe_file_with_metadata]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

-    # TODO(developer): Uncomment and set to a path to your audio file.
-    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/commercial_mono.wav'

     with io.open(speech_file, 'rb') as audio_file:
         content = audio_file.read()
@@ -110,14 +108,13 @@ def transcribe_file_with_metadata(speech_file):
     # [END speech_transcribe_file_with_metadata]


-def transcribe_file_with_auto_punctuation(speech_file):
+def transcribe_file_with_auto_punctuation():
     """Transcribe the given audio file with auto punctuation enabled."""
     # [START speech_transcribe_file_with_auto_punctuation]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

-    # TODO(developer): Uncomment and set to a path to your audio file.
-    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/commercial_mono.wav'

     with io.open(speech_file, 'rb') as audio_file:
         content = audio_file.read()
@@ -140,14 +137,13 @@ def transcribe_file_with_auto_punctuation(speech_file):
     # [END speech_transcribe_file_with_auto_punctuation]


-def transcribe_file_with_diarization(speech_file):
+def transcribe_file_with_diarization():
     """Transcribe the given audio file synchronously with diarization."""
     # [START speech_transcribe_diarization]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

-    # TODO(developer): Uncomment and set to a path to your audio file.
-    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/commercial_mono.wav'

     with open(speech_file, 'rb') as audio_file:
         content = audio_file.read()
@@ -156,33 +152,37 @@ def transcribe_file_with_diarization(speech_file):

     config = speech.types.RecognitionConfig(
         encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
-        sample_rate_hertz=16000,
+        sample_rate_hertz=8000,
         language_code='en-US',
         enable_speaker_diarization=True,
         diarization_speaker_count=2)

     print('Waiting for operation to complete...')
     response = client.recognize(config, audio)

-    for i, result in enumerate(response.results):
-        alternative = result.alternatives[0]
-        print('-' * 20)
-        print('First alternative of result {}: {}'
-              .format(i, alternative.transcript))
-        print('Speaker Tag for the first word: {}'
-              .format(alternative.words[0].speaker_tag))
+    # The transcript within each result is separate and sequential per result.
+    # However, the words list within an alternative includes all the words
+    # from all the results thus far. Thus, to get all the words with speaker
+    # tags, you only have to take the words list from the last result:
+    result = response.results[-1]
+
+    words_info = result.alternatives[0].words
+
+    # Printing out the output:
+    for word_info in words_info:
+        print("word: '{}', speaker_tag: {}".format(word_info.word,
                                                    word_info.speaker_tag))
     # [END speech_transcribe_diarization]


-def transcribe_file_with_multichannel(speech_file):
+def transcribe_file_with_multichannel():
     """Transcribe the given audio file synchronously with
     multi channel."""
     # [START speech_transcribe_multichannel]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

-    # TODO(developer): Uncomment and set to a path to your audio file.
-    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/Google_Gnome.wav'

     with open(speech_file, 'rb') as audio_file:
         content = audio_file.read()
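A note on the diarization hunk above: because the words list on the last result carries a speaker_tag for every word recognized so far, a caller can rebuild per-speaker turns from that single list. The helper below is only an illustrative sketch and is not part of this change; WordInfo is a hypothetical stand-in for the word objects the API returns, carrying just the two fields used here.

    from collections import namedtuple

    # Hypothetical stand-in for the objects in result.alternatives[0].words.
    WordInfo = namedtuple('WordInfo', ['word', 'speaker_tag'])


    def group_words_by_speaker(words_info):
        """Group consecutive words sharing a speaker_tag into (tag, text) turns."""
        turns = []
        for word_info in words_info:
            if turns and turns[-1][0] == word_info.speaker_tag:
                # Same speaker as the previous word: extend the current turn.
                turns[-1][1].append(word_info.word)
            else:
                # Speaker changed: start a new turn.
                turns.append((word_info.speaker_tag, [word_info.word]))
        return [(tag, ' '.join(words)) for tag, words in turns]


    # Example with made-up words and tags:
    words_info = [WordInfo('hi', 1), WordInfo('there', 1),
                  WordInfo('hello', 2), WordInfo('thanks', 1)]
    for tag, text in group_words_by_speaker(words_info):
        print('speaker {}: {}'.format(tag, text))
    # speaker 1: hi there
    # speaker 2: hello
    # speaker 1: thanks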
@@ -207,17 +207,16 @@ def transcribe_file_with_multichannel(speech_file):
     # [END speech_transcribe_multichannel]


-def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
+def transcribe_file_with_multilanguage():
     """Transcribe the given audio file synchronously with
     multi language."""
     # [START speech_transcribe_multilanguage]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

-    # TODO(developer): Uncomment and set to a path to your audio file.
-    # speech_file = 'path/to/file.wav'
-    # first_lang = first language code, e,g, 'en-US'
-    # second_lang = first language code, e,g, 'es'
+    speech_file = 'resources/multi.wav'
+    first_lang = 'en-US'
+    second_lang = 'es'

     with open(speech_file, 'rb') as audio_file:
         content = audio_file.read()
@@ -226,6 +225,7 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):

     config = speech.types.RecognitionConfig(
         encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=44100,
         audio_channel_count=2,
         language_code=first_lang,
         alternative_language_codes=[second_lang])
@@ -241,15 +241,14 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
     # [END speech_transcribe_multilanguage]


-def transcribe_file_with_word_level_confidence(speech_file):
+def transcribe_file_with_word_level_confidence():
    """Transcribe the given audio file synchronously with
     word level confidence."""
     # [START speech_transcribe_word_level_confidence]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

-    # TODO(developer): Uncomment and set to a path to your audio file.
-    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/Google_Gnome.wav'

     with open(speech_file, 'rb') as audio_file:
         content = audio_file.read()
@@ -279,28 +278,20 @@ def transcribe_file_with_word_level_confidence(speech_file):
         description=__doc__,
         formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument('command')
-    parser.add_argument(
-        'path', help='File for audio file to be recognized')
-    parser.add_argument(
-        'first', help='First language in audio file to be recognized',
-        nargs='?')
-    parser.add_argument(
-        'second', help='Second language in audio file to be recognized',
-        nargs='?')

     args = parser.parse_args()

     if args.command == 'enhanced-model':
-        transcribe_file_with_enhanced_model(args.path)
+        transcribe_file_with_enhanced_model()
     elif args.command == 'metadata':
-        transcribe_file_with_metadata(args.path)
+        transcribe_file_with_metadata()
     elif args.command == 'punctuation':
-        transcribe_file_with_auto_punctuation(args.path)
+        transcribe_file_with_auto_punctuation()
     elif args.command == 'diarization':
-        transcribe_file_with_diarization(args.path)
+        transcribe_file_with_diarization()
     elif args.command == 'multi-channel':
-        transcribe_file_with_multichannel(args.path)
+        transcribe_file_with_multichannel()
     elif args.command == 'multi-language':
-        transcribe_file_with_multilanguage(args.path, args.first, args.second)
+        transcribe_file_with_multilanguage()
     elif args.command == 'word-level-conf':
-        transcribe_file_with_word_level_confidence(args.path)
+        transcribe_file_with_word_level_confidence()
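Since every subcommand now takes no arguments, the if/elif chain above could equivalently be written as a lookup table. This is only a sketch of the same dispatch under that assumption, reusing the parser, args, and functions defined in the file; it is not part of the commit.

    # Sketch only: equivalent dispatch for the argument-less subcommands above.
    commands = {
        'enhanced-model': transcribe_file_with_enhanced_model,
        'metadata': transcribe_file_with_metadata,
        'punctuation': transcribe_file_with_auto_punctuation,
        'diarization': transcribe_file_with_diarization,
        'multi-channel': transcribe_file_with_multichannel,
        'multi-language': transcribe_file_with_multilanguage,
        'word-level-conf': transcribe_file_with_word_level_confidence,
    }

    handler = commands.get(args.command)
    if handler is not None:
        handler()
    else:
        parser.error('unknown command: {}'.format(args.command))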