Skip to content

Commit

Permalink
Diarization Output Modified (#1586)
Browse files Browse the repository at this point in the history
* Printing the last paragraph only.

* Python3 print

* Removing sample rate setting

* Adding the missing output parameter in the example

* Changes based on the comments

* Removed filenames as input parameters

* Removed unused args

* Updated README file

* Updated the inline comment

* Modified code to make it more readable

* Simplified the response object processing.

* Fixing the long line issue.
  • Loading branch information
happyhuman authored Jul 20, 2018
1 parent 809b232 commit c310941
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 80 deletions.
19 changes: 8 additions & 11 deletions speech/cloud-client/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -221,25 +221,22 @@ To run this sample:
$ python beta_snippets.py
usage: beta_snippets.py [-h] command path first second
usage: beta_snippets.py [-h] command
Google Cloud Speech API sample that demonstrates enhanced models
and recognition metadata.
Example usage:
python beta_snippets.py enhanced-model resources/commercial_mono.wav
python beta_snippets.py metadata resources/commercial_mono.wav
python beta_snippets.py punctuation resources/commercial_mono.wav
python beta_snippets.py diarization resources/commercial_mono.wav
python beta_snippets.py multi-channel resources/commercial_mono.wav
python beta_snippets.py multi-language resources/multi.wav en-US es
python beta_snippets.py word-level-conf resources/commercial_mono.wav
python beta_snippets.py enhanced-model
python beta_snippets.py metadata
python beta_snippets.py punctuation
python beta_snippets.py diarization
python beta_snippets.py multi-channel
python beta_snippets.py multi-language
python beta_snippets.py word-level-conf
positional arguments:
command
path File for audio file to be recognized
first First language in audio file to be recognized
second Second language in audio file to be recognized
optional arguments:
-h, --help show this help message and exit
Expand Down
97 changes: 44 additions & 53 deletions speech/cloud-client/beta_snippets.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,26 @@
and recognition metadata.
Example usage:
python beta_snippets.py enhanced-model resources/commercial_mono.wav
python beta_snippets.py metadata resources/commercial_mono.wav
python beta_snippets.py punctuation resources/commercial_mono.wav
python beta_snippets.py diarization resources/commercial_mono.wav
python beta_snippets.py multi-channel resources/commercial_mono.wav
python beta_snippets.py multi-language resources/multi.wav en-US es
python beta_snippets.py word-level-conf resources/commercial_mono.wav
python beta_snippets.py enhanced-model
python beta_snippets.py metadata
python beta_snippets.py punctuation
python beta_snippets.py diarization
python beta_snippets.py multi-channel
python beta_snippets.py multi-language
python beta_snippets.py word-level-conf
"""

import argparse
import io


def transcribe_file_with_enhanced_model(speech_file):
def transcribe_file_with_enhanced_model():
"""Transcribe the given audio file using an enhanced model."""
# [START speech_transcribe_file_with_enhanced_model]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with io.open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -64,14 +63,13 @@ def transcribe_file_with_enhanced_model(speech_file):
# [END speech_transcribe_file_with_enhanced_model]


def transcribe_file_with_metadata(speech_file):
def transcribe_file_with_metadata():
"""Send a request that includes recognition metadata."""
# [START speech_transcribe_file_with_metadata]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with io.open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand Down Expand Up @@ -110,14 +108,13 @@ def transcribe_file_with_metadata(speech_file):
# [END speech_transcribe_file_with_metadata]


def transcribe_file_with_auto_punctuation(speech_file):
def transcribe_file_with_auto_punctuation():
"""Transcribe the given audio file with auto punctuation enabled."""
# [START speech_transcribe_file_with_auto_punctuation]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with io.open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -140,14 +137,13 @@ def transcribe_file_with_auto_punctuation(speech_file):
# [END speech_transcribe_file_with_auto_punctuation]


def transcribe_file_with_diarization(speech_file):
def transcribe_file_with_diarization():
"""Transcribe the given audio file synchronously with diarization."""
# [START speech_transcribe_diarization]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -156,33 +152,37 @@ def transcribe_file_with_diarization(speech_file):

config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
sample_rate_hertz=8000,
language_code='en-US',
enable_speaker_diarization=True,
diarization_speaker_count=2)

print('Waiting for operation to complete...')
response = client.recognize(config, audio)

for i, result in enumerate(response.results):
alternative = result.alternatives[0]
print('-' * 20)
print('First alternative of result {}: {}'
.format(i, alternative.transcript))
print('Speaker Tag for the first word: {}'
.format(alternative.words[0].speaker_tag))
# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]

words_info = result.alternatives[0].words

# Printing out the output:
for word_info in words_info:
print("word: '{}', speaker_tag: {}".format(word_info.word,
word_info.speaker_tag))
# [END speech_transcribe_diarization]


def transcribe_file_with_multichannel(speech_file):
def transcribe_file_with_multichannel():
"""Transcribe the given audio file synchronously with
multi channel."""
# [START speech_transcribe_multichannel]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/Google_Gnome.wav'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -207,17 +207,16 @@ def transcribe_file_with_multichannel(speech_file):
# [END speech_transcribe_multichannel]


def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
def transcribe_file_with_multilanguage():
"""Transcribe the given audio file synchronously with
multi language."""
# [START speech_transcribe_multilanguage]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
# first_lang = first language code, e.g. 'en-US'
# second_lang = second language code, e.g. 'es'
speech_file = 'resources/multi.wav'
first_lang = 'en-US'
second_lang = 'es'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -226,6 +225,7 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):

config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=44100,
audio_channel_count=2,
language_code=first_lang,
alternative_language_codes=[second_lang])
Expand All @@ -241,15 +241,14 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
# [END speech_transcribe_multilanguage]


def transcribe_file_with_word_level_confidence(speech_file):
def transcribe_file_with_word_level_confidence():
"""Transcribe the given audio file synchronously with
word level confidence."""
# [START speech_transcribe_word_level_confidence]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/Google_Gnome.wav'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand Down Expand Up @@ -279,28 +278,20 @@ def transcribe_file_with_word_level_confidence(speech_file):
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('command')
parser.add_argument(
'path', help='File for audio file to be recognized')
parser.add_argument(
'first', help='First language in audio file to be recognized',
nargs='?')
parser.add_argument(
'second', help='Second language in audio file to be recognized',
nargs='?')

args = parser.parse_args()

if args.command == 'enhanced-model':
transcribe_file_with_enhanced_model(args.path)
transcribe_file_with_enhanced_model()
elif args.command == 'metadata':
transcribe_file_with_metadata(args.path)
transcribe_file_with_metadata()
elif args.command == 'punctuation':
transcribe_file_with_auto_punctuation(args.path)
transcribe_file_with_auto_punctuation()
elif args.command == 'diarization':
transcribe_file_with_diarization(args.path)
transcribe_file_with_diarization()
elif args.command == 'multi-channel':
transcribe_file_with_multichannel(args.path)
transcribe_file_with_multichannel()
elif args.command == 'multi-language':
transcribe_file_with_multilanguage(args.path, args.first, args.second)
transcribe_file_with_multilanguage()
elif args.command == 'word-level-conf':
transcribe_file_with_word_level_confidence(args.path)
transcribe_file_with_word_level_confidence()
23 changes: 8 additions & 15 deletions speech/cloud-client/beta_snippets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,56 +26,49 @@


def test_transcribe_file_with_enhanced_model(capsys):
transcribe_file_with_enhanced_model(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_enhanced_model()
out, _ = capsys.readouterr()

assert 'Chrome' in out


def test_transcribe_file_with_metadata(capsys):
transcribe_file_with_metadata(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_metadata()
out, _ = capsys.readouterr()

assert 'Chrome' in out


def test_transcribe_file_with_auto_punctuation(capsys):
transcribe_file_with_auto_punctuation(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_auto_punctuation()
out, _ = capsys.readouterr()

assert 'Okay. Sure.' in out


def test_transcribe_diarization(capsys):
transcribe_file_with_diarization(
os.path.join(RESOURCES, 'Google_Gnome.wav'))
transcribe_file_with_diarization()
out, err = capsys.readouterr()

assert 'OK Google stream stranger things from Netflix to my TV' in out
assert "word: 'here', speaker_tag: 1" in out


def test_transcribe_multichannel_file(capsys):
transcribe_file_with_multichannel(
os.path.join(RESOURCES, 'Google_Gnome.wav'))
transcribe_file_with_multichannel()
out, err = capsys.readouterr()

assert 'OK Google stream stranger things from Netflix to my TV' in out


def test_transcribe_multilanguage_file(capsys):
transcribe_file_with_multilanguage(
os.path.join(RESOURCES, 'multi.wav'), 'en-US', 'es')
transcribe_file_with_multilanguage()
out, err = capsys.readouterr()

assert 'how are you doing estoy bien e tu' in out


def test_transcribe_word_level_confidence(capsys):
transcribe_file_with_word_level_confidence(
os.path.join(RESOURCES, 'Google_Gnome.wav'))
transcribe_file_with_word_level_confidence()
out, err = capsys.readouterr()

assert 'OK Google stream stranger things from Netflix to my TV' in out
2 changes: 1 addition & 1 deletion texttospeech/cloud-client/audio_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
Example usage:
python audio_profile.py --text "hello" --effects_profile_id
"telephony-class-application"
"telephony-class-application" --output "output.mp3"
"""

import argparse
Expand Down

0 comments on commit c310941

Please sign in to comment.