2727import com .google .cloud .speech .v1p1beta1 .RecognitionMetadata .MicrophoneDistance ;
2828import com .google .cloud .speech .v1p1beta1 .RecognitionMetadata .RecordingDeviceType ;
2929import com .google .cloud .speech .v1p1beta1 .RecognizeResponse ;
30+ import com .google .cloud .speech .v1p1beta1 .SpeakerDiarizationConfig ;
3031import com .google .cloud .speech .v1p1beta1 .SpeechClient ;
32+
3133import com .google .cloud .speech .v1p1beta1 .SpeechRecognitionAlternative ;
3234import com .google .cloud .speech .v1p1beta1 .SpeechRecognitionResult ;
35+ import com .google .cloud .speech .v1p1beta1 .WordInfo ;
3336import com .google .protobuf .ByteString ;
3437
3538import java .nio .file .Files ;
3639import java .nio .file .Path ;
3740import java .nio .file .Paths ;
3841import java .util .ArrayList ;
39- import java .util .List ;
4042
4143public class Recognize {
4244
@@ -154,32 +156,52 @@ public static void transcribeDiarization(String fileName) throws Exception {
154156 RecognitionAudio recognitionAudio =
155157 RecognitionAudio .newBuilder ().setContent (ByteString .copyFrom (content )).build ();
156158
159+ SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig .newBuilder ()
160+ .setEnableSpeakerDiarization (true )
161+ .setMinSpeakerCount (2 )
162+ .setMaxSpeakerCount (2 )
163+ .build ();
164+
157165 // Configure request to enable Speaker diarization
158- RecognitionConfig config =
159- RecognitionConfig .newBuilder ()
166+ RecognitionConfig config = RecognitionConfig .newBuilder ()
160167 .setEncoding (AudioEncoding .LINEAR16 )
161168 .setLanguageCode ("en-US" )
162169 .setSampleRateHertz (8000 )
163- .setEnableSpeakerDiarization (true )
164- .setDiarizationSpeakerCount (2 )
170+ .setDiarizationConfig (speakerDiarizationConfig )
165171 .build ();
166172
167173 // Perform the transcription request
168174 RecognizeResponse recognizeResponse = speechClient .recognize (config , recognitionAudio );
169175
170- // Print out the results
171- for (SpeechRecognitionResult result : recognizeResponse .getResultsList ()) {
172- // There can be several alternative transcripts for a given chunk of speech. Just
173- // use the first (most likely) one here.
174- SpeechRecognitionAlternative alternative = result .getAlternatives (0 );
175- System .out .format ("Transcript : %s\n " , alternative .getTranscript ());
176- // The words array contains the entire transcript up until that point.
177- // Referencing the last spoken word to get the associated Speaker tag
178- System .out .format (
179- "Speaker Tag %s: %s\n " ,
180- alternative .getWords ((alternative .getWordsCount () - 1 )).getSpeakerTag (),
181- alternative .getTranscript ());
176+ // Speaker Tags are only included in the last result object, which has only one alternative.
177+ SpeechRecognitionAlternative alternative =
178+ recognizeResponse .getResults (
179+ recognizeResponse .getResultsCount () - 1 ).getAlternatives (0 );
180+
181+ // The alternative is made up of WordInfo objects that contain the speaker_tag.
182+ WordInfo wordInfo = alternative .getWords (0 );
183+ int currentSpeakerTag = wordInfo .getSpeakerTag ();
184+
185+ // Walk the words in order, appending each one to the current speaker's line. When the
186+ // speaker tag changes, start a new line labeled with the new speaker and continue there.
187+ StringBuilder speakerWords = new StringBuilder (
188+ String .format ("Speaker %d: %s" , wordInfo .getSpeakerTag (), wordInfo .getWord ()));
189+
190+ for (int i = 1 ; i < alternative .getWordsCount (); i ++) {
191+ wordInfo = alternative .getWords (i );
192+ if (currentSpeakerTag == wordInfo .getSpeakerTag ()) {
193+ speakerWords .append (" " );
194+ speakerWords .append (wordInfo .getWord ());
195+ } else {
196+ speakerWords .append (
197+ String .format ("\n Speaker %d: %s" ,
198+ wordInfo .getSpeakerTag (),
199+ wordInfo .getWord ()));
200+ currentSpeakerTag = wordInfo .getSpeakerTag ();
201+ }
182202 }
203+
204+ System .out .println (speakerWords .toString ());
183205 }
184206 }
185207 // [END speech_transcribe_diarization_beta]
@@ -192,14 +214,19 @@ public static void transcribeDiarization(String fileName) throws Exception {
192214 */
193215 public static void transcribeDiarizationGcs (String gcsUri ) throws Exception {
194216 try (SpeechClient speechClient = SpeechClient .create ()) {
217+ SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig .newBuilder ()
218+ .setEnableSpeakerDiarization (true )
219+ .setMinSpeakerCount (2 )
220+ .setMaxSpeakerCount (2 )
221+ .build ();
222+
195223 // Configure request to enable Speaker diarization
196224 RecognitionConfig config =
197225 RecognitionConfig .newBuilder ()
198226 .setEncoding (AudioEncoding .LINEAR16 )
199227 .setLanguageCode ("en-US" )
200228 .setSampleRateHertz (8000 )
201- .setEnableSpeakerDiarization (true )
202- .setDiarizationSpeakerCount (2 )
229+ .setDiarizationConfig (speakerDiarizationConfig )
203230 .build ();
204231
205232 // Set the remote path for the audio file
@@ -214,17 +241,37 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
214241 Thread .sleep (10000 );
215242 }
216243
217- for (SpeechRecognitionResult result : response .get ().getResultsList ()) {
218- // There can be several alternative transcripts for a given chunk of speech. Just
219- // use the first (most likely) one here.
220- SpeechRecognitionAlternative alternative = result .getAlternatives (0 );
221- // The words array contains the entire transcript up until that point.
222- // Referencing the last spoken word to get the associated Speaker tag
223- System .out .format (
224- "Speaker Tag %s:%s\n " ,
225- alternative .getWords ((alternative .getWordsCount () - 1 )).getSpeakerTag (),
226- alternative .getTranscript ());
244+ // Speaker Tags are only included in the last result object, which has only one alternative.
245+ LongRunningRecognizeResponse longRunningRecognizeResponse = response .get ();
246+ SpeechRecognitionAlternative alternative =
247+ longRunningRecognizeResponse .getResults (
248+ longRunningRecognizeResponse .getResultsCount () - 1 )
249+ .getAlternatives (0 );
250+
251+ // The alternative is made up of WordInfo objects that contain the speaker_tag.
252+ WordInfo wordInfo = alternative .getWords (0 );
253+ int currentSpeakerTag = wordInfo .getSpeakerTag ();
254+
255+ // Walk the words in order, appending each one to the current speaker's line. When the
256+ // speaker tag changes, start a new line labeled with the new speaker and continue there.
257+ StringBuilder speakerWords = new StringBuilder (
258+ String .format ("Speaker %d: %s" , wordInfo .getSpeakerTag (), wordInfo .getWord ()));
259+
260+ for (int i = 1 ; i < alternative .getWordsCount (); i ++) {
261+ wordInfo = alternative .getWords (i );
262+ if (currentSpeakerTag == wordInfo .getSpeakerTag ()) {
263+ speakerWords .append (" " );
264+ speakerWords .append (wordInfo .getWord ());
265+ } else {
266+ speakerWords .append (
267+ String .format ("\n Speaker %d: %s" ,
268+ wordInfo .getSpeakerTag (),
269+ wordInfo .getWord ()));
270+ currentSpeakerTag = wordInfo .getSpeakerTag ();
271+ }
227272 }
273+
274+ System .out .println (speakerWords .toString ());
228275 }
229276 }
230277 // [END speech_transcribe_diarization_gcs_beta]
@@ -454,7 +501,7 @@ public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Except
454501 RecognitionConfig config =
455502 RecognitionConfig .newBuilder ()
456503 .setEncoding (AudioEncoding .FLAC )
457- .setSampleRateHertz (16000 )
504+ .setSampleRateHertz (44100 )
458505 .setLanguageCode ("en-US" )
459506 .setEnableWordConfidence (true )
460507 .build ();
0 commit comments