1919import com .google .api .gax .longrunning .OperationFuture ;
2020import com .google .api .gax .rpc .ApiStreamObserver ;
2121import com .google .api .gax .rpc .BidiStreamingCallable ;
22+ import com .google .api .gax .rpc .ClientStream ;
23+ import com .google .api .gax .rpc .ResponseObserver ;
24+ import com .google .api .gax .rpc .StreamController ;
2225import com .google .cloud .speech .v1p1beta1 .LongRunningRecognizeMetadata ;
2326import com .google .cloud .speech .v1p1beta1 .LongRunningRecognizeResponse ;
2427import com .google .cloud .speech .v1p1beta1 .RecognitionAudio ;
4750import java .util .ArrayList ;
4851import java .util .List ;
4952
53+ import javax .sound .sampled .AudioFormat ;
54+ import javax .sound .sampled .AudioInputStream ;
55+ import javax .sound .sampled .AudioSystem ;
56+ import javax .sound .sampled .DataLine ;
57+ import javax .sound .sampled .DataLine .Info ;
58+ import javax .sound .sampled .TargetDataLine ;
59+
5060public class Recognize {
5161
5262 /** Run speech recognition tasks. */
@@ -56,9 +66,10 @@ public static void main(String... args) throws Exception {
5666 System .out .printf (
5767 "\t java %s \" <command>\" \" <path-to-image>\" \n "
5868 + "Commands:\n "
59- + "\t syncrecognize | asyncrecognize | streamrecognize | wordoffsets\n "
60- + "\t | model-selection | auto-punctuation | stream-punctuation | enhanced-model\n "
61- + "\t | metadata | diarization | multi-channel | multi-language | word-level-conf"
69+ + "\t syncrecognize | asyncrecognize | streamrecognize | micstreamrecognize \n "
70+ + "\t | wordoffsets | model-selection | auto-punctuation | stream-punctuation \n "
71+ + "\t | enhanced-model| metadata | diarization | multi-channel | multi-language \n "
72+ + "\t | word-level-conf"
6273 + "Path:\n \t A file path (ex: ./resources/audio.raw) or a URI "
6374 + "for a Cloud Storage resource (gs://...)\n " ,
6475 Recognize .class .getCanonicalName ());
@@ -88,6 +99,8 @@ public static void main(String... args) throws Exception {
8899 }
89100 } else if (command .equals ("streamrecognize" )) {
90101 streamingRecognizeFile (path );
102+ } else if (command .equals ("micstreamrecognize" )) {
103+ streamingMicRecognize ();
91104 } else if (command .equals ("model-selection" )) {
92105 if (path .startsWith ("gs://" )) {
93106 transcribeModelSelectionGcs (path );
@@ -704,6 +717,97 @@ public SettableFuture<List<T>> future() {
704717 }
705718 // [END speech_stream_recognize_punctuation]
706719
720+ // [START speech_streaming_mic_recognize]
721+ /** Performs microphone streaming speech recognition with a duration of 1 minute. */
722+ public static void streamingMicRecognize () throws Exception {
723+
724+ ResponseObserver <StreamingRecognizeResponse > responseObserver = null ;
725+ try (SpeechClient client = SpeechClient .create ()) {
726+
727+ responseObserver =
728+ new ResponseObserver <StreamingRecognizeResponse >() {
729+ ArrayList <StreamingRecognizeResponse > responses = new ArrayList <>();
730+
731+ public void onStart (StreamController controller ) {}
732+
733+ public void onResponse (StreamingRecognizeResponse response ) {
734+ responses .add (response );
735+ }
736+
737+ public void onComplete () {
738+ for (StreamingRecognizeResponse response : responses ) {
739+ StreamingRecognitionResult result = response .getResultsList ().get (0 );
740+ SpeechRecognitionAlternative alternative = result .getAlternativesList ().get (0 );
741+ System .out .printf ("Transcript : %s\n " , alternative .getTranscript ());
742+ }
743+ }
744+
745+ public void onError (Throwable t ) {
746+ System .out .println (t );
747+ }
748+ };
749+
750+ ClientStream <StreamingRecognizeRequest > clientStream =
751+ client .streamingRecognizeCallable ().splitCall (responseObserver );
752+
753+ RecognitionConfig recognitionConfig =
754+ RecognitionConfig .newBuilder ()
755+ .setEncoding (RecognitionConfig .AudioEncoding .LINEAR16 )
756+ .setLanguageCode ("en-US" )
757+ .setSampleRateHertz (16000 )
758+ .build ();
759+ StreamingRecognitionConfig streamingRecognitionConfig =
760+ StreamingRecognitionConfig .newBuilder ().setConfig (recognitionConfig ).build ();
761+
762+ StreamingRecognizeRequest request =
763+ StreamingRecognizeRequest .newBuilder ()
764+ .setStreamingConfig (streamingRecognitionConfig )
765+ .build (); // The first request in a streaming call has to be a config
766+
767+ clientStream .send (request );
768+ // SampleRate:16000Hz, SampleSizeInBits: 16, Number of channels: 1, Signed: true,
769+ // bigEndian: false
770+ AudioFormat audioFormat = new AudioFormat (16000 , 16 , 1 , true , false );
771+ DataLine .Info targetInfo =
772+ new Info (
773+ TargetDataLine .class ,
774+ audioFormat ); // Set the system information to read from the microphone audio stream
775+
776+ if (!AudioSystem .isLineSupported (targetInfo )) {
777+ System .out .println ("Microphone not supported" );
778+ System .exit (0 );
779+ }
780+ // Target data line captures the audio stream the microphone produces.
781+ TargetDataLine targetDataLine = (TargetDataLine ) AudioSystem .getLine (targetInfo );
782+ targetDataLine .open (audioFormat );
783+ targetDataLine .start ();
784+ System .out .println ("Start speaking" );
785+ long startTime = System .currentTimeMillis ();
786+ // Audio Input Stream
787+ AudioInputStream audio = new AudioInputStream (targetDataLine );
788+ while (true ) {
789+ long estimatedTime = System .currentTimeMillis () - startTime ;
790+ byte [] data = new byte [6400 ];
791+ audio .read (data );
792+ if (estimatedTime > 60000 ) { // 60 seconds
793+ System .out .println ("Stop speaking." );
794+ targetDataLine .stop ();
795+ targetDataLine .close ();
796+ break ;
797+ }
798+ request =
799+ StreamingRecognizeRequest .newBuilder ()
800+ .setAudioContent (ByteString .copyFrom (data ))
801+ .build ();
802+ clientStream .send (request );
803+ }
804+ } catch (Exception e ) {
805+ System .out .println (e );
806+ }
807+ responseObserver .onComplete ();
808+ }
809+ // [END speech_streaming_mic_recognize]
810+
707811 // [START speech_transcribe_file_with_enhanced_model]
708812 /**
709813 * Transcribe the given audio file using an enhanced model.
@@ -833,8 +937,9 @@ public static void transcribeDiarization(String fileName) throws Exception {
833937 SpeechRecognitionAlternative alternative = result .getAlternatives (0 );
834938 System .out .format ("Transcript : %s\n " , alternative .getTranscript ());
835939 // The words array contains the entire transcript up until that point.
836- //Referencing the last spoken word to get the associated Speaker tag
837- System .out .format ("Speaker Tag %s: %s\n " ,
940+ // Referencing the last spoken word to get the associated Speaker tag
941+ System .out .format (
942+ "Speaker Tag %s: %s\n " ,
838943 alternative .getWords ((alternative .getWordsCount () - 1 )).getSpeakerTag (),
839944 alternative .getTranscript ());
840945 }
@@ -877,8 +982,9 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
877982 // use the first (most likely) one here.
878983 SpeechRecognitionAlternative alternative = result .getAlternatives (0 );
879984 // The words array contains the entire transcript up until that point.
880- //Referencing the last spoken word to get the associated Speaker tag
881- System .out .format ("Speaker Tag %s:%s\n " ,
985+ // Referencing the last spoken word to get the associated Speaker tag
986+ System .out .format (
987+ "Speaker Tag %s:%s\n " ,
882988 alternative .getWords ((alternative .getWordsCount () - 1 )).getSpeakerTag (),
883989 alternative .getTranscript ());
884990 }
0 commit comments