GoogleCloudPlatform · piotrgregor · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024
@@ -31,6 +31,9 @@ ParseResult ParseArguments(int argc, char* argv[]) {
       ("bitrate", po::value<int>()->default_value(16000),
        "the sample rate in Hz")
       //
+      ("ptime", po::value<int>()->default_value(0),
+       "(optional) packetization time in milliseconds")
+      //
       ("language-code", po::value<std::string>()->default_value("en"),
        "the language code for the audio")
       //
@@ -47,6 +50,7 @@ ParseResult ParseArguments(int argc, char* argv[]) {
 
   auto const path = vm["path"].as<std::string>();
   auto const bitrate = vm["bitrate"].as<int>();
+  auto const ptime = vm["ptime"].as<int>();
   auto const language_code = vm["language-code"].as<std::string>();
   // Validate the command-line options.
   if (path.empty()) {
@@ -57,11 +61,19 @@ ParseResult ParseArguments(int argc, char* argv[]) {
         "--bitrate option must be a positive number, value=" +
         std::to_string(bitrate));
   }
+  if (ptime < 0) {
+    throw std::runtime_error(
+        "--ptime option must be a positive number, value=" +
+        std::to_string(ptime));
+  }
 
   ParseResult result;
   result.path = path;
   result.config.set_language_code(language_code);
   result.config.set_sample_rate_hertz(bitrate);
+  result.bitrate = 0;
+  result.ptime = 0;
+  result.sample_size = 0;
 
   // Use the audio file extension to configure the encoding.
   auto const ext = [&] {
@@ -75,8 +87,14 @@ ParseResult ParseArguments(int argc, char* argv[]) {
 
   if (ext.empty() || ext == ".raw") {
     result.config.set_encoding(RecognitionConfig::LINEAR16);
+    result.bitrate = bitrate;
+    result.ptime = ptime;
+    result.sample_size = 2;
   } else if (ext == ".ulaw") {
     result.config.set_encoding(RecognitionConfig::MULAW);
+    result.bitrate = bitrate;
+    result.ptime = ptime;
+    result.sample_size = 1;
   } else if (ext == ".flac") {
     result.config.set_encoding(RecognitionConfig::FLAC);
   } else if (ext == ".amr") {

@@ -23,6 +23,9 @@
 struct ParseResult {
   // Recognition settings; ParseArguments fills in the language code, the
   // sample rate, and the encoding chosen from the audio file extension.
   google::cloud::speech::v1::RecognitionConfig config;
   // Audio file path taken from the "path" command-line option.
   std::string path;
+  // The fields below are filled in by ParseArguments for LINEAR16 and MULAW
+  // input. They default to 0 so that a default-constructed ParseResult never
+  // carries indeterminate values.
+  //
+  // Sample rate in Hz, taken from --bitrate.
+  int bitrate = 0;
+  // Packetization time in milliseconds, taken from --ptime; 0 means "not set".
+  int ptime = 0;
+  // Size of one audio sample in bytes: 2 for LINEAR16, 1 for MULAW.
+  int sample_size = 0;
 };
 
 ParseResult ParseArguments(int argc, char* argv[]);

@@ -27,25 +27,34 @@ using RecognizeStream = ::google::cloud::AsyncStreamingReadWriteRpc<
     speech::v1::StreamingRecognizeResponse>;
 
 // Command-line synopsis for the streaming_transcribe sample: --bitrate and
 // --ptime are optional, and the audio file extension selects the encoding.
 auto constexpr kUsage = R"""(Usage:
-  streaming_transcribe [--bitrate N] audio.(raw|ulaw|flac|amr|awb)
+  streaming_transcribe [--bitrate N] [--ptime N] audio.(raw|ulaw|flac|amr|awb)
 )""";
 
-// Write the audio in 64k chunks at a time, simulating audio content arriving
-// from a microphone.
-void MicrophoneThreadMain(RecognizeStream& stream,
-                          std::string const& file_path) {
+// Write one audio packet every ptime ms (or 64k bytes every second when
+// ptime is 0), simulating audio content arriving from a microphone.
+void MicrophoneThreadMain(RecognizeStream& stream, std::string const& file_path,
+                          const int bitrate, const int ptime,
+                          const int sample_size) {
   speech::v1::StreamingRecognizeRequest request;
   std::ifstream file_stream(file_path, std::ios::binary);
-  auto constexpr kChunkSize = 64 * 1024;
-  std::vector<char> chunk(kChunkSize);
+  // By default, read 64k bytes every 1 second
+  auto bytes_n = 64 * 1024;
+  auto wait_ms = 1000;
+  // If ptime is configured read packet every ptime ms (may only be set for
+  // "raw" and "ulaw")
+  if (ptime) {
+    wait_ms = ptime;
+    bytes_n = (bitrate / 1000) * sample_size * ptime;
+  }
+  std::vector<char> chunk(bytes_n);
   while (true) {
     // Read another chunk from the file.
     file_stream.read(chunk.data(), chunk.size());
     auto const bytes_read = file_stream.gcount();
     // And write the chunk to the stream.
     if (bytes_read > 0) {
       request.set_audio_content(chunk.data(), bytes_read);
-      std::cout << "Sending " << bytes_read / 1024 << "k bytes." << std::endl;
+      std::cout << "Sending " << bytes_read << " bytes." << std::endl;
       if (!stream.Write(request, grpc::WriteOptions()).get()) break;
     }
     if (!file_stream) {
@@ -54,7 +63,7 @@ void MicrophoneThreadMain(RecognizeStream& stream,
       break;
     }
     // Wait one interval (wait_ms milliseconds) before writing the next chunk.
-    std::this_thread::sleep_for(std::chrono::seconds(1));
+    std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
   }
 }
 
@@ -65,6 +74,9 @@ int main(int argc, char** argv) try {
   // Parse command line arguments.
   auto args = ParseArguments(argc, argv);
   auto const file_path = args.path;
+  auto const bitrate = args.bitrate;
+  auto const ptime = args.ptime;
+  auto const sample_size = args.sample_size;
 
   speech::v1::StreamingRecognizeRequest request;
   auto& streaming_config = *request.mutable_streaming_config();
@@ -81,8 +93,8 @@ int main(int argc, char** argv) try {
   }
 
   // Simulate a microphone thread using the file as input.
-  auto microphone =
-      std::thread(MicrophoneThreadMain, std::ref(*stream), file_path);
+  auto microphone = std::thread(MicrophoneThreadMain, std::ref(*stream),
+                                file_path, bitrate, ptime, sample_size);
   // Read responses.
   auto read = [&stream] { return stream->Read().get(); };
   for (auto response = read(); response.has_value(); response = read()) {