Kaldi/SpeechRecognition/README.md (22 additions, 17 deletions)
@@ -1,6 +1,6 @@
-# Kaldi ASR Integration With TensorRT Inference Server
+# Kaldi ASR Integration With Triton
 
-This repository provides a Kaldi ASR custom backend for the NVIDIA TensorRT Inference Server (TRTIS). It can be used to demonstrate high-performance online inference on Kaldi ASR models. This includes handling the gRPC communication between the TensorRT Inference Server and clients, and the dynamic batching of inference requests. This repository is tested and maintained by NVIDIA.
+This repository provides a Kaldi ASR custom backend for NVIDIA Triton (formerly TensorRT Inference Server). It can be used to demonstrate high-performance online inference on Kaldi ASR models. This includes handling the gRPC communication between Triton and clients, and the dynamic batching of inference requests. This repository is tested and maintained by NVIDIA.
 
 ## Table Of Contents
@@ -33,9 +33,9 @@ This repository provides a Kaldi ASR custom backend for the NVIDIA TensorRT Infe
 
 This repository provides a wrapper around the online GPU-accelerated ASR pipeline from the paper [GPU-Accelerated Viterbi Exact Lattice Decoder for Batched Online and Offline Speech Recognition](https://arxiv.org/abs/1910.10032). That work includes a high-performance implementation of a GPU HMM Decoder, a low-latency Neural Net driver, fast Feature Extraction for preprocessing, and new ASR pipelines tailored for GPUs. These different modules have been integrated into the Kaldi ASR framework.
 
-This repository contains a TensorRT Inference Server custom backend for the Kaldi ASR framework. This custom backend calls the high-performance online GPU pipeline from the Kaldi ASR framework. This TensorRT Inference Server integration provides ease-of-use to Kaldi ASR inference: gRPC streaming server, dynamic sequence batching, and multi-instances support. A client connects to the gRPC server, streams audio by sending chunks to the server, and gets back the inferred text as an answer (see [Input/Output](#input-output)). More information about the TensorRT Inference Server can be found [here](https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/).
+This repository contains a Triton custom backend for the Kaldi ASR framework. This custom backend calls the high-performance online GPU pipeline from the Kaldi ASR framework. This Triton integration makes Kaldi ASR inference easy to use: it provides a gRPC streaming server, dynamic sequence batching, and multi-instance support. A client connects to the gRPC server, streams audio by sending chunks to the server, and gets back the inferred text as an answer (see [Input/Output](#input-output)). More information about Triton can be found [here](https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/).
 
-This TensorRT Inference Server integration is meant to be used with the LibriSpeech model for demonstration purposes. We include a pre-trained version of this model to allow you to easily test this work (see [Quick Start Guide](#quick-start-guide)). Both the TensorRT Inference Server integration and the underlying Kaldi ASR online GPU pipeline are a work in progress and will support more functionalities in the future. This includes online iVectors not currently supported in the Kaldi ASR GPU online pipeline and being replaced by a zero vector (see [Known issues](#known-issues)). Support for a custom Kaldi model is experimental (see [Using a custom Kaldi model](#using-custom-kaldi-model)).
+This Triton integration is meant to be used with the LibriSpeech model for demonstration purposes. We include a pre-trained version of this model to allow you to easily test this work (see [Quick Start Guide](#quick-start-guide)). Both the Triton integration and the underlying Kaldi ASR online GPU pipeline are a work in progress and will support more functionalities in the future. Support for a custom Kaldi model is experimental (see [Using a custom Kaldi model](#using-custom-kaldi-model)).
 
 ### Reference model
@@ -60,7 +60,7 @@ Details about parameters can be found in the [Parameters](#parameters) section.
 
 ### Requirements
 
-This repository contains Dockerfiles which extends the Kaldi and TensorRT Inference Server NVIDIA GPU Cloud (NGC) containers and encapsulates some dependencies. Aside from these dependencies, ensure you have [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) installed.
+This repository contains Dockerfiles that extend the Kaldi and Triton NVIDIA GPU Cloud (NGC) containers and encapsulate some dependencies. Aside from these dependencies, ensure you have [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) installed.
 
 For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
@@ -108,7 +108,7 @@ The following command will stream 1000 parallel streams to the server. The `-p`
 
 ### Parameters
 
-The configuration is done through the `config.pbtxt` file available in `model-repo/` directory. It allows you to specify the following:
+The configuration is done through the `config.pbtxt` file available in the `model-repo/kaldi_online/` directory. It allows you to specify the following:
 
 #### Model path
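
For orientation, the sketch below shows the general shape such a `config.pbtxt` can take. It is hypothetical: the parameter keys (`nnet3_rxfilename`, `fst_rxfilename`, `beam`), paths, and values are illustrative assumptions rather than the actual contents of `model-repo/kaldi_online/config.pbtxt`, so refer to that file for the real settings.

```
# Hypothetical excerpt of model-repo/kaldi_online/config.pbtxt.
# All keys and values below are illustrative assumptions, not the
# repository's actual configuration.
name: "kaldi_online"
platform: "custom"                # served by the custom Kaldi backend library
max_batch_size: 400               # upper bound for dynamic sequence batching
parameters [
  {
    key: "nnet3_rxfilename"       # assumed key for the acoustic model file
    value: { string_value: "/data/models/LibriSpeech/final.mdl" }
  },
  {
    key: "fst_rxfilename"         # assumed key for the decoding graph
    value: { string_value: "/data/models/LibriSpeech/HCLG.fst" }
  },
  {
    key: "beam"                   # assumed key for the decoder beam width
    value: { string_value: "10" }
  }
]
```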
@@ -141,7 +141,7 @@ The inference engine configuration parameters configure the inference engine. Th
 
 ### Inference process
 
-Inference is done through simulating concurrent users. Each user is attributed to one utterance from the LibriSpeech dataset. It streams that utterance by cutting it into chunks and gets the final `TEXT` output once the final chunk has been sent. A parameter sets the number of active users being simulated in parallel.
+Inference is done by simulating concurrent users. Each user is assigned one utterance from the LibriSpeech dataset, streams that utterance by cutting it into chunks, and gets the final `TEXT` output once the final chunk has been sent. The `-c` parameter sets the number of active users being simulated in parallel.
 
 ### Client command-line parameters
@@ -187,7 +187,8 @@ Even if only the best path is used, we are still generating a full lattice for b
 
 Support for Kaldi ASR models that are different from the provided LibriSpeech model is experimental. However, it is possible to modify the [Model Path](#model-path) section of the config file `model-repo/kaldi_online/config.pbtxt` to set up your own model.
 
-The models and Kaldi allocators are currently not shared between instances. This means that if your model is large, you may end up with not enough memory on the GPU to store two different instances. If that's the case, you can set `count` to `1` in the `instance_group` section of the config file.
+The models and Kaldi allocators are currently not shared between instances. This means that if your model is large, you may end up with not enough memory on the GPU to store two different instances. If that's the case,
+you can set `count` to `1` in the [`instance_group` section](https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/model_configuration.html#instance-groups) of the config file.
 
 ## Performance
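
As a concrete illustration of the workaround above, a single-instance `instance_group` entry can look like the following sketch. The field names follow the standard Triton model-configuration syntax, but the specific values are assumptions, not the repository's shipped config.

```
# Hypothetical excerpt of model-repo/kaldi_online/config.pbtxt:
# keep a single model instance so only one copy of the model and
# its Kaldi allocators occupies GPU memory.
instance_group [
  {
    count: 1        # one instance instead of several
    kind: KIND_GPU
    gpus: [ 0 ]     # optionally pin the instance to a specific GPU
  }
]
```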
@@ -218,16 +219,17 @@ Our results were obtained by:
 
 1. Building and starting the server as described in [Quick Start Guide](#quick-start-guide).
 2. Running `scripts/run_inference_all_v100.sh` and `scripts/run_inference_all_t4.sh`
+
 | GPU | Realtime I/O | Number of parallel audio channels | Throughput (RTFX) | Latency ||||
 
 * Using the latest Kaldi GPU ASR pipeline, extended support for features (ivectors, fbanks)
-Only mfcc features are supported at this time. The reference model used in the benchmark scripts requires both mfcc and iVector features to deliver the best accuracy. Support for iVector features will be added in a future release.