13 changes: 9 additions & 4 deletions Dockerfile
@@ -1,12 +1,17 @@
FROM conda/miniconda3

RUN apt-get update && \
    apt-get install -y libsndfile1
RUN apt update && apt install -y g++

# Copy requirements.txt and run pip first so that changes to the application
# code do not require a rebuild of the entire image
COPY requirements.txt /app/
RUN conda update conda && \
    conda install "keras<2.4" "numpy<2" "scikit-learn<0.23" && \
    conda install -c conda-forge librosa theano

ADD . /app
WORKDIR /app

VOLUME /data

RUN pip install --upgrade pip && \
    pip install -r requirements.txt
ENV KERAS_BACKEND=theano
44 changes: 42 additions & 2 deletions README.md
@@ -39,7 +39,8 @@ This repository provides the [keras](https://keras.io/) model to be used from Py
[Docker](https://www.docker.com/) makes it easy to reproduce the results and install all requirements. If you have docker installed, run the following steps to predict a count from the provided test sample.

* Build the docker image: `docker build -t countnet .`
* Predict from example: `docker run -i countnet python predict.py --model CRNN examples/5_speakers.wav`
* Run like this: `docker run -it countnet python predict.py ...` (see usage details below)
* Mount your data into the container: `docker run -v /path/to/your/data:/data -it countnet python predict.py ... /data/your_audio.wav`

### Manual Installation

@@ -49,7 +50,46 @@ To install the requirements using Anaconda Python, run

You can now run the command line script and process wav files using the pre-trained model `CRNN` (best performance).

`python predict.py examples/5_speakers.wav --model CRNN`
```
python predict.py --model CRNN examples/5_speakers.wav
# => Speaker Count Estimate: examples/5_speakers.wav 5
```

You can also pass multiple files at once.

```
python predict.py --model CRNN examples/5_speakers.wav examples/5_speakers.wav
# => Speaker Count Estimate: examples/5_speakers.wav 5
# => Speaker Count Estimate: examples/5_speakers.wav 5
```

There is also a simple JSON API for sending audio data to the model (not production-ready; for development only!). To run the server:

```
python predict_api.py --model CRNN

# With Docker:
docker run -p5000:5000 -it countnet python predict_api.py --model CRNN
```

The server expects a JSON list of base64-encoded 16 kHz float32 audio arrays and returns a JSON list of integers. If estimation fails for any of the arrays, its result is set to `null` instead.

```py
import base64
import requests
import librosa

audio_data1 = librosa.core.load("/path/to/5_speakers.wav", sr=16000, dtype="float32")[0]
response = requests.post(
    "http://localhost:5000",
    json=[
        # base64 output is bytes; decode to str so it is JSON serializable
        base64.b64encode(audio_data1.tobytes()).decode("ascii")
    ]
)
print(response.json())
# => [5]
```
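
On the server side, each list entry is decoded back into a float32 NumPy array before being passed to the model (see `decode_audio` in `predict_api.py` below). A minimal sketch of that round trip, handy for checking a payload encoding locally; the silent 5-second buffer is just a stand-in for real audio:

```py
import base64
import numpy as np

# stand-in input: 5 seconds of silence at 16 kHz
audio = np.zeros(5 * 16000, dtype="float32")

# client side: float32 samples -> raw bytes -> base64 string (JSON-safe)
payload = base64.b64encode(audio.tobytes()).decode("ascii")

# server side, mirroring decode_audio in predict_api.py
decoded = np.frombuffer(base64.b64decode(payload), dtype="float32")
assert np.array_equal(audio, decoded)
```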


## Reproduce Paper Results using the LibriCount Dataset
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1216072.svg)](https://doi.org/10.5281/zenodo.1216072)
57 changes: 34 additions & 23 deletions predict.py
@@ -1,5 +1,4 @@
import numpy as np
import soundfile as sf
import argparse
import os
import keras
@@ -20,6 +19,25 @@ def class_mae(y_true, y_pred):
    )


def load_scaler():
    scaler = sklearn.preprocessing.StandardScaler()
    with np.load(os.path.join("models", 'scaler.npz')) as data:
        scaler.mean_ = data['arr_0']
        scaler.scale_ = data['arr_1']
    return scaler


def load_model(model_name):
    path = os.path.join('models', model_name + '.h5')
    return keras.models.load_model(
        path,
        custom_objects={
            'class_mae': class_mae,
            'exp': K.exp
        }
    )


def count(audio, model, scaler):
    # compute STFT
    X = np.abs(librosa.stft(audio, n_fft=400, hop_length=160)).T
@@ -51,38 +69,31 @@ def count(audio, model, scaler):

    parser.add_argument(
        'audio',
        help='audio file (samplerate 16 kHz) of 5 seconds duration'
        help='audio file (samplerate 16 kHz) of 5 seconds duration',
        nargs='+',
    )

    parser.add_argument(
        '--model', default='CRNN',
        help='model name'
    )

    parser.add_argument('--print-summary', action='store_true')

    args = parser.parse_args()

    # load model
    model = keras.models.load_model(
        os.path.join('models', args.model + '.h5'),
        custom_objects={
            'class_mae': class_mae,
            'exp': K.exp
        }
    )
    model = load_model(args.model)

    # print model configuration
    model.summary()
    # save as svg file
    # load standardisation parameters
    scaler = sklearn.preprocessing.StandardScaler()
    with np.load(os.path.join("models", 'scaler.npz')) as data:
        scaler.mean_ = data['arr_0']
        scaler.scale_ = data['arr_1']
    if args.print_summary:
        # print model configuration
        model.summary()

    # compute audio
    audio, rate = sf.read(args.audio, always_2d=True)
    # load standardisation parameters
    scaler = load_scaler()

    # downmix to mono
    audio = np.mean(audio, axis=1)
    estimate = count(audio, model, scaler)
    print("Speaker Count Estimate: ", estimate)
    for f in args.audio:
        # compute audio
        audio = librosa.load(f, sr=16000)[0]
        estimate = count(audio, model, scaler)
        print("Speaker Count Estimate:", f, estimate)
45 changes: 45 additions & 0 deletions predict_api.py
@@ -0,0 +1,45 @@
import base64
import json
import numpy as np
from werkzeug.wrappers import Request, Response
import predict


def decode_audio(audio_bytes):
    return np.frombuffer(base64.b64decode(audio_bytes), dtype="float32")


def make_app(estimate_func):
    def app(environ, start_response):
        inputs = json.loads(Request(environ).get_data())

        outputs = []
        for inp in inputs:
            try:
                est = int(estimate_func(decode_audio(inp)))
            except Exception as e:
                print(f"Error estimating speaker count for input {len(outputs)}: {e}")
                est = None
            outputs.append(est)

        return Response(json.dumps(outputs))(environ, start_response)

    return app


if __name__ == "__main__":
import argparse
import functools
from werkzeug.serving import run_simple

parser = argparse.ArgumentParser(
description="Run simple JSON api server to predict speaker count"
)
parser.add_argument("--model", default="CRNN", help="model name")
args = parser.parse_args()

model = predict.load_model(args.model)
scaler = predict.load_scaler()

app = make_app(functools.partial(predict.count, model=model, scaler=scaler))
run_simple("0.0.0.0", 5000, app, use_debugger=True)
67 changes: 0 additions & 67 deletions predict_audio.py

This file was deleted.

8 changes: 0 additions & 8 deletions requirements.txt

This file was deleted.