13 changes: 9 additions & 4 deletions Dockerfile
@@ -1,12 +1,17 @@
FROM conda/miniconda3

RUN apt-get update && \
    apt-get install -y libsndfile1
RUN apt update && apt install -y g++

# Copy requirements.txt and run pip first so that changes to the application
# code do not require a rebuild of the entire image
COPY requirements.txt /app/
RUN conda update conda && \
    conda install "keras<2.4" "numpy<2" "scikit-learn<0.23" && \
    conda install -c conda-forge librosa theano

ADD . /app
WORKDIR /app

VOLUME /data

RUN pip install --upgrade pip && \
    pip install -r requirements.txt
ENV KERAS_BACKEND=theano
44 changes: 42 additions & 2 deletions README.md
@@ -39,7 +39,8 @@ This repository provides the [keras](https://keras.io/) model to be used from Py
[Docker](https://www.docker.com/) makes it easy to reproduce the results and install all requirements. If you have docker installed, run the following steps to predict a count from the provided test sample.

* Build the docker image: `docker build -t countnet .`
* Predict from example: `docker run -i countnet python predict.py --model CRNN examples/5_speakers.wav`
* Run like this: `docker run -it countnet python predict.py ...` (see usage details below)
* Mount your data into the container: `docker run -v /path/to/your/data:/data -it countnet python predict.py ... /data/your_audio.wav`

### Manual Installation

@@ -49,7 +50,46 @@ To install the requirements using Anaconda Python, run

You can now run the command line script and process wav files using the pre-trained model `CRNN` (best performance).

`python predict.py examples/5_speakers.wav --model CRNN`
```
python predict.py --model CRNN examples/5_speakers.wav
# => Speaker Count Estimate: examples/5_speakers.wav 5
```

You can also pass multiple files at once.

```
python predict.py --model CRNN examples/5_speakers.wav examples/5_speakers.wav
# => Speaker Count Estimate: examples/5_speakers.wav 5
# => Speaker Count Estimate: examples/5_speakers.wav 5
```

There is also a simple JSON API for sending audio data to the model (not production-ready; for development only!). To run the server:

```
python predict_api.py --model CRNN

# With Docker:
docker run -p5000:5000 -it countnet python predict_api.py --model CRNN
```

The server expects a JSON list of base64-encoded 16 kHz float32 audio arrays and returns a JSON list of integers. If estimation fails for any of the arrays, its result is set to `null` instead.

```py
import base64
import requests
import librosa

audio_data1 = librosa.core.load("/path/to/5_speakers.wav", sr=16000, dtype="float32")[0]
response = requests.post(
    "http://localhost:5000",
    json=[
        # base64 output is bytes; decode to str so it is JSON serializable
        base64.b64encode(audio_data1.tobytes()).decode("ascii")
    ]
)
print(response.json())
# => [5]
```
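
On the server side, each list entry is decoded back into a float32 NumPy array before being passed to the model (see `decode_audio` in `predict_api.py` below). A minimal sketch of that round trip, handy for checking a payload encoding locally; the silent 5-second buffer is just a stand-in for real audio:

```py
import base64
import numpy as np

# stand-in input: 5 seconds of silence at 16 kHz
audio = np.zeros(5 * 16000, dtype="float32")

# client side: float32 samples -> raw bytes -> base64 string (JSON-safe)
payload = base64.b64encode(audio.tobytes()).decode("ascii")

# server side, mirroring decode_audio in predict_api.py
decoded = np.frombuffer(base64.b64decode(payload), dtype="float32")
assert np.array_equal(audio, decoded)
```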


## Reproduce Paper Results using the LibriCount Dataset
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1216072.svg)](https://doi.org/10.5281/zenodo.1216072)
57 changes: 34 additions & 23 deletions predict.py
@@ -1,5 +1,4 @@
import numpy as np
import soundfile as sf
import argparse
import os
import keras
@@ -20,6 +19,25 @@ def class_mae(y_true, y_pred):
    )


def load_scaler():
    scaler = sklearn.preprocessing.StandardScaler()
    with np.load(os.path.join("models", 'scaler.npz')) as data:
        scaler.mean_ = data['arr_0']
        scaler.scale_ = data['arr_1']
    return scaler


def load_model(model_name):
    path = os.path.join('models', model_name + '.h5')
    return keras.models.load_model(
        path,
        custom_objects={
            'class_mae': class_mae,
            'exp': K.exp
        }
    )


def count(audio, model, scaler):
    # compute STFT
    X = np.abs(librosa.stft(audio, n_fft=400, hop_length=160)).T
@@ -51,38 +69,31 @@ def count(audio, model, scaler):

    parser.add_argument(
        'audio',
        help='audio file (samplerate 16 kHz) of 5 seconds duration'
        help='audio file (samplerate 16 kHz) of 5 seconds duration',
        nargs='+',
    )

    parser.add_argument(
        '--model', default='CRNN',
        help='model name'
    )

    parser.add_argument('--print-summary', action='store_true')

    args = parser.parse_args()

    # load model
    model = keras.models.load_model(
        os.path.join('models', args.model + '.h5'),
        custom_objects={
            'class_mae': class_mae,
            'exp': K.exp
        }
    )
    model = load_model(args.model)

    # print model configuration
    model.summary()
    # save as svg file
    # load standardisation parameters
    scaler = sklearn.preprocessing.StandardScaler()
    with np.load(os.path.join("models", 'scaler.npz')) as data:
        scaler.mean_ = data['arr_0']
        scaler.scale_ = data['arr_1']
    if args.print_summary:
        # print model configuration
        model.summary()

    # compute audio
    audio, rate = sf.read(args.audio, always_2d=True)
    # load standardisation parameters
    scaler = load_scaler()

    # downmix to mono
    audio = np.mean(audio, axis=1)
    estimate = count(audio, model, scaler)
    print("Speaker Count Estimate: ", estimate)
    for f in args.audio:
        # compute audio
        audio = librosa.load(f, sr=16000)[0]
        estimate = count(audio, model, scaler)
        print("Speaker Count Estimate:", f, estimate)
45 changes: 45 additions & 0 deletions predict_api.py
@@ -0,0 +1,45 @@
import base64
import json
import numpy as np
from werkzeug.wrappers import Request, Response
import predict


def decode_audio(audio_bytes):
    return np.frombuffer(base64.b64decode(audio_bytes), dtype="float32")


def make_app(estimate_func):
    def app(environ, start_response):
        inputs = json.loads(Request(environ).get_data())

        outputs = []
        for inp in inputs:
            try:
                est = int(estimate_func(decode_audio(inp)))
            except Exception as e:
                print(f"Error estimating speaker count for input {len(outputs)}: {e}")
                est = None
            outputs.append(est)

        return Response(json.dumps(outputs))(environ, start_response)

    return app


if __name__ == "__main__":
import argparse
import functools
from werkzeug.serving import run_simple

parser = argparse.ArgumentParser(
description="Run simple JSON api server to predict speaker count"
)
parser.add_argument("--model", default="CRNN", help="model name")
args = parser.parse_args()

model = predict.load_model(args.model)
scaler = predict.load_scaler()

app = make_app(functools.partial(predict.count, model=model, scaler=scaler))
run_simple("0.0.0.0", 5000, app, use_debugger=True)
67 changes: 0 additions & 67 deletions predict_audio.py

This file was deleted.

8 changes: 0 additions & 8 deletions requirements.txt

This file was deleted.