Add Docker environment & web demo #9

Merged
merged 2 commits on Nov 4, 2021

8 changes: 6 additions & 2 deletions README.md
@@ -1,9 +1,10 @@
# Wave-U-Net (Pytorch)
<a href="https://replicate.ai/f90/wave-u-net-pytorch"><img src="https://img.shields.io/static/v1?label=Replicate&message=Demo and Docker Image&color=darkgreen" height=20></a>

Improved version of the [Wave-U-Net](https://arxiv.org/abs/1806.03185) for audio source separation, implemented in Pytorch.

Click [here](www.github.com/f90/Wave-U-Net) for the original Wave-U-Net implementation in Tensorflow.
You can find more information about the model and results there as well.

# Improvements

@@ -24,7 +25,9 @@ GPU strongly recommended to avoid very long training times.
System requirements:
* Linux-based OS
* Python 3.6

* [libsndfile](http://mega-nerd.com/libsndfile/)

* [ffmpeg](https://www.ffmpeg.org/)
* CUDA 10.1 for GPU usage

@@ -68,6 +71,7 @@ You can of course use your own datasets for training, but for this you would nee
# Training the models

To train a Wave-U-Net, the basic command to use is

```
python3.6 train.py --dataset_dir /PATH/TO/MUSDB18HQ
```
@@ -86,7 +90,7 @@ After training, the model is evaluated on the MUSDB18HQ test set, and SDR/SIR/SA

# <a name="test"></a> Test trained models on songs!

We provide the default model in a pre-trained form as a download so you can separate your own songs right away.

## Downloading our pretrained models

20 changes: 20 additions & 0 deletions cog.yaml
@@ -0,0 +1,20 @@
build:
  python_version: "3.6"
  gpu: false
  python_packages:
    - future==0.18.2
    - numpy==1.19.5
    - librosa==0.8.1
    - soundfile==0.10.3.post1
    - musdb==0.4.0
    - museval==0.4.0
    - h5py==3.1.0
    - tqdm==4.62.1
    - torch==1.4.0
    - torchvision==0.5.0
    - tensorboard==2.6.0
    - sortedcontainers==2.4.0
  system_packages:
    - libsndfile-dev
    - ffmpeg
predict: "cog_predict.py:waveunetPredictor"
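
For a quick local sanity check (not part of this PR), an invocation along these lines should exercise the new config once [Cog](https://github.com/replicate/cog) is installed. The exact flag syntax varies between Cog versions, `input` is the parameter name declared in `cog_predict.py`, and the audio path is a placeholder:

```
cog predict -i input=@/path/to/mixture.wav
```
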
144 changes: 144 additions & 0 deletions cog_predict.py
@@ -0,0 +1,144 @@
import os
import cog
import tempfile
import zipfile
from pathlib import Path
import argparse
import data.utils
import model.utils as model_utils
from test import predict_song
from model.waveunet import Waveunet


class waveunetPredictor(cog.Predictor):
    def setup(self):
        """Init wave u net model"""
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--instruments",
            type=str,
            nargs="+",
            default=["bass", "drums", "other", "vocals"],
            help='List of instruments to separate (default: "bass drums other vocals")',
        )
        parser.add_argument(
            "--cuda", action="store_true", help="Use CUDA (default: False)"
        )
        parser.add_argument(
            "--features",
            type=int,
            default=32,
            help="Number of feature channels per layer",
        )
        parser.add_argument(
            "--load_model",
            type=str,
            default="checkpoints/waveunet/model",
            help="Reload a previously trained model",
        )
        parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
        parser.add_argument(
            "--levels", type=int, default=6, help="Number of DS/US blocks"
        )
        parser.add_argument(
            "--depth", type=int, default=1, help="Number of convs per block"
        )
        parser.add_argument("--sr", type=int, default=44100, help="Sampling rate")
        parser.add_argument(
            "--channels", type=int, default=2, help="Number of input audio channels"
        )
        parser.add_argument(
            "--kernel_size",
            type=int,
            default=5,
            help="Filter width of kernels. Has to be an odd number",
        )
        parser.add_argument(
            "--output_size", type=float, default=2.0, help="Output duration"
        )
        parser.add_argument(
            "--strides", type=int, default=4, help="Strides in Waveunet"
        )
        parser.add_argument(
            "--conv_type",
            type=str,
            default="gn",
            help="Type of convolution (normal, BN-normalised, GN-normalised): normal/bn/gn",
        )
        parser.add_argument(
            "--res",
            type=str,
            default="fixed",
            help="Resampling strategy: fixed sinc-based lowpass filtering or learned conv layer: fixed/learned",
        )
        parser.add_argument(
            "--separate",
            type=int,
            default=1,
            help="Train separate model for each source (1) or only one (0)",
        )
        parser.add_argument(
            "--feature_growth",
            type=str,
            default="double",
            help="How the features in each layer should grow, either (add) the initial number of features each time, or multiply by 2 (double)",
        )
        """
        parser.add_argument('--input', type=str, default=str(input),
                            help="Path to input mixture to be separated")
        parser.add_argument('--output', type=str, default=out_path, help="Output path (same folder as input path if not set)")
        """
        args = parser.parse_args([])
        self.args = args

        num_features = (
            [args.features * i for i in range(1, args.levels + 1)]
            if args.feature_growth == "add"
            else [args.features * 2 ** i for i in range(0, args.levels)]
        )
        target_outputs = int(args.output_size * args.sr)
        self.model = Waveunet(
            args.channels,
            num_features,
            args.channels,
            args.instruments,
            kernel_size=args.kernel_size,
            target_output_size=target_outputs,
            depth=args.depth,
            strides=args.strides,
            conv_type=args.conv_type,
            res=args.res,
            separate=args.separate,
        )

        if args.cuda:
            self.model = model_utils.DataParallel(self.model)
print("move model to gpu")
self.model.cuda()

print("Loading model from checkpoint " + str(args.load_model))
state = model_utils.load_model(self.model, None, args.load_model, args.cuda)
print("Step", state["step"])

@cog.input("input", type=Path, help="audio mixture path")
def predict(self, input):
"""Separate tracks from input mixture audio"""

out_path = Path(tempfile.mkdtemp())
zip_path = Path(tempfile.mkdtemp()) / "output.zip"

preds = predict_song(self.args, input, self.model)

out_names = []
for inst in preds.keys():
temp_n = os.path.join(
str(out_path), os.path.basename(str(input)) + "_" + inst + ".wav"
)
data.utils.write_wav(temp_n, preds[inst], self.args.sr)
out_names.append(temp_n)

with zipfile.ZipFile(str(zip_path), "w") as zf:
for i in out_names:
zf.write(str(i))

return zip_path
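
The predictor can also be exercised directly from Python, outside of Cog's CLI and HTTP wrappers. The snippet below is a hypothetical sketch, not part of this PR: it assumes Cog is installed, that the pretrained checkpoint has been downloaded to `checkpoints/waveunet/model` (the parser default above), and that a mixture file exists at the placeholder path.

```
from pathlib import Path

from cog_predict import waveunetPredictor

predictor = waveunetPredictor()
predictor.setup()  # builds the Wave-U-Net with the defaults above and loads the checkpoint
result_zip = predictor.predict(Path("/path/to/mixture.wav"))  # placeholder input path
print("Separated stems written to", result_zip)  # zip containing one wav per instrument
```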