Commit 479795e by Jan Buethe, committed Apr 25, 2024 (1 parent: 0dc559f).
Showing 16 changed files with 932 additions and 31 deletions.
@@ -0,0 +1,85 @@
import os
import argparse

import numpy as np
from scipy.io import wavfile
import resampy


parser = argparse.ArgumentParser()

parser.add_argument("filelist", type=str, help="file with filenames for concatenation in WAVE format")
parser.add_argument("target_fs", type=int, help="target sampling rate of concatenated file")
parser.add_argument("output", type=str, help="binary output file (int16)")
parser.add_argument("--basedir", type=str, help="base directory for filenames in filelist, defaults to ./", default="./")
parser.add_argument("--normalize", action="store_true", help="apply random normalization")
parser.add_argument("--db_max", type=float, help="max dB for random normalization", default=0)
parser.add_argument("--db_min", type=float, help="min dB for random normalization", default=0)
parser.add_argument("--verbose", action="store_true")

def read_filelist(basedir, filelist):
    with open(filelist, "r") as f:
        files = f.readlines()

    fullfiles = [os.path.join(basedir, f.rstrip('\n')) for f in files if len(f.rstrip('\n')) > 0]

    return fullfiles
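For reference, a hypothetical filelist (one WAVE path per line, resolved relative to --basedir; blank lines are ignored):

speech/utt_0001.wav
speech/utt_0002.wav
music/track_0001.wav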

def read_wave(file, target_fs):
    fs, x = wavfile.read(file)

    if fs < target_fs:
        # files below the target rate are skipped rather than up-sampled
        print(f"[read_wave] warning: skipping {file}, which would need up-sampling from {fs} to {target_fs} Hz")
        return None

    if fs != target_fs:
        x = resampy.resample(x, fs, target_fs)

    return x.astype(np.float32)

def random_normalize(x, db_min, db_max, max_val=2**15 - 1):
    # draw a target peak level in dB relative to full scale and
    # rescale the signal so its absolute maximum hits that level
    db = np.random.uniform(db_min, db_max)
    m = np.abs(x).max()
    c = 10**(db/20) * max_val / m

    return c * x
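As a quick check with hypothetical values: db_min = db_max = -6 pins db at -6, so the peak of the returned signal is 10**(-6/20) * 32767 ≈ 16423, about half of int16 full scale.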


def concatenate(filelist: list, output: str, target_fs: int, normalize=True, db_min=0, db_max=0, verbose=False):

    # 40 samples at 8 kHz, i.e. a 5 ms cross-fade between consecutive files
    overlap_size = int(40 * target_fs / 8000)
    overlap_mem = np.zeros(overlap_size, dtype=np.float32)
    # fade-out window for the tail of the previous file and its mirror
    # image as fade-in window for the head of the current one
    overlap_win1 = (0.5 + 0.5 * np.cos(np.arange(0, overlap_size) * np.pi / overlap_size)).astype(np.float32)
    overlap_win2 = np.flipud(overlap_win1)

    with open(output, 'wb') as f:
        for file in filelist:
            x = read_wave(file, target_fs)
            if x is None: continue

            if len(x) < 10 * overlap_size:
                if verbose: print(f"skipping {file}...")
                continue
            elif verbose:
                print(f"processing {file}...")

            if normalize:
                x = random_normalize(x, db_min, db_max)

            # cross-fade the head of this file with the saved tail of the previous one
            x1 = x[:-overlap_size]
            x1[:overlap_size] = overlap_win1 * overlap_mem + overlap_win2 * x1[:overlap_size]

            f.write(x1.astype(np.int16).tobytes())

            # save the trailing samples of the full signal for the next cross-fade
            overlap_mem = x[-overlap_size:]

if __name__ == "__main__":
    args = parser.parse_args()

    filelist = read_filelist(args.basedir, args.filelist)

    concatenate(filelist, args.output, args.target_fs, normalize=args.normalize, db_min=args.db_min, db_max=args.db_max, verbose=args.verbose)
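The script's filename is not visible in this scrape; assuming it is saved as concatenate.py, a typical invocation to build a 16 kHz training signal would be:

python concatenate.py filelist.txt 16000 signal_16kHz.s16 --basedir /data/wav --normalize --db_min -30 --db_max 0

With --normalize, each file is scaled to a random peak level drawn from [--db_min, --db_max] dB before the cross-fade.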
@@ -1,2 +1,3 @@
from .silk_enhancement_set import SilkEnhancementSet
from .lpcnet_vocoding_dataset import LPCNetVocodingDataset
from .simple_bwe_dataset import SimpleBWESet
@@ -0,0 +1,85 @@
""" | ||
/* Copyright (c) 2024 Amazon | ||
Written by Jan Buethe */ | ||
/* | ||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions | ||
are met: | ||
- Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
- Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the distribution. | ||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
*/ | ||
""" | ||

import os

from torch.utils.data import Dataset
import numpy as np

from utils.bwe_features import bwe_feature_factory


class SimpleBWESet(Dataset):
    FRAME_SIZE_16K = 160
    def __init__(self,
                 path,
                 frames_per_sample=100,
                 spec_num_bands=32,
                 max_instafreq_bin=40
                 ):

        self.frames_per_sample = frames_per_sample
        # paired raw int16 signals at 16 and 48 kHz
        self.signal_16k = np.fromfile(os.path.join(path, 'signal_16kHz.s16'), dtype=np.int16)
        self.signal_48k = np.fromfile(os.path.join(path, 'signal_48kHz.s16'), dtype=np.int16)

        # a 16 kHz frame of 160 samples corresponds to 480 samples at 48 kHz
        num_frames = min(len(self.signal_16k) // self.FRAME_SIZE_16K,
                         len(self.signal_48k) // (3 * self.FRAME_SIZE_16K))

        self.create_features = bwe_feature_factory(spec_num_bands=spec_num_bands, max_instafreq_bin=max_instafreq_bin)

        # skip the first frames so that 320 samples of history are always available
        self.frame_offset = 4

        self.len = (num_frames - self.frame_offset) // frames_per_sample

    def __len__(self):
        return self.len

    def __getitem__(self, index):

        frame_start = self.frames_per_sample * index + self.frame_offset
        frame_stop = frame_start + self.frames_per_sample

        signal_start16 = frame_start * self.FRAME_SIZE_16K
        signal_stop16 = frame_stop * self.FRAME_SIZE_16K

        # normalize int16 samples to [-1, 1)
        x_16 = self.signal_16k[signal_start16 : signal_stop16].astype(np.float32) / 2**15
        history_16 = self.signal_16k[signal_start16 - 320 : signal_start16].astype(np.float32) / 2**15

        # the 48 kHz target spans the same time interval at 3x the rate
        x_48 = self.signal_48k[3 * signal_start16 : 3 * signal_stop16].astype(np.float32) / 2**15

        features = self.create_features(
            x_16,
            history_16
        )

        return {
            'features': features,
            'x_16': x_16,
            'x_48': x_48,
        }
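A minimal sketch of how the dataset might be consumed; the directory name and batch size below are assumptions, not part of this commit:

from torch.utils.data import DataLoader

# 'data/bwe_train' stands in for a directory holding signal_16kHz.s16 and signal_48kHz.s16
dataset = SimpleBWESet('data/bwe_train')
loader = DataLoader(dataset, batch_size=256, shuffle=True)

batch = next(iter(loader))
print(batch['x_16'].shape)  # (256, 16000): 100 frames of 160 samples at 16 kHz
print(batch['x_48'].shape)  # (256, 48000): the same 1 s span at 48 kHz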
@@ -0,0 +1,95 @@
import torch
from tqdm import tqdm
import sys


def train_one_epoch(model, criterion, optimizer, dataloader, device, scheduler, log_interval=10):

    model.to(device)
    model.train()

    running_loss = 0
    previous_running_loss = 0

    with tqdm(dataloader, unit='batch', file=sys.stdout) as tepoch:

        for i, batch in enumerate(tepoch):

            # set gradients to zero
            optimizer.zero_grad()

            # push batch to device
            for key in batch:
                batch[key] = batch[key].to(device)

            target = batch['x_48']

            # calculate model output
            output = model(batch['x_16'].unsqueeze(1), batch['features'])

            # calculate loss
            loss = criterion(target, output.squeeze(1))

            # calculate gradients
            loss.backward()

            # update weights
            optimizer.step()

            # update learning rate
            scheduler.step()

            # sparsification
            if hasattr(model, 'sparsifier'):
                model.sparsifier()

            # update running loss
            running_loss += float(loss.cpu())

            # update status bar
            if i % log_interval == 0:
                tepoch.set_postfix(running_loss=f"{running_loss/(i + 1):8.7f}", current_loss=f"{(running_loss - previous_running_loss)/log_interval:8.7f}")
                previous_running_loss = running_loss

    running_loss /= len(dataloader)

    return running_loss

def evaluate(model, criterion, dataloader, device, log_interval=10):

    model.to(device)
    model.eval()

    running_loss = 0
    previous_running_loss = 0

    with torch.no_grad():
        with tqdm(dataloader, unit='batch', file=sys.stdout) as tepoch:

            for i, batch in enumerate(tepoch):

                # push batch to device
                for key in batch:
                    batch[key] = batch[key].to(device)

                target = batch['x_48']

                # calculate model output
                output = model(batch['x_16'].unsqueeze(1), batch['features'])

                # calculate loss
                loss = criterion(target, output.squeeze(1))

                # update running loss
                running_loss += float(loss.cpu())

                # update status bar
                if i % log_interval == 0:
                    tepoch.set_postfix(running_loss=f"{running_loss/(i + 1):8.7f}", current_loss=f"{(running_loss - previous_running_loss)/log_interval:8.7f}")
                    previous_running_loss = running_loss

    running_loss /= len(dataloader)

    return running_loss
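For context, a sketch of how these two functions could be wired into a full loop; the model class, loss, optimizer settings, and epoch count are placeholders rather than values from this commit:

import torch
from torch.utils.data import DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = MyBWEModel()            # hypothetical stand-in for the actual BWE model
criterion = torch.nn.MSELoss()  # stand-in for the loss used in training
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
# scheduler.step() runs once per batch inside train_one_epoch, so schedule over steps
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: 1 / (1 + 2.5e-5 * step))

train_loader = DataLoader(train_set, batch_size=256, shuffle=True)  # train_set: a SimpleBWESet
val_loader = DataLoader(val_set, batch_size=256)

for epoch in range(20):
    train_loss = train_one_epoch(model, criterion, optimizer, train_loader, device, scheduler)
    val_loss = evaluate(model, criterion, val_loader, device)
    print(f"epoch {epoch}: train {train_loss:.6f}, val {val_loss:.6f}")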
@@ -0,0 +1,18 @@
import torch
import yaml
import argparse


parser = argparse.ArgumentParser()
parser.add_argument('checkpoint', type=str, help='model checkpoint')
parser.add_argument('setup', type=str, help='setup filename')

if __name__ == "__main__":
    args = parser.parse_args()

    ckpt = torch.load(args.checkpoint, map_location='cpu')

    setup = ckpt['setup']

    with open(args.setup, "w") as f:
        yaml.dump(setup, f)
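The script's filename is not shown here; assuming it is saved as dump_setup.py, it would be run as

python dump_setup.py checkpoint.pth setup.yml

to extract the training setup stored in the checkpoint's 'setup' entry and write it out as YAML.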