Commit
separate identification and verification; fix bugs
shaojinding committed Aug 31, 2020
1 parent 7051c3e commit 86c2412
Showing 20 changed files with 473 additions and 63 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -4,4 +4,4 @@ models/__pycache__/**
__pycache__/**
config/__pycache__/**
data_objects/__pycache__/**
-logs_scratch/**
+logs_scratch/**
4 changes: 1 addition & 3 deletions config/default.py
@@ -31,12 +31,11 @@
# DATASET related params
_C.DATASET = CN()
_C.DATASET.DATA_DIR = ''
-_C.DATASET.DATASET = ''
+_C.DATASET.SUB_DIR = ''
_C.DATASET.TEST_DATA_DIR = ''
-_C.DATASET.TEST_DATASET = ''
_C.DATASET.NUM_WORKERS = 0
_C.DATASET.PARTIAL_N_FRAMES = 32
_C.DATASET.FEATURE_DIM = 40


# train
@@ -60,7 +59,6 @@
_C.TRAIN.END_EPOCH = 140


-
def update_config(cfg, args):
    cfg.defrost()
    cfg.merge_from_file(args.cfg)
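The new SUB_DIR option selects which preprocessed feature split a run reads. It is threaded through DeepSpeakerDataset below and resolves to DATA_DIR/feature/<SUB_DIR>, so identification and verification runs can share a single DATA_DIR.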
4 changes: 2 additions & 2 deletions data_objects/DeepSpeakerDataset.py
@@ -17,10 +17,10 @@ def find_classes(speakers):

class DeepSpeakerDataset(data.Dataset):

-    def __init__(self, data_dir, partial_n_frames, partition=None, is_test=False):
+    def __init__(self, data_dir, sub_dir, partial_n_frames, partition=None, is_test=False):
        super(DeepSpeakerDataset, self).__init__()
        self.data_dir = data_dir
-        self.root = data_dir.joinpath('feature')
+        self.root = data_dir.joinpath('feature', sub_dir)
        self.partition = partition
        self.partial_n_frames = partial_n_frames
        self.is_test = is_test
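A minimal usage sketch of the new signature, mirroring the call sites in train_baseline_identification.py further down (the path is a placeholder):

from pathlib import Path
from data_objects.DeepSpeakerDataset import DeepSpeakerDataset

data_dir = Path('/path/to/VoxCeleb1')
# 'merged' for identification runs, 'dev' for verification training
train_set = DeepSpeakerDataset(data_dir, 'merged', 300, 'train')
# features are now read from /path/to/VoxCeleb1/feature/merged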
7 changes: 4 additions & 3 deletions data_objects/VoxcelebTestset.py
@@ -9,7 +9,7 @@ def get_test_paths(pairs_path, db_dir):
    def convert_folder_name(path):
        basename = os.path.splitext(path)[0]
        items = basename.split('/')
-        speaker_dir = 'wav_{}'.format(items[0])
+        speaker_dir = items[0]
        fname = '{}_{}.npy'.format(items[1], items[2])
        p = os.path.join(speaker_dir, fname)
        return p
@@ -38,12 +38,13 @@ def convert_folder_name(path):

    return path_list

+
class VoxcelebTestset(data.Dataset):
    def __init__(self, data_dir, partial_n_frames):
        super(VoxcelebTestset, self).__init__()
        self.data_dir = data_dir
-        self.root = data_dir.joinpath('feature')
-        self.test_pair_txt_fpath = data_dir.joinpath('veri_test.txt')
+        self.root = data_dir.joinpath('feature', 'test')
+        self.test_pair_txt_fpath = data_dir.joinpath('veri_test2.txt')
        self.test_pairs = get_test_paths(self.test_pair_txt_fpath, self.root)
        self.partial_n_frames = partial_n_frames
        mean = np.load(self.data_dir.joinpath('mean.npy'))
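For context, each line of the VoxCeleb pair file is 'label path1 path2'; convert_folder_name above maps such a wav path to the matching feature file. A worked example with hypothetical pair values:

# line in the pair file:  1 id10270/x6uYqmx31kE/00001.wav id10270/8jEAjG6SegY/00008.wav
# items       = ['id10270', 'x6uYqmx31kE', '00001']
# speaker_dir = 'id10270'   (previously 'wav_id10270')
# fname       = 'x6uYqmx31kE_00001.npy'
# p           = 'id10270/x6uYqmx31kE_00001.npy'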
4 changes: 1 addition & 3 deletions data_objects/params_data.py
@@ -8,9 +8,7 @@
## Audio
sampling_rate = 16000
# Number of spectrogram frames in a partial utterance
-partials_n_frames = 160 # 1600 ms
-# Number of spectrogram frames at inference
-inference_n_frames = 80 # 800 ms
+partials_n_frames = 300 # 3000 ms


## Audio volume normalization
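At the 10 ms frame step implied by the existing comments (160 frames = 1600 ms), this lengthens each partial utterance from 1.6 s to 300 × 10 ms = 3 s, and the separate, shorter inference window is dropped so training and inference use the same length.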
6 changes: 3 additions & 3 deletions data_objects/preprocess.py
@@ -65,7 +65,7 @@ def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
    # Function to preprocess utterances for one speaker
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
-        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+        speaker_name = speaker_dir.parts[-1]

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
@@ -121,7 +121,7 @@ def preprocess_speaker(speaker_dir: Path):
print("Done preprocessing %s.\n" % dataset_name)


def preprocess_voxceleb1(dataset_root: Path, out_dir: Path, skip_existing=False):
def preprocess_voxceleb1(dataset_root: Path, parition: str, out_dir: Path, skip_existing=False):
# Initialize the preprocessing
dataset_name = "VoxCeleb1"
dataset_root, logger = _init_preprocess_dataset(dataset_name, dataset_root, out_dir)
@@ -140,7 +140,7 @@ def preprocess_voxceleb1(dataset_root: Path, out_dir: Path, skip_existing=False)
          (len(keep_speaker_ids), len(nationalities)))

    # Get the speaker directories for anglophone speakers only
-    speaker_dirs = dataset_root.joinpath("wav").glob("*")
+    speaker_dirs = dataset_root.joinpath(partition).glob("*")
    speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs]

    print("VoxCeleb1: found %d anglophone speakers on the disk." %
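With speaker_name reduced to the directory's base name and the partition passed through, the preprocessed features should land in a layout roughly like this (a sketch; the speaker IDs are examples):

/path/to/VoxCeleb1/feature/
    dev/
        id10001/
            <video>_<utterance>.npy
    test/
        id10270/
            <video>_<utterance>.npy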
21 changes: 16 additions & 5 deletions data_preprocess.py
@@ -3,6 +3,7 @@
from data_objects.partition_voxceleb import partition_voxceleb
from pathlib import Path
import argparse
+import subprocess

if __name__ == "__main__":
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
@@ -19,15 +20,25 @@ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
    args = parser.parse_args()

    # Process the arguments
-    out_dir = args.dataset_root.joinpath("feature")
+    dev_out_dir = args.dataset_root.joinpath("feature", "dev")
+    test_out_dir = args.dataset_root.joinpath("feature", "test")
+    merged_out_dir = args.dataset_root.joinpath("feature", "merged")
    assert args.dataset_root.exists()
    assert args.dataset_root.joinpath('iden_split.txt').exists()
    assert args.dataset_root.joinpath('veri_test.txt').exists()
    assert args.dataset_root.joinpath('vox1_meta.csv').exists()
-    out_dir.mkdir(exist_ok=True, parents=True)
+    dev_out_dir.mkdir(exist_ok=True, parents=True)
+    test_out_dir.mkdir(exist_ok=True, parents=True)
+    merged_out_dir.mkdir(exist_ok=True, parents=True)

    # Preprocess the datasets
-    preprocess_voxceleb1(args.dataset_root, out_dir, args.skip_existing)
-    compute_mean_std(out_dir, args.dataset_root.joinpath('mean.npy'), args.dataset_root.joinpath('std.npy'))
-    partition_voxceleb(out_dir, args.dataset_root.joinpath('iden_split.txt'))
+    preprocess_voxceleb1(args.dataset_root, 'dev', dev_out_dir, args.skip_existing)
+    preprocess_voxceleb1(args.dataset_root, 'test', test_out_dir, args.skip_existing)
+    for path in dev_out_dir.iterdir():
+        subprocess.call(['cp', '-r', path.as_posix(), merged_out_dir.as_posix()])
+    for path in test_out_dir.iterdir():
+        subprocess.call(['cp', '-r', path.as_posix(), merged_out_dir.as_posix()])
+    compute_mean_std(merged_out_dir, args.dataset_root.joinpath('mean.npy'),
+                     args.dataset_root.joinpath('std.npy'))
+    partition_voxceleb(merged_out_dir, args.dataset_root.joinpath('iden_split.txt'))
    print("Done")
10 changes: 0 additions & 10 deletions evaluate_verification.py
@@ -1,10 +1,3 @@
-# -*- coding: utf-8 -*-
-# @Date : 2019-08-09
-# @Author : Xinyu Gong (xy_gong@tamu.edu)
-# @Link : None
-# @Version : 0.0
-
-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -41,7 +34,6 @@ def parse_args():
    parser.add_argument('--load_path',
                        help="The path to resumed dir",
                        default=None)
-
    parser.add_argument('--text_arch',
                        help="The path to arch",
                        default=None)
@@ -78,8 +70,6 @@ def main():
        model.drop_path_prob = 0.0
    else:
        model = eval('resnet.{}(num_classes={})'.format(cfg.MODEL.NAME, cfg.MODEL.NUM_CLASSES))
-        # model = Network(cfg.MODEL.NAME, cfg.MODEL.EMBEDDING_DIM, cfg.MODEL.NUM_CLASSES)
-        # model = DeepSpeakerModel(cfg.MODEL.EMBEDDING_DIM, cfg.MODEL.NUM_CLASSES)
    model = model.cuda()

    # resume && make log dir and logger
28 changes: 28 additions & 0 deletions exps/baseline/resnet18_iden.yaml
@@ -0,0 +1,28 @@
PRINT_FREQ: 200
VAL_FREQ: 10

CUDNN:
  BENCHMARK: true
  DETERMINISTIC: false
  ENABLED: true

DATASET:
  DATA_DIR: '/path/to/VoxCeleb1'
  SUB_DIR: 'merged'
  NUM_WORKERS: 0
  PARTIAL_N_FRAMES: 300

TRAIN:
  BATCH_SIZE: 256
  LR: 0.01
  LR_MIN: 0.001
  BETA1: 0.9
  BETA2: 0.999

  BEGIN_EPOCH: 0
  END_EPOCH: 301

MODEL:
  NAME: 'resnet18'
  NUM_CLASSES: 1251
  INIT_CHANNELS: 64
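Presumably this config is launched like the other baselines; a hypothetical invocation, assuming the --cfg flag read by update_config:

python train_baseline_identification.py --cfg exps/baseline/resnet18_iden.yaml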
exps/baseline/resnet18.yaml → exps/baseline/resnet18_veri.yaml
@@ -7,15 +7,15 @@ CUDNN:
  ENABLED: true

DATASET:
-  DATA_DIR: '/path/to/VoxCeleb'
+  DATA_DIR: '/path/to/VoxCeleb1'
+  SUB_DIR: 'dev'
  NUM_WORKERS: 0
  PARTIAL_N_FRAMES: 300

TRAIN:
  BATCH_SIZE: 256
  LR: 0.01
  LR_MIN: 0.001
-  WD: 0.0003
  BETA1: 0.9
  BETA2: 0.999

28 changes: 28 additions & 0 deletions exps/baseline/resnet34_iden.yaml
@@ -0,0 +1,28 @@
PRINT_FREQ: 200
VAL_FREQ: 10

CUDNN:
  BENCHMARK: true
  DETERMINISTIC: false
  ENABLED: true

DATASET:
  DATA_DIR: '/path/to/VoxCeleb1'
  SUB_DIR: 'merged'
  NUM_WORKERS: 0
  PARTIAL_N_FRAMES: 300

TRAIN:
  BATCH_SIZE: 128
  LR: 0.01
  LR_MIN: 0.001
  BETA1: 0.9
  BETA2: 0.999

  BEGIN_EPOCH: 0
  END_EPOCH: 301

MODEL:
  NAME: 'resnet34'
  NUM_CLASSES: 1251
  INIT_CHANNELS: 64
exps/baseline/resnet34.yaml → exps/baseline/resnet34_veri.yaml
@@ -7,15 +7,15 @@ CUDNN:
  ENABLED: true

DATASET:
-  DATA_DIR: '/path/to/VoxCeleb'
+  DATA_DIR: '/path/to/VoxCeleb1'
+  SUB_DIR: 'dev'
  NUM_WORKERS: 0
  PARTIAL_N_FRAMES: 300

TRAIN:
-  BATCH_SIZE: 256
+  BATCH_SIZE: 128
  LR: 0.01
  LR_MIN: 0.001
-  WD: 0.0003
  BETA1: 0.9
  BETA2: 0.999

7 changes: 3 additions & 4 deletions exps/scratch/scratch.yaml → exps/scratch/scratch_iden.yaml
@@ -7,16 +7,15 @@ CUDNN:
  ENABLED: true

DATASET:
-  DATA_DIR: '/path/to/VoxCeleb'
-  DATASET: 'train'
+  DATA_DIR: '/path/to/VoxCeleb1'
+  SUB_DIR: 'merged'
  NUM_WORKERS: 0
  PARTIAL_N_FRAMES: 300

TRAIN:
-  BATCH_SIZE: 48
+  BATCH_SIZE: 96
  LR: 0.01
  LR_MIN: 0.001
-  WD: 0.0003
  BETA1: 0.9
  BETA2: 0.999

29 changes: 29 additions & 0 deletions exps/scratch/scratch_veri.yaml
@@ -0,0 +1,29 @@
PRINT_FREQ: 200
VAL_FREQ: 10

CUDNN:
  BENCHMARK: true
  DETERMINISTIC: false
  ENABLED: true

DATASET:
  DATA_DIR: '/path/to/VoxCeleb1'
  SUB_DIR: 'dev'
  NUM_WORKERS: 0
  PARTIAL_N_FRAMES: 300

TRAIN:
  BATCH_SIZE: 48
  LR: 0.01
  LR_MIN: 0.001
  BETA1: 0.9
  BETA2: 0.999

  BEGIN_EPOCH: 0
  END_EPOCH: 301

MODEL:
  NAME: 'model'
  NUM_CLASSES: 1211
  LAYERS: 8
  INIT_CHANNELS: 64
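Taken together, the configs encode the separation named in the commit title: the *_iden.yaml files train on SUB_DIR 'merged' with NUM_CLASSES 1251 (all VoxCeleb1 speakers, since the identification protocol splits utterances rather than speakers), while the *_veri.yaml files train on 'dev' with NUM_CLASSES 1211, holding out the 40 verification-test speakers referenced by the pair list.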
4 changes: 2 additions & 2 deletions functions.py
@@ -205,8 +205,8 @@ def validate_identification(cfg, model, test_loader, criterion):
            target = target.cuda(non_blocking=True)

            # compute output
-            outputs = model(input)
-            output = torch.mean(outputs, dim=0, keepdim=True)
+            output = model(input)
+            output = torch.mean(output, dim=0, keepdim=True)
            output = model.forward_classifier(output)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            top1.update(acc1[0], input.size(0))
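Besides removing the easily misread outputs/output pair, this block shows the identification evaluation scheme: each test utterance arrives as a stack of partial windows, the per-window embeddings are mean-pooled, and only the pooled vector is classified. A shape-level sketch with hypothetical sizes:

output = model(input)                             # (n_windows, embedding_dim)
output = torch.mean(output, dim=0, keepdim=True)  # (1, embedding_dim)
output = model.forward_classifier(output)         # (1, num_classes) speaker logits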
3 changes: 1 addition & 2 deletions search.py
@@ -84,8 +84,7 @@ def main():
    # Optimizer
    optimizer = optim.Adam(
        weight_params,
-        lr=cfg.TRAIN.LR,
-        weight_decay=cfg.TRAIN.WD,
+        lr=cfg.TRAIN.LR
    )

    # resume && make log dir and logger
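This matches the yaml changes above: with WD removed from the configs, the Adam optimizers here and in train_baseline_identification.py run with PyTorch's default weight_decay=0.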
7 changes: 3 additions & 4 deletions train_baseline.py → train_baseline_identification.py
@@ -63,8 +63,7 @@ def main():
    model = model.cuda()
    optimizer = optim.Adam(
        model.net_parameters() if hasattr(model, 'net_parameters') else model.parameters(),
-        lr=cfg.TRAIN.LR,
-        weight_decay=cfg.TRAIN.WD,
+        lr=cfg.TRAIN.LR
    )

    # Loss
@@ -100,9 +99,9 @@

    # dataloader
    train_dataset = DeepSpeakerDataset(
-        Path(cfg.DATASET.DATA_DIR), cfg.DATASET.PARTIAL_N_FRAMES, 'train')
+        Path(cfg.DATASET.DATA_DIR), cfg.DATASET.SUB_DIR, cfg.DATASET.PARTIAL_N_FRAMES, 'train')
    test_dataset_identification = DeepSpeakerDataset(
-        Path(cfg.DATASET.DATA_DIR), cfg.DATASET.PARTIAL_N_FRAMES, 'test', is_test=True)
+        Path(cfg.DATASET.DATA_DIR), cfg.DATASET.SUB_DIR, cfg.DATASET.PARTIAL_N_FRAMES, 'test', is_test=True)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE,