// config/autoencoderkl.json

{
  "base_config": "config/base.json",
  "model_type": "AutoencoderKL",
  "task_type": "tta",
  "dataset": [
    "AudioCaps"
  ],
  "preprocess": {
    // Features used for model training (every flag below is set to false in this config)
    "use_spkid": false,
    "use_uv": false,
    "use_frame_pitch": false,
    "use_phone_pitch": false,
    "use_frame_energy": false,
    "use_phone_energy": false,
    "use_mel": false,
    "use_audio": false,
    "use_label": false,
    "use_one_hot": false
  },
  // Model: AutoencoderKL network settings ("autoencoderkl") and loss settings ("loss")
  "model": {
    "autoencoderkl": {
      "ch": 128,
      "ch_mult": [
        1,
        1,
        2,
        2,
        4
      ],
      "num_res_blocks": 2,
      "in_channels": 1,
      "z_channels": 4,
      "out_ch": 1,
      "double_z": true
    },
    "loss": {
      "kl_weight": 1e-8,
      "disc_weight": 0.5,
      "disc_factor": 1.0,
      "logvar_init": 0.0,
      "min_adapt_d_weight": 0.0,
      "max_adapt_d_weight": 10.0,
      "disc_start": 50001,
      "disc_in_channels": 1,
      "disc_num_layers": 3,
      "use_actnorm": false
    }
  },
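  // Training: learning-rate-on-plateau scheduler settings ("lronPlateau") and Adam optimizer settings ("adam")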
  "train": {
    "lronPlateau": {
      "factor": 0.9,
      "patience": 100,
      "min_lr": 4.0e-5,
      "verbose": true
    },
    "adam": {
      "lr": 4.0e-4,
      "betas": [
        0.9,
        0.999
      ],
      "weight_decay": 1.0e-2,
      "eps": 1.0e-8
    }
  }
}