diff --git a/config.yaml b/config.yaml deleted file mode 100644 index 7679a00..0000000 --- a/config.yaml +++ /dev/null @@ -1,82 +0,0 @@ -wave_file_save_path: "./output" - -id: - version: "v1" - name: "default" - root: "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml" - -model: - device: "cuda" - reload_from_ckpt: "/mnt/fast/nobackup/scratch4weeks/hl01486/exps/audio_generation/stablediffusion/LDM/audioverse/2023_01_14_full_F4_B_spatial_v2_v1/checkpoints/last.ckpt" - target: audioldm.pipline.LatentDiffusion - params: - base_learning_rate: 5.0e-6 - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: fbank - cond_stage_key: waveform - latent_t_size: 256 # TODO might need to change - latent_f_size: 16 - channels: 8 # TODO might need to change - cond_stage_trainable: true - conditioning_key: film - monitor: val/loss_simple_ema - scale_by_std: true - unet_config: - target: audioldm.latent_diffusion.openaimodel.UNetModel - params: - image_size: 64 # TODO here - extra_film_condition_dim: 512 - extra_film_use_concat: true - in_channels: 8 # TODO might need to change - out_channels: 8 # TODO might need to change - model_channels: 128 # TODO might need to change - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 5 - num_head_channels: 32 - use_spatial_transformer: true - - first_stage_config: - base_learning_rate: 4.5e-05 - target: audioldm.variational_autoencoder.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - image_key: fbank - subband: 1 - embed_dim: 8 - time_shuffle: 1 - ddconfig: - double_z: true - z_channels: 8 - resolution: 256 - downsample_time: false - in_channels: 1 - out_ch: 1 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - - cond_stage_config: - target: audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2 - params: - # pretrained_path: /mnt/fast/nobackup/users/hl01486/projects/contrastive_pretraining/CLAP/assets/checkpoints/epoch_top_0_audioset_no_fusion.pt - key: waveform - sampling_rate: 16000 - embed_mode: audio - unconditional_prob: 0.1 \ No newline at end of file