-
Notifications
You must be signed in to change notification settings - Fork 0
/
all_in_one_config.yaml
130 lines (114 loc) · 2.97 KB
/
all_in_one_config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Autoencoder training settings.
# NOTE(review): leading indentation is missing in this copy, so every key below
# currently parses as a top-level key. `root_path` and `params` presumably nest
# under `autoencoder_training`, but which of the following keys belong inside
# `params` cannot be determined from here -- restore nesting from the consumer.
autoencoder_training:
root_path: /path/to/tfrecord/images # directory containing the pre-converted "*.tfrecord" files
params:
batch_size: 3
image_size: 256
keys: ["image"] # use ["image"] to train autoencoders; use ["image", "caption"] for the txt2img latent diffusion model
autoencoder_type: "vq" # one of ["kl", "vq"]
ckpt_path: "aevq" # checkpoint path where the trained model will be saved, e.g. "aekl" or "aevq"
num_iterations: 500000 # number of training iterations
# Latent diffusion model (LDM) training settings.
# NOTE(review): leading indentation is missing in this copy, so every key below
# currently parses as a top-level key. `root_path` and `params` presumably nest
# under `ldm_training`, but which of the following keys belong inside `params`
# cannot be determined from here -- restore nesting from the consumer.
ldm_training:
root_path: /path/to/tfrecord/images_captions
params:
batch_size: 1
image_size: 256
flip: false
keys: ["image", "caption"] # use ["image"] to train autoencoders; use ["image", "caption"] for the txt2img latent diffusion model
autoencoder_type: "kl" # one of ["kl", "vq"]
ckpt_path: "ldm"
num_iterations: 500000
train_cond_model: false
condition_dropout_rate: 0.1
# Latent diffusion model (LDM) sampling settings.
# NOTE(review): leading indentation is missing in this copy, so every key below
# currently parses as a top-level key. It cannot be determined from here whether
# `pre_ckpt_paths` nests under `ldm_sampling`, or whether `lpips_ckpt_path`
# nests under `pre_ckpt_paths` -- restore nesting from the consumer.
ldm_sampling:
guidance_scale: 5.
latent_shape: [4, 32, 32, 4]
sample_save_progress: false
text_prompt: "a virus monster is playing guitar, oil on canvas"
vocab_dir: bert_model # directory containing the `vocab.txt` file
autoencoder_type: "kl" # one of ["kl", "vq"]
# Pretrained checkpoint paths
pre_ckpt_paths:
cond_stage_model: transformer-1
unet: unet-1
autoencoder: autoencoder-1
lpips_ckpt_path: lpips.ckpt-1
# Loss/trainer settings grouped under `autoencoder_kl_trainer`.
# Children are indented so they nest under this key instead of leaking to the
# top level, where they would collide with same-named keys of other sections.
autoencoder_kl_trainer:
  # NOTE(review): presumably the global step at which the discriminator term
  # becomes active -- confirm against the trainer code.
  global_step_discriminator: 50001
  lpips_weight: 1.
  kl_weight: 1.e-6
  discriminator_weight: 0.5
  discriminator_factor: 1.
  discriminator_loss_type: hinge  # or vanilla
# Loss/trainer settings grouped under `autoencoder_vq_trainer`.
# Children are indented so they nest under this key instead of leaking to the
# top level, where they would collide with same-named keys of other sections.
autoencoder_vq_trainer:
  # NOTE(review): presumably the global step at which the discriminator term
  # becomes active -- confirm against the trainer code.
  global_step_discriminator: 1
  codebook_weight: 1.
  lpips_weight: 1.
  kl_weight: 1.
  discriminator_weight: 0.6
  discriminator_factor: 1.
  discriminator_loss_type: hinge  # or vanilla
# Hyperparameters grouped under `cond_stage_model`.
# Children are indented so they nest under this key rather than becoming
# top-level keys that clash with other sections (e.g. `dropout_rate`, `num_heads`).
cond_stage_model:
  vocab_size: 30522  # computed by tokenizer, do not change
  encoder_stack_size: 32
  hidden_size: 1280
  num_heads: 8
  size_per_head: 64
  max_seq_len: 77
  filter_size: 5120
  dropout_rate: 0.1
# Architecture settings grouped under `autoencoder_kl`.
# Children are indented so they nest under this key rather than becoming
# top-level keys that clash with `autoencoder_vq` (which reuses the same names).
autoencoder_kl:
  latent_channels: 4
  channels: 128
  num_blocks: 2
  attention_resolutions: []  # empty list: no attention resolutions configured
  dropout_rate: 0.
  multipliers: [1, 2, 4, 4]
  resample_with_conv: true
# Settings grouped under `ae_kl_discriminator`.
# Children are indented so they nest under this key rather than becoming
# top-level keys that clash with other sections (e.g. `channels`, `num_layers`).
ae_kl_discriminator:
  channels: 64
  num_layers: 3
# Architecture settings grouped under `autoencoder_vq`.
# Children are indented so they nest under this key rather than becoming
# top-level keys that clash with `autoencoder_kl` (which reuses the same names).
autoencoder_vq:
  latent_channels: 4
  channels: 128
  num_blocks: 2
  attention_resolutions: [32]
  dropout_rate: 0.
  multipliers: [1, 2, 2, 4]
  resample_with_conv: true
  # NOTE(review): `vocab_size` and `beta` look like vector-quantization
  # codebook size and commitment weight -- confirm against the model code.
  vocab_size: 16384
  beta: 0.25
# Settings grouped under `ae_vq_discriminator`.
# Children are indented so they nest under this key rather than becoming
# top-level keys that clash with other sections (e.g. `channels`, `num_layers`).
ae_vq_discriminator:
  channels: 64
  num_layers: 2
# Architecture settings grouped under `unet`.
# Children are indented so they nest under this key rather than becoming
# top-level keys that clash with other sections (e.g. `num_blocks`, `num_heads`).
unet:
  model_channels: 320
  out_channels: 4
  num_blocks: 2
  attention_resolutions: [4, 2, 1]
  dropout_rate: 0.1
  channel_mult: [1, 2, 4, 4]
  num_heads: 8
# Settings grouped under `ldm`.
# Children are indented so they nest under this key rather than becoming
# top-level keys.
# NOTE(review): key names suggest a diffusion noise schedule (`num_steps`,
# `beta_start`, `beta_end`) plus DDIM sampling options (`eta`,
# `num_ddim_steps`) -- confirm against the consumer.
ldm:
  num_steps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  v_posterior: 0.
  scale_factor: 0.18215
  eta: 0.
  num_ddim_steps: 50
# Optimizer hyperparameters grouped under `autoencoder_optimizer`.
# Children are indented so they nest under this key rather than becoming
# top-level keys that clash with the other `*_optimizer` sections.
autoencoder_optimizer:
  learning_rate: 4.5e-6
  beta_1: 0.5
  beta_2: 0.9
  epsilon: 1.e-8
# Optimizer hyperparameters grouped under `discriminator_optimizer`.
# Children are indented so they nest under this key rather than becoming
# top-level keys that clash with the other `*_optimizer` sections.
discriminator_optimizer:
  learning_rate: 4.5e-6
  beta_1: 0.5
  beta_2: 0.9
  epsilon: 1.e-8
# Optimizer hyperparameters grouped under `latent_diffusion_optimizer`.
# Children are indented so they nest under this key rather than becoming
# top-level keys that clash with the other `*_optimizer` sections.
latent_diffusion_optimizer:
  learning_rate: 5.0e-05
  beta_1: 0.9
  beta_2: 0.999
  epsilon: 1.e-8
  weight_decay: 0.01