Merged
Changes from all commits
247 commits
3f1bb7d
Abstract class for target/aux computation
sophie-xhonneux Oct 30, 2025
03ed148
Start implementing the EMA Teacher
sophie-xhonneux Oct 31, 2025
28d9b22
adding loss calculator base class
Jubeku Nov 4, 2025
192beb6
Option for constructing teacher model flexibly
sophie-xhonneux Nov 4, 2025
aac7e29
Extract get batch size util function
sophie-xhonneux Nov 5, 2025
145d18a
Fix mismatched dtypes in the target computation
sophie-xhonneux Nov 5, 2025
f1e7132
abstract loss calc structure
Jubeku Nov 5, 2025
e822e12
add abstract method to loss calculator base class
Jubeku Nov 6, 2025
d24ef48
add latent loss class
Jubeku Nov 6, 2025
c259c20
update loss calc config and rename files
Jubeku Nov 7, 2025
a19ee16
restructure loss modules
Jubeku Nov 11, 2025
bf3e128
add ModelOutput dataclass
Jubeku Nov 11, 2025
81bd6eb
NOT WORKING: initial draft for index-based masking. Implemented for r…
clessig Nov 12, 2025
51f437f
NOT WORKING: Finished src, target still to be done.
clessig Nov 13, 2025
e4a9cc0
Masking target is working in principle but errors when feeding data t…
clessig Nov 13, 2025
a581405
Working version for ERA5, NPP-ATMS. Problems with SYNOP with empty ce…
clessig Nov 13, 2025
9229e48
Minor cleanup
clessig Nov 13, 2025
db6f285
Fixed linting
clessig Nov 13, 2025
ec38123
Fixed remaining problems that occurred for NPP-ATMS and SYNOP.
clessig Nov 14, 2025
0634105
Enabled support for forecast. Cleaned up some bits and pieces.
clessig Nov 14, 2025
0fa60db
merge develop
Jubeku Nov 14, 2025
cab9fbe
mv streams_data declaration under if condition
Jubeku Nov 14, 2025
20da555
add weight to loss config, add toy loss class LossPhysicalTwo
Jubeku Nov 14, 2025
391b105
Update Abstract Target class based on needs for SSL losses
sophie-xhonneux Nov 14, 2025
ce6c735
Removing centroids options for embedding that was unused and should n…
clessig Nov 14, 2025
8fa544d
Removed unused parameters
clessig Nov 14, 2025
d7b326b
fixed trainer for multiple terms in losses_all, still need to fix log…
Jubeku Nov 14, 2025
5d127bf
Inversion of target output ordering to match input one in forecast mod…
clessig Nov 16, 2025
3ffdc60
fix _log_terminal
Jubeku Nov 17, 2025
debbb8f
Changes to prepare_logging to apply index inversion
clessig Nov 17, 2025
ae5a2e6
added file with ModelBatch and SampleMetadata dataclasses
shmh40 Nov 17, 2025
7f3c718
Updating config to working version
clessig Nov 17, 2025
beb4d6f
fix logging
Jubeku Nov 17, 2025
761e263
update ViewMetadata spec
shmh40 Nov 17, 2025
047b299
draft changes to allow global local view generation in masker and tok…
shmh40 Nov 17, 2025
7d5c300
draft of training_config in default_config
shmh40 Nov 17, 2025
c733280
change view_metadata to dict in ModelInput
shmh40 Nov 17, 2025
a934f97
NOT WORKING: updating class to handle multiple input steps and improv…
clessig Nov 18, 2025
ab9eecc
Merge branch 'shmh40/dev/1270-idx-global-local' of github.com:ecmwf/W…
clessig Nov 18, 2025
c3b5c3b
Added basic support for multi-step sources.
clessig Nov 18, 2025
668912d
Partially enabled correct handling of multiple input steps.
clessig Nov 18, 2025
33394ff
initialize loss as torch tensor with grad
Jubeku Nov 18, 2025
bda52d8
remove level in hist losses dict
Jubeku Nov 18, 2025
053dddd
rename loss.py to loss_functions.py
Jubeku Nov 18, 2025
d094ad0
rename loss.py to loss_functions.py
Jubeku Nov 18, 2025
8b4cbef
return loss with grads separately to trainer
Jubeku Nov 18, 2025
dd6f85a
Added mode and refactored get_sample_data into separate function.
clessig Nov 18, 2025
d0ef572
modify log names
Jubeku Nov 18, 2025
c6805c4
add loss_functions.py
Jubeku Nov 18, 2025
0ccce9e
merge develop
Jubeku Nov 18, 2025
3f379f9
Abstract class for target/aux computation
sophie-xhonneux Oct 30, 2025
7d4734b
Start implementing the EMA Teacher
sophie-xhonneux Oct 31, 2025
901d292
Option for constructing teacher model flexibly
sophie-xhonneux Nov 4, 2025
7ac9e6b
rm loss_fcts in default config
Jubeku Nov 18, 2025
85fa139
Comments
clessig Nov 18, 2025
c1580c4
Renaming
clessig Nov 18, 2025
3c26ddc
updated default config training_config to allow student-teacher
shmh40 Nov 18, 2025
66cf9cd
added stream id to era5 config
shmh40 Nov 18, 2025
36ea287
slight restructure of ViewMetadata
shmh40 Nov 18, 2025
11ad4e6
basic if statement to yield the student and teacher views
shmh40 Nov 18, 2025
b3dfa2f
merge changes
shmh40 Nov 18, 2025
2536cec
correct imports with new batch.py
shmh40 Nov 18, 2025
15e6635
Extract get batch size util function
sophie-xhonneux Nov 19, 2025
1e41df0
Fix mismatched dtypes in the target computation
sophie-xhonneux Nov 5, 2025
106ce11
Lay groundwork for SSL losses
sophie-xhonneux Nov 5, 2025
3a95584
Add the SSL Loss Processing classes
sophie-xhonneux Nov 6, 2025
8e6fe08
Write part of the TargetProcessing forward
sophie-xhonneux Nov 6, 2025
ea3f22b
Add latent prediction heads to the Model
sophie-xhonneux Nov 7, 2025
6fb7fcd
Adapt forward function for latent prediction heads
sophie-xhonneux Nov 7, 2025
149c8cb
Start piping configs through model, trainer, etc
sophie-xhonneux Nov 7, 2025
2afd1ac
adding dinov2 notice
tjhunter Nov 10, 2025
5b725ab
Draft Student Teacher Loss Calculator
sophie-xhonneux Nov 10, 2025
81caf2a
Use infra provided by Abstract Loss Calc
sophie-xhonneux Nov 11, 2025
3af00b1
Run Ruff
sophie-xhonneux Nov 11, 2025
2c78798
Implemented the first draft of the Cropping feature
wael-mika Oct 29, 2025
e66819f
rough first effort producing global and local views
shmh40 Nov 7, 2025
38f9a93
update to return 6 tuple from iter in multi-stream-data-sampler, with…
shmh40 Nov 7, 2025
594064e
Fix class being in the wrong file
sophie-xhonneux Nov 12, 2025
5191bad
Ensure data pipes through model and target
sophie-xhonneux Nov 12, 2025
b7927c2
Wrap latent state into a dataclass
sophie-xhonneux Nov 14, 2025
c5fec37
Progress on computing the loss on correct dims
sophie-xhonneux Nov 15, 2025
2b5e003
Add views.py and run Ruff
sophie-xhonneux Nov 15, 2025
8b647ee
Close in on completing DINO loss
sophie-xhonneux Nov 17, 2025
f0af4db
Revert "rough first effort producing globaland local views"
sophie-xhonneux Nov 18, 2025
e9b3379
Lint code
sophie-xhonneux Nov 18, 2025
208f4e3
Fix rebase of loss loss_calculator
sophie-xhonneux Nov 19, 2025
31dc658
created function for _get_student_teacher_sample_data which returns t…
shmh40 Nov 19, 2025
a824bfc
Not working draft for restructuring
clessig Nov 19, 2025
dfc03f2
Merge branch 'shmh40/dev/1270-idx-global-local' of github.com:ecmwf/W…
clessig Nov 19, 2025
81cf929
Changes for better student teacher structure
clessig Nov 19, 2025
46147d4
More refactoring
clessig Nov 19, 2025
1e70f5c
More refactoring and cleanup
clessig Nov 19, 2025
1235aab
More refactoring. Code working again.
clessig Nov 19, 2025
4613f7a
Cleaned up parametrization
clessig Nov 19, 2025
9fe94f5
Changes necessary for spoofing flag per IOReaderData
clessig Nov 19, 2025
ed26c02
Changes to have spoofing on a per data reader sample
clessig Nov 19, 2025
6d685c0
Moved _get_student_teacher_masks() so that masks are generated for al…
clessig Nov 19, 2025
848880b
Renaming and minor clean up.
clessig Nov 19, 2025
1b1654c
Added basic support for use of ModelBatch class to define rough struc…
clessig Nov 19, 2025
c1d32fb
linting
clessig Nov 20, 2025
6a96065
Linting
clessig Nov 20, 2025
3bca490
linting
clessig Nov 20, 2025
5d5e999
Linting problems but removed unused ViewMetaData dependence
clessig Nov 20, 2025
e8ccb8d
Added required reflexivity between source and target samples to Batch
clessig Nov 20, 2025
d18cf86
Added todo
clessig Nov 20, 2025
940e7f5
Test for compute time regressions
sophie-xhonneux Nov 20, 2025
7462a26
Prepare for merge
sophie-xhonneux Nov 20, 2025
798e12b
Lint the code
sophie-xhonneux Nov 20, 2025
0452d2e
Merge remote-tracking branch 'origin/jk/develop/loss_calc_base' into …
sophie-xhonneux Nov 20, 2025
5c30656
Lint code
sophie-xhonneux Nov 20, 2025
25f6b08
Lint
sophie-xhonneux Nov 20, 2025
e002405
Fix some basic bugs
Nov 20, 2025
b2be982
fix typo in ModelBatch
shmh40 Nov 20, 2025
b34b6da
collect num_source_samples and num_target_samples, add loop over teac…
shmh40 Nov 20, 2025
87ad45f
add teacher num_views parameter to config
shmh40 Nov 20, 2025
9b702c5
Re-enabling inversion of target ordering.
clessig Nov 20, 2025
1806ae5
tidy up, remove unused build_stream_views in tokenizer_masking
shmh40 Nov 20, 2025
647e4b2
multiple idxs for each teacher, need to confirm for not student case,…
shmh40 Nov 20, 2025
91c3d7a
add max_num_targets to era5
shmh40 Nov 21, 2025
1a418bf
add max_num_samples functionality to tokenizer_masking and pass throu…
shmh40 Nov 21, 2025
0ea0181
Removing spurious code / things that should be merged later
clessig Nov 21, 2025
4ae6a64
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into soph…
clessig Nov 21, 2025
93f66d6
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into soph…
clessig Nov 21, 2025
47b8297
Linting
clessig Nov 21, 2025
ece1dd0
move build_views_for_stream into masker
shmh40 Nov 21, 2025
65b3a26
Merge branch 'sophiex/dev/abstract-class-teacher-1179' into sophiex/d…
sophie-xhonneux Nov 21, 2025
a6f068a
Lint code
sophie-xhonneux Nov 21, 2025
f54b2ae
Rename identity TargetAndAux module
sophie-xhonneux Nov 21, 2025
b9a60f3
tidy up, remove unused arguments, types
shmh40 Nov 21, 2025
2905cb0
fix masking for NPP-ATMS by correctly selecting final timestep mask a…
shmh40 Nov 22, 2025
2d94d44
Make code runnable
sophie-xhonneux Nov 24, 2025
af9a3c1
merge with develop, include trainer idx_inv_rt, merged default_config…
shmh40 Nov 24, 2025
b193a50
updated configs so code runs. Note default config to be overhauled still
shmh40 Nov 24, 2025
181afc0
Draft for model interface
clessig Nov 25, 2025
18e597a
Merge remote-tracking branch 'origin/shmh40/dev/1270-idx-global-local…
sophie-xhonneux Nov 25, 2025
5768d3f
Make code runnable again
sophie-xhonneux Nov 25, 2025
e21d656
Cleaned up and restructured structure. Not working yet with FSDP
clessig Nov 25, 2025
524959c
Fixes for FSDP/DDP
clessig Nov 25, 2025
1b1ffec
Cleaning up, should be merged when needed
clessig Nov 25, 2025
3d28570
Fixes to FSDP
clessig Nov 25, 2025
587eaf5
Fix incorrect args for model loading and removing unused code.
clessig Nov 25, 2025
abb103b
Linting
clessig Nov 25, 2025
330d8be
Removing old code
clessig Nov 25, 2025
79136a3
- Fixing inference arg order
clessig Nov 25, 2025
6d34197
Fixing interface of get_target_aux_calculator
clessig Nov 25, 2025
ca240a8
Fixing call to target aux calculator
clessig Nov 25, 2025
58ba287
Fixes to get_target_aux_calculator
clessig Nov 25, 2025
7c4167f
Remove stale dataclasses
sophie-xhonneux Nov 25, 2025
5bd60bc
Fix MAE
clessig Nov 25, 2025
fa24fc1
very hacky first pass of full masking_strategy_config for the student…
shmh40 Nov 25, 2025
dff96f2
Merge remote-tracking branch 'origin/clessig/dev/abstract-class-teach…
sophie-xhonneux Nov 25, 2025
69d097c
Merge remote-tracking branch 'origin/shmh40/dev/1270-idx-global-local…
sophie-xhonneux Nov 25, 2025
4f8f62b
instructions for sophie
shmh40 Nov 25, 2025
c27156c
add SampleMetaData integration and functionality, and update masker t…
shmh40 Nov 26, 2025
8f8389f
Prepare for another merge
sophie-xhonneux Nov 26, 2025
e0d7346
remove prints, pdb
shmh40 Nov 26, 2025
f477271
Merge remote-tracking branch 'origin/shmh40/dev/1270-idx-global-local…
sophie-xhonneux Nov 26, 2025
92b184f
Save state
sophie-xhonneux Nov 27, 2025
6d909d6
add mask to SampleMetaData and add forecast_dt to Sample so it is acc…
shmh40 Nov 27, 2025
602a2ee
Merge remote-tracking branch 'origin/shmh40/dev/1270-idx-global-local…
sophie-xhonneux Nov 27, 2025
a00fa64
Save state for Seb
sophie-xhonneux Nov 27, 2025
26f7b5b
add diffusion forecast option for the data sampling, and with noise_l…
shmh40 Nov 27, 2025
619b388
Attempt to make the iBOT loss work
sophie-xhonneux Nov 27, 2025
b47b0fa
Merge branch 'shmh40/dev/1270-idx-global-local' of github.com:ecmwf/W…
clessig Nov 28, 2025
5f803e5
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into shmh…
clessig Nov 28, 2025
3e4de7a
Linting
clessig Nov 28, 2025
8ef3a4c
Simplified and clarified handling of default target_aux_calculator
clessig Nov 28, 2025
d8998a9
Linting
clessig Nov 28, 2025
652500a
Linting
clessig Nov 28, 2025
03166a2
Linting
clessig Nov 28, 2025
e41a575
Linting
clessig Nov 28, 2025
0db8b62
Linting
clessig Nov 28, 2025
47750a5
Restoring masking as training_mode in default_config
clessig Nov 28, 2025
bc8d23e
More linting
clessig Nov 28, 2025
6289959
Removed duplicate lines due to merging
clessig Nov 28, 2025
d526dfc
Restored masking as training mode. Not working due to NaN in prediction
clessig Nov 28, 2025
657094a
Fixed problem in engines introduced in recent commits merging develop…
clessig Nov 28, 2025
1a37dd1
remove unused mask generation in diffusion_forecast
shmh40 Nov 28, 2025
6ea07e7
restore masking_strategy to random
shmh40 Nov 28, 2025
4281aff
restore loader_num_workers to 8
shmh40 Nov 28, 2025
15b46e9
fix indentation of else: assert False in _get_sample msds
shmh40 Nov 28, 2025
6fe8561
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into shmh…
clessig Nov 28, 2025
9ae22e8
Pipe data through all ssl loss fns
sophie-xhonneux Nov 28, 2025
dc736e5
merge with dev
tjhunter Dec 2, 2025
2b2c977
linter warnings
tjhunter Dec 2, 2025
c8a2aad
commenting tests
tjhunter Dec 2, 2025
2599ec2
Restructured code so that mask generation and application is cleanly …
clessig Dec 2, 2025
c8a26d7
Commit
clessig Dec 2, 2025
23e0267
Update
clessig Dec 2, 2025
33d9d8d
Merge branch 'shmh40/dev/1270-idx-global-local' of github.com:ecmwf/W…
clessig Dec 2, 2025
9f5e49c
Fixed uv.lock
clessig Dec 2, 2025
3641e1f
Fix for integration test
clessig Dec 2, 2025
9a1a6a9
Re-enabled multi-source training
clessig Dec 3, 2025
402b8de
1390 - Adapt forward pass of new batch object (#1391)
Jubeku Dec 3, 2025
d1f5a21
Merge remote-tracking branch 'origin/shmh40/dev/1270-idx-global-local…
sophie-xhonneux Dec 3, 2025
6b19da9
Merge remote-tracking branch 'origin/shmh40/dev/1270-idx-global-local…
sophie-xhonneux Dec 3, 2025
2cd3971
Completed migration to new batch class by removing reference to old l…
clessig Dec 3, 2025
51754fa
Fixed missing non_blocking=True in to_device()
clessig Dec 3, 2025
69b53a6
Removed old comments
clessig Dec 3, 2025
59510dd
Fixed problem with non_blocking=True
clessig Dec 3, 2025
b69b743
Cleaned up comments and return values a bit
clessig Dec 4, 2025
d36367a
Changed args to embedding
clessig Dec 4, 2025
3f52a8d
Changed core functions to take sample as arg
clessig Dec 4, 2025
9065219
Changed that model takes sample as input
clessig Dec 4, 2025
12bae15
Fixes for diffusion
clessig Dec 4, 2025
7745e47
Switched to lists of model / target strategies
clessig Dec 4, 2025
27ac2bd
Pipe the mask through
sophie-xhonneux Dec 4, 2025
b3b80e2
Merge branch 'shmh40/dev/1270-idx-global-local' into sophiex/dev/ssl-…
sophie-xhonneux Dec 4, 2025
71bc79a
Filter student views for the correct loss
sophie-xhonneux Dec 4, 2025
052b012
Change the masking and msdp to fit student-teacher
sophie-xhonneux Dec 6, 2025
0ddf3f5
Make DINO and iBOT work
sophie-xhonneux Dec 6, 2025
e10855b
Prepare for Model PR introducing class & reg token
sophie-xhonneux Dec 6, 2025
75a4ab2
Integrate the class and register token PR
sophie-xhonneux Dec 7, 2025
63ae111
Fix iBOT loss with correct PredHead
sophie-xhonneux Dec 7, 2025
70c5808
Fix JEPA + Lint code
sophie-xhonneux Dec 7, 2025
995f4c0
Fix DDP
sophie-xhonneux Dec 15, 2025
3dcbe59
Running this code + config for JEPA with DDP
sophie-xhonneux Dec 16, 2025
fa0c5c1
Ran JEPA DDP plot with this
sophie-xhonneux Dec 17, 2025
25a2c0e
Fix FSDP error
sophie-xhonneux Dec 17, 2025
01ce0ba
Fix config
sophie-xhonneux Dec 17, 2025
30a9201
Merge branch 'develop' into sophiex/dev/ssl-losses-1043
sophie-xhonneux Dec 18, 2025
1088e4b
Fix validation
sophie-xhonneux Dec 18, 2025
64a6aed
Stuck on error taking a break
sophie-xhonneux Dec 18, 2025
e4519d8
hot fix to empty tokens_c in encoder when looping over chunks
shmh40 Dec 18, 2025
670784f
Revert "hot fix to empty tokens_c in encoder when looping over chunks"
shmh40 Dec 18, 2025
0212620
hot fix for local assimilation empty tokens_c
shmh40 Dec 18, 2025
e89b781
Add class tokens being variable + Fix bugs
sophie-xhonneux Dec 19, 2025
466b24a
Push remaining changes to default config
sophie-xhonneux Dec 19, 2025
b0e4959
deepcopy configs so we do not pop weight and lose it for inference
shmh40 Dec 19, 2025
697231d
fixed bug in inference with +2 in forecast steps range
shmh40 Dec 19, 2025
e3846f6
add required import to trainer
shmh40 Dec 19, 2025
ffe2bc0
Merge branch 'develop' into sophiex/dev/ssl-losses-1043
sophie-xhonneux Dec 19, 2025
f869824
Update uv.lock
sophie-xhonneux Dec 19, 2025
414cf36
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into soph…
clessig Dec 23, 2025
7dc52a9
Linting
clessig Dec 23, 2025
8a41e68
Record fstep latent states
sophie-xhonneux Dec 23, 2025
c7fc4df
added two configs, jepa and ibot/dino. Note these configs still try t…
shmh40 Dec 23, 2025
e1f59ab
Merge branch 'develop' of github.com:ecmwf/WeatherGenerator into soph…
clessig Dec 23, 2025
7799138
Address comments from PR review
sophie-xhonneux Dec 23, 2025
108ddcb
Prepare SSL losses for logging
sophie-xhonneux Dec 23, 2025
62b660b
Merge branch 'sophiex/dev/ssl-losses-1043' of github.com:ecmwf/Weathe…
clessig Dec 23, 2025
0944593
Lint
sophie-xhonneux Dec 23, 2025
99f74b5
Address PR comments+ upstream changes
sophie-xhonneux Dec 23, 2025
1ea6ff3
Appease the hidden linter
sophie-xhonneux Dec 23, 2025
c585dc6
Rename ssl_losses_utils
sophie-xhonneux Dec 23, 2025
b4b17b6
Add the untracked file
sophie-xhonneux Dec 23, 2025
ff2c7aa
Removing spurious character
clessig Dec 23, 2025
10 changes: 10 additions & 0 deletions NOTICE
@@ -0,0 +1,10 @@
This project includes code derived from project "DINOv2: Learning Robust Visual Features without Supervision",
originally developed by Meta Platforms, Inc. and affiliates,
licensed under the Apache License, Version 2.0.

Original NOTICE from project DINOv2
--------------------------------------

N/A


90 changes: 84 additions & 6 deletions config/default_config.yml
@@ -1,5 +1,4 @@
streams_directory: "./config/streams/era5_1deg/"
# streams_directory: "./config/streams/era5_nppatms_synop/"

embed_orientation: "channels"
embed_unembed_mode: "block"
@@ -46,6 +45,8 @@ pred_adapter_kv: False
pred_self_attention: True
pred_dyadic_dims: False
pred_mlp_adaln: True
num_class_tokens: 1
num_register_tokens: 7

# number of steps offset applied to first target window; if set to zero and forecast_steps=0 then
# one is training an auto-encoder
@@ -93,7 +94,7 @@ validate_with_ema: True
ema_ramp_up_ratio: 0.09
ema_halflife_in_thousands: 1e-3


### Example validation and training config for mask token modelling in physical space
validation_config: {"losses": {LossPhysical: {weight: 1.0, loss_fcts: [['mse', 1.0]]},}}
# Student-teacher configuration (only used when training_mode == "student_teacher")
# TODO: adapt so that the masking or forecast config entry also sits here
@@ -111,6 +112,36 @@ training_config:
relationship: "complement" # "independent", "subset", "disjoint". Relationship of student views to teacher view.
num_steps_input: 1

# ### Example validation and training config for student-teacher with JEPA
# validation_config:
# losses:
# LossLatentSSLStudentTeacher: {
# "weight": 1.0,
# "JEPA": {'weight': 5, "loss_extra_args": {}, "out_dim": 2048} }
# ### Student-teacher configuration (only used when training_mode == "student_teacher")
# training_config:
# # when this is "masking", we are basically only using the model_input subconfig
# training_mode: "student_teacher" # "masking", "student_teacher", "forecast"
# target_and_aux_calc: "EMATeacher"
# losses :
# LossLatentSSLStudentTeacher: {
# "weight": 1.0,
# "JEPA": {'weight': 5, "loss_extra_args": {}, "out_dim": 2048} }
# model_input:
# - masking_strategy: "random" # "random", "healpix". Masking strategy to use for model input for masking, and local (student) views when doing student-teacher
# num_samples: 1 # if student-teacher, the number of local (student) views to generate
# masking_strategy_config : { diffusion_rn : False, rate : 0.4 }
# # relationship: "independent" #, "subset", "disjoint". Relationship of student views to teacher view.
# relationship: "subset" # "independent", "subset", "disjoint". Relationship of student views to teacher view.
# loss : jepa
# rate_sampling: False # randomly sample the rate per batch
#
# target_input:
# - masking_strategy: "healpix" # Strategy for teacher (global) view: "random", "healpix"
# masking_strategy_config : { diffusion_rn : False, rate : 0.4, hl_mask: 0 }
# num_samples: 1 # number of teacher views to generate
# rate_sampling: False # randomly sample the rate per batch


# - masking_strategy: "random"
# num_samples: 2 # if student-teacher, the number of local (student) views to generate
@@ -217,12 +248,60 @@ training_config:
# relationship: "independent"
# num_steps_input: 1

# ### Example validation and training config for student-teacher with iBOT and DINO
# validation_config:
# losses:
# LossLatentSSLStudentTeacher: {
# "weight": 1.0,
# "iBOT": {'weight': 0.75, "loss_extra_args": { "student_temp": 0.1,},"out_dim": 16384, "teacher_temp": 0.1,
# "teacher_style": "softmax_center", "center_momentum": 0.9},
# "DINO": {'weight': 0.25, "loss_extra_args": { "student_temp": 0.1,}, "out_dim": 16384, "teacher_temp": 0.1,
# "teacher_style": "softmax_center", "center_momentum": 0.9},
# }
#
#
# ### Student-teacher configuration (only used when training_mode == "student_teacher")
# training_config:
# # when this is "masking", we are basically only using the model_input subconfig
# training_mode: "student_teacher" # "masking", "student_teacher", "forecast"
# target_and_aux_calc: "EMATeacher"
# losses :
# LossLatentSSLStudentTeacher: {
# "weight": 1.0,
# "iBOT": {'weight': 0.75, "loss_extra_args": { "student_temp": 0.1,},"out_dim": 4096, # 16384,
# "teacher_temp": 0.1, "teacher_style": "softmax_center", "center_momentum": 0.9},
# "DINO": {'weight': 0.25, "loss_extra_args": { "student_temp": 0.1,}, "out_dim": 4096, # 16384,
# "teacher_temp": 0.1, "teacher_style": "softmax_center", "center_momentum": 0.9},
# }
# model_input:
# - masking_strategy: "random" # "random", "healpix". Masking strategy to use for model input for masking, and local (student) views when doing student-teacher
# num_samples: 1 # if student-teacher, the number of local (student) views to generate
# masking_strategy_config : { diffusion_rn : False, rate : 0.4 }
# relationship: "subset" # "independent", "subset", "disjoint". Relationship of student views to teacher view.
# rate_sampling: False # randomly sample the rate per batch
# loss : ibot
# - masking_strategy: "healpix"
# num_samples: 2 # if student-teacher, the number of local (student) views to generate
# masking_strategy_config : { diffusion_rn : False, rate : 0.4, hl_mask: 1 }
# relationship: "subset" # "independent", "subset", "disjoint". Relationship of student views to teacher view.
# rate_sampling: False # randomly sample the rate per batch
# loss : dino
# - masking_strategy: "healpix"
# num_samples: 1 # if student-teacher, the number of local (student) views to generate
# masking_strategy_config : { diffusion_rn : False, rate : 0.4, hl_mask: 1 }
# relationship: "identity" # "independent", "subset", "disjoint". Relationship of student views to teacher view.
# rate_sampling: False # randomly sample the rate per batch
# loss : dino
#
# target_input:
# - masking_strategy: "healpix" # Strategy for teacher (global) view: "random", "healpix"
# masking_strategy_config : { diffusion_rn : False, rate : 0.4, hl_mask: 0 }
# num_samples: 2 # number of teacher views to generate
# rate_sampling: False # randomly sample the rate per batch


num_register_tokens: 0

num_mini_epochs: 32
samples_per_mini_epoch: 4096
samples_per_mini_epoch: 4096 # 250000 for student-teacher because validation is meaningless
samples_per_validation: 512

shuffle: True
@@ -271,7 +350,6 @@ train_log_freq:
metrics: 20
checkpoint: 250


# Tags for experiment tracking
# These tags will be logged in MLFlow along with completed runs for train, eval, val
# The tags are free-form, with the following rules:
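The student-teacher examples above pair the EMATeacher target-and-aux calculator with latent losses such as JEPA. For orientation, a minimal hypothetical sketch of one such step, assuming names like student, teacher and predictor that are not the repository's actual interfaces:

import torch
import torch.nn.functional as F

@torch.no_grad()
def update_ema_teacher(teacher, student, beta=0.999):
    # Teacher weights track an exponential moving average of the student;
    # the teacher is typically initialized as a deep copy of the student.
    for p_t, p_s in zip(teacher.parameters(), student.parameters()):
        p_t.mul_(beta).add_(p_s, alpha=1.0 - beta)

def jepa_step(student, teacher, predictor, student_view, teacher_view):
    # The teacher encodes the target view without gradients; the student
    # encodes a masked/local view and predicts the teacher's latent state.
    with torch.no_grad():
        target_latent = teacher(teacher_view)
    pred_latent = predictor(student(student_view))
    return F.smooth_l1_loss(pred_latent, target_latent)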
216 changes: 216 additions & 0 deletions config/default_config_dino.yml
@@ -0,0 +1,216 @@
streams_directory: "./config/streams/era5_1deg/"

embed_orientation: "channels"
embed_unembed_mode: "block"
embed_dropout_rate: 0.1

target_cell_local_prediction: True

ae_local_dim_embed: 1024
ae_local_num_blocks: 2
ae_local_num_heads: 16
ae_local_dropout_rate: 0.1
ae_local_with_qk_lnorm: True

ae_local_num_queries: 1
ae_local_queries_per_cell: False
ae_adapter_num_heads: 16
ae_adapter_embed: 128
ae_adapter_with_qk_lnorm: True
ae_adapter_with_residual: True
ae_adapter_dropout_rate: 0.1

ae_global_dim_embed: 2048
ae_global_num_blocks: 8
ae_global_num_heads: 32
ae_global_dropout_rate: 0.1
ae_global_with_qk_lnorm: True
# TODO: switching to < 1 triggers triton-related issues.
# See https://github.com/ecmwf/WeatherGenerator/issues/1050
ae_global_att_dense_rate: 1.0
ae_global_block_factor: 64
ae_global_mlp_hidden_factor: 2
ae_global_trailing_layer_norm: False

ae_aggregation_num_blocks: 2
ae_aggregation_num_heads: 32
ae_aggregation_dropout_rate: 0.1
ae_aggregation_with_qk_lnorm: True
ae_aggregation_att_dense_rate: 1.0
ae_aggregation_block_factor: 64
ae_aggregation_mlp_hidden_factor: 2

decoder_type: PerceiverIOCoordConditioning # CrossAttentionAdaNormConditioning
pred_adapter_kv: False
pred_self_attention: True
pred_dyadic_dims: False
pred_mlp_adaln: True
num_class_tokens: 1
num_register_tokens: 7

# number of steps offset applied to first target window; if set to zero and forecast_steps=0 then
# one is training an auto-encoder
forecast_offset : 0
forecast_delta: 00:00:00
forecast_steps: 0
forecast_policy: null
forecast_freeze_model: False
forecast_att_dense_rate: 1.0
forecast_with_step_conditioning: True # False
fe_num_blocks: 0
fe_num_heads: 16
fe_dropout_rate: 0.1
fe_with_qk_lnorm: True
fe_layer_norm_after_blocks: [] # Index starts at 0. Thus, [3] adds a LayerNorm after the fourth layer
fe_impute_latent_noise_std: 0.0 # 1e-4

healpix_level: 5

with_mixed_precision: True
with_flash_attention: True
compile_model: False
with_fsdp: True
attention_dtype: bf16
mixed_precision_dtype: bf16
mlp_norm_eps: 1e-5
norm_eps: 1e-4

latent_noise_kl_weight: 0.0 # 1e-5
latent_noise_gamma: 2.0
latent_noise_saturate_encodings: 5
latent_noise_use_additive_noise: False
latent_noise_deterministic_latents: True

batch_size_per_gpu: 1
batch_size_validation_per_gpu: 1

# a regex that needs to fully match the name of the modules you want to freeze
# e.g. ".*ERA5" will match any module whose name ends in ERA5
# encoders and decoders that exist per stream have the stream name attached at the end
freeze_modules: ""
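# Illustrative matching logic (a hypothetical sketch, not the repository's exact code):
#   pattern = re.compile(cf.freeze_modules)
#   for name, module in model.named_modules():
#       if pattern.fullmatch(name):   # full match, hence patterns like ".*ERA5"
#           module.requires_grad_(False)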

# whether to track the exponential moving average of weights for validation
validate_with_ema: True
ema_ramp_up_ratio: 0.09
ema_halflife_in_thousands: 1e-3
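# Rough intuition for the EMA decay (a hedged sketch; the trainer may differ):
#   halflife_steps = ema_halflife_in_thousands * 1000
#   beta = 0.5 ** (1.0 / halflife_steps)             # per-step decay factor
#   w_ema = beta * w_ema + (1.0 - beta) * w_model    # ramped up early via ema_ramp_up_ratio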

### Example validation and training config for student-teacher with iBOT and DINO
validation_config:
losses:
LossLatentSSLStudentTeacher: {
"weight": 1.0,
"iBOT": {'weight': 0.75, "loss_extra_args": { "student_temp": 0.1,},"out_dim": 16384, "teacher_temp": 0.1,
"teacher_style": "softmax_center", "center_momentum": 0.9},
"DINO": {'weight': 0.25, "loss_extra_args": { "student_temp": 0.1,}, "out_dim": 16384, "teacher_temp": 0.1,
"teacher_style": "softmax_center", "center_momentum": 0.9},
}

### Student-teacher configuration (only used when training_mode == "student_teacher")
training_config:
# when this is "masking", we are basically only using the model_input subconfig
training_mode: "student_teacher" # "masking", "student_teacher", "forecast"
target_and_aux_calc: "EMATeacher"
losses :
LossLatentSSLStudentTeacher: {
"weight": 1.0,
"iBOT": {'weight': 0.75, "loss_extra_args": { "student_temp": 0.1,},"out_dim": 4096, # 16384,
"teacher_temp": 0.1, "teacher_style": "softmax_center", "center_momentum": 0.9},
"DINO": {'weight': 0.25, "loss_extra_args": { "student_temp": 0.1,}, "out_dim": 4096, # 16384,
"teacher_temp": 0.1, "teacher_style": "softmax_center", "center_momentum": 0.9},
}
model_input:
- masking_strategy: "random" # "random", "healpix". Masking strategy to use for model input for masking, and local (student) views when doing student-teacher
num_samples: 1 # if student-teacher, the number of local (student) views to generate
masking_strategy_config : { diffusion_rn : False, rate : 0.4 }
relationship: "subset" # "independent", "subset", "disjoint". Relationship of student views to teacher view.
rate_sampling: False # randomly sample the rate per batch
loss : ibot
- masking_strategy: "healpix"
num_samples: 2 # if student-teacher, the number of local (student) views to generate
masking_strategy_config : { diffusion_rn : False, rate : 0.4, hl_mask: 1 }
relationship: "subset" # "independent", "subset", "disjoint". Relationship of student views to teacher view.
rate_sampling: False # randomly sample the rate per batch
loss : dino
- masking_strategy: "healpix"
num_samples: 1 # if student-teacher, the number of local (student) views to generate
masking_strategy_config : { diffusion_rn : False, rate : 0.4, hl_mask: 1 }
relationship: "identity" # "independent", "subset", "disjoint". Relationship of student views to teacher view.
rate_sampling: False # randomly sample the rate per batch
loss : dino

target_input:
- masking_strategy: "healpix" # Strategy for teacher (global) view: "random", "healpix"
masking_strategy_config : { diffusion_rn : False, rate : 0.4, hl_mask: 0 }
num_samples: 2 # number of teacher views to generate
rate_sampling: False # randomly sample the rate per batch
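# Reading the views above (an interpretation, not normative): the teacher encodes
# two global healpix views; the randomly masked student view is scored with iBOT
# against the teacher, while the healpix student views ("subset" crops plus one
# "identity" view) are scored with DINO. See the loss sketch after this config.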

num_mini_epochs: 32
samples_per_mini_epoch: 4096 # 250000 for student-teacher because validation is meaningless
samples_per_validation: 512

shuffle: True

lr_scaling_policy: "sqrt"
lr_start: 1e-6
lr_max: 5e-5
lr_final_decay: 1e-6
lr_final: 0.0
lr_steps_warmup: 512
lr_steps_cooldown: 512
lr_policy_warmup: "cosine"
lr_policy_decay: "constant"
lr_policy_cooldown: "linear"
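# Assumed schedule semantics (check the trainer for the authoritative logic):
#   warmup:   cosine ramp from lr_start to lr_max over lr_steps_warmup steps
#   decay:    constant at lr_max for the remaining steps
#   cooldown: linear ramp down to lr_final_decay over lr_steps_cooldown steps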

grad_clip: 1.0
weight_decay: 0.1
norm_type: "LayerNorm"
nn_module: "te"
log_grad_norms: False

start_date: 1979-01-01T00:00
end_date: 2022-12-31T00:00
start_date_val: 2023-10-01T00:00
end_date_val: 2023-12-31T00:00
time_window_step: 06:00:00
time_window_len: 06:00:00
input_window_steps: 1

val_initial: False #True

loader_num_workers: 12
log_validation: 0
streams_output: ["ERA5"]

istep: 0
run_history: []

desc: ""
data_loader_rng_seed: ???
run_id: ???

# The period to log in the training loop (in number of batch steps)
train_log_freq:
terminal: 10
metrics: 20
checkpoint: 250

# Tags for experiment tracking
# These tags will be logged in MLFlow along with completed runs for train, eval, val
# The tags are free-form, with the following rules:
# - tags should be primitive types (strings, numbers, booleans). NO lists or dictionaries
# - tags should not duplicate existing config entries.
# - try to reuse existing tags where possible. MLFlow does not like having too many unique tags
# - do not use long strings in values (less than 20 characters is a good rule of thumb, we may enforce this in the future)
wgtags:
# The name of the organization of the person running the experiment.
# This may be autofilled in the future. Expected values are lowercase strings of
# the organizations codenames in https://confluence.ecmwf.int/display/MAEL/Staff+Contact+List
# e.g. "ecmwf", "cmcc", "metnor", "jsc", "escience"
org: None
# The name of the experiment. This is a distinctive codename for the experiment campaign being run.
# This is expected to be the primary tag for comparing experiments in MLFlow.
# Expected values are lowercase strings with no spaces, just underscores:
# Examples: "rollout_ablation_grid"
exp: None
# *** Experiment-specific tags ***
grid: None
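The "softmax_center" teacher style used above follows the DINO centering recipe. A minimal sketch, assuming a DINO-style module (hypothetical code, not the loss implementation this PR adds):

import torch
import torch.nn.functional as F

class CenteredDistillationLoss(torch.nn.Module):
    # Hypothetical sketch of teacher_style "softmax_center": center and sharpen
    # the teacher logits, then distill into the student with cross-entropy.
    def __init__(self, out_dim=16384, teacher_temp=0.1, student_temp=0.1,
                 center_momentum=0.9):
        super().__init__()
        self.teacher_temp = teacher_temp
        self.student_temp = student_temp
        self.center_momentum = center_momentum
        self.register_buffer("center", torch.zeros(1, out_dim))

    def forward(self, student_logits, teacher_logits):
        # Teacher targets: centered, sharpened, and detached from the graph.
        with torch.no_grad():
            targets = F.softmax(
                (teacher_logits - self.center) / self.teacher_temp, dim=-1
            )
        log_probs = F.log_softmax(student_logits / self.student_temp, dim=-1)
        loss = -(targets * log_probs).sum(dim=-1).mean()
        with torch.no_grad():
            # The center tracks an EMA of the teacher logits to avoid collapse.
            batch_mean = teacher_logits.mean(dim=0, keepdim=True)
            self.center.mul_(self.center_momentum).add_(
                batch_mean, alpha=1.0 - self.center_momentum
            )
        return loss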