ecmwf · clessig · Nov 12, 2025 · Nov 13, 2025 · Nov 13, 2025 · Nov 13, 2025
diff --git a/config/default_config.yml b/config/default_config.yml
@@ -1,9 +1,6 @@
 streams_directory: "./config/streams/era5_1deg/"
 
 embed_orientation: "channels"
-embed_local_coords: True
-embed_centroids_local_coords: False
-embed_size_centroids: 0
 embed_unembed_mode: "block"
 embed_dropout_rate: 0.1
 
@@ -42,7 +39,7 @@ pred_mlp_adaln: True
 
 # number of steps offset applied to first target window; if set to zero and forecast_steps=0 then
 # one is training an auto-encoder
-forecast_offset : 0
+forecast_offset : 1
 forecast_delta_hrs: 0
 forecast_steps: 0
 forecast_policy: null
@@ -91,18 +88,16 @@ validate_with_ema: True
 ema_ramp_up_ratio: 0.09
 ema_halflife_in_thousands: 1e-3
 
-# training mode: "forecast" or "masking" (masked token modeling)
-# for "masking" to train with auto-encoder mode, forecast_offset should be 0
-training_mode: "masking"
+# masking strategy; currently supported: "forecast", "random", "block", "healpix", "channel", "causal" and "combination"
+masking_strategy: "forecast"
+
+# masking strategy; currently supported: "forecast", "random", "block", "healpix", "channel", "causal" and "combination"
+masking_strategy: "forecast"
 # masking rate when training mode is "masking"; ignored in foreacast mode
 masking_rate: 0.6
 # sample the masking rate (with normal distribution centered at masking_rate)
 # note that a sampled masking rate leads to varying requirements
 masking_rate_sampling: True
-# sample a subset of all target points, useful e.g. to reduce memory requirements (also can specify per-stream)
-sampling_rate_target: 1.0
-# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "causal" and "combination"
-masking_strategy: "random"
 # masking_strategy_config is a dictionary of additional parameters for the masking strategy
 # required for "healpix" and "channel" masking strategies
 # "healpix": requires healpix mask level to be specified with `hl_mask`
@@ -135,12 +130,13 @@ norm_type: "LayerNorm"
 nn_module: "te"
 log_grad_norms: False
 
-start_date: 197901010000
+# start_date: 197901010000
+start_date: 201401010000
 end_date: 202012310000
 start_date_val: 202101010000
 end_date_val: 202201010000
-len_hrs: 6
-step_hrs: 6
+len_hrs: 3
+step_hrs: 3
 input_window_steps: 1
 
 val_initial: False

diff --git a/src/weathergen/datasets/masking.py b/src/weathergen/datasets/masking.py
@@ -54,10 +54,6 @@ def __init__(self, cf: Config):
         # number of healpix cells
         self.healpix_num_cells = 12 * (4**self.healpix_level_data)
 
-        # Initialize the mask, set to None initially,
-        # until it is generated in mask_source.
-        self.perm_sel: list[np.typing.NDArray] = None
-
         # Per-batch strategy tracking
         self.same_strategy_per_batch = self.masking_strategy_config.get(
             "same_strategy_per_batch", False
@@ -139,6 +135,88 @@ def _select_strategy(self):
             # Non-combination strategy, return as is
             return self.masking_strategy
 
+    def mask_source_idxs(
+        self,
+        stream_info,
+        idxs_cells,
+        idxs_cells_lens,
+        rdata,
+    ) -> (torch.Tensor, torch.Tensor):
+        """
+
+        Return:
+            torch.Tensor[bool] of length num_tokens that determines masking for each token
+        """
+
+        self.mask_tokens, self.mask_channels = None, None
+
+        num_tokens = torch.tensor([len(t) for t in idxs_cells_lens]).sum().item()
+
+        # If there are no tokens, return empty lists.
+        if num_tokens == 0:
+            return (self.mask_tokens, self.mask_channels)
+
+        # clean strategy selection
+        self.current_strategy = self._select_strategy()
+
+        # Set the masking rate.
+        rate = self._get_sampling_rate()
+
+        if self.current_strategy == "random":
+            self.mask_tokens = self.rng.uniform(0, 1, num_tokens) < rate
+
+        elif self.current_strategy == "forecast":
+            self.mask_tokens = np.ones(num_tokens, dtype=np.bool)
+
+        elif self.current_strategy == "healpix":
+            # TODO: currently only for fixed level
+            num_cells = len(idxs_cells_lens)
+            mask_cells = self.rng.uniform(0, 1, num_cells) < rate
+            # translate cell mask to token mask, replicating using number of tokens per cell
+            self.mask_tokens = [
+                (torch.ones(2, dtype=torch.bool) * (1 if m else 0)).to(torch.bool)
+                for idxs_cell, m in zip(idxs_cells_lens, mask_cells, strict=False)
+            ]
+        elif self.current_strategy == "cropping" or self.current_strategy == "causal":
+            pass
+
+        else:
+            assert False, f"Unsupported masking strategy: {self.current_strategy}."
+
+        return (self.mask_tokens, self.mask_channels)
+
+    def mask_targets_idxs(
+        self,
+        stream_info,
+        idxs_cells,
+        idxs_cells_lens,
+        rdata,
+    ) -> (torch.Tensor, torch.Tensor):
+        # mask_source_idxs is
+        assert (self.mask_tokens is not None) or (self.mask_tokens is not None)
+        idxs_ord_inv = torch.tensor([], dtype=torch.int64)
+
+        # TODO: better handling of if statement
+        if self.current_strategy == "forecast":
+            num_tokens = torch.tensor([len(t) for t in idxs_cells_lens]).sum().item()
+            self.mask_tokens = np.ones(num_tokens, dtype=np.bool)
+
+            # inverse map for reordering to output data points in same order as input
+            idxs_ord = torch.cat([t for tt in idxs_cells for t in tt])
+            idxs_ord_inv = torch.argsort(idxs_ord)
+
+        else:
+            # masking strategies: target is complement of source
+            # TODO: ensure/enforce that forecast_offset==0
+            if self.mask_tokens is not None:
+                self.mask_tokens = ~self.mask_tokens
+            if self.mask_channels is not None:
+                self.mask_channels = ~self.mask_channels
+
+        # TODO: self.mask_tokens seems brittle in terms of naming
+
+        return (self.mask_tokens, self.mask_channels, idxs_ord_inv)
+
     def mask_source(
         self,
         tokenized_data: list[torch.Tensor],

diff --git a/src/weathergen/datasets/multi_stream_data_sampler.py b/src/weathergen/datasets/multi_stream_data_sampler.py
@@ -26,7 +26,6 @@
 from weathergen.datasets.icon_dataset import IconDataset
 from weathergen.datasets.masking import Masker
 from weathergen.datasets.stream_data import StreamData, spoof
-from weathergen.datasets.tokenizer_forecast import TokenizerForecast
 from weathergen.datasets.tokenizer_masking import TokenizerMasking
 from weathergen.datasets.utils import (
     compute_idxs_predict,
@@ -207,8 +206,6 @@ def __init__(
         self.shuffle = shuffle
         # TODO: remove options that are no longer supported
         self.input_window_steps = cf.input_window_steps
-        self.embed_local_coords = cf.embed_local_coords
-        self.embed_centroids_local_coords = cf.embed_centroids_local_coords
         self.sampling_rate_target = cf.sampling_rate_target
 
         self.batch_size = batch_size
@@ -224,17 +221,8 @@ def __init__(
         self.healpix_level: int = cf.healpix_level
         self.num_healpix_cells: int = 12 * 4**self.healpix_level
 
-        if cf.training_mode == "forecast":
-            self.tokenizer = TokenizerForecast(cf.healpix_level)
-        elif cf.training_mode == "masking":
-            masker = Masker(cf)
-            self.tokenizer = TokenizerMasking(cf.healpix_level, masker)
-            assert self.forecast_offset == 0, "masked token modeling requires auto-encoder training"
-            msg = "masked token modeling does not support self.input_window_steps > 1; "
-            msg += "increase window length"
-            assert self.input_window_steps == 1, msg
-        else:
-            assert False, f"Unsupported training mode: {cf.training_mode}"
+        masker = Masker(cf)
+        self.tokenizer = TokenizerMasking(cf.healpix_level, masker)
 
         self.epoch = 0
 
@@ -362,6 +350,8 @@ def __iter__(self):
 
                 streams_data: list[StreamData] = []
 
+                # tokenizer.generate_masks_for_sample()
+
                 # for all streams
                 for stream_info, stream_ds in zip(self.streams, self.streams_datasets, strict=True):
                     stream_data = StreamData(
@@ -383,15 +373,14 @@ def __iter__(self):
                         stream_data.source_is_spoof = True
 
                     # preprocess data for model input
-                    (ss_cells, ss_lens, ss_centroids) = self.tokenizer.batchify_source(
+                    (ss_cells, ss_lens) = self.tokenizer.batchify_source(
                         stream_info,
                         readerdata_to_torch(rdata),
                         (time_win_source.start, time_win_source.end),
-                        stream_ds[0].normalize_coords,
                     )
 
-                    # TODO: rdata only be collected in validation mode
-                    stream_data.add_source(rdata, ss_lens, ss_cells, ss_centroids)
+                    # collect data for stream
+                    stream_data.add_source(rdata, ss_lens, ss_cells)
 
                     # target
 
@@ -419,14 +408,14 @@ def __iter__(self):
                             stream_data.target_is_spoof = True
 
                         # preprocess data for model input
-                        (tt_cells, tc, tt_c, tt_t) = self.tokenizer.batchify_target(
+                        (tt_cells, tt_t, tt_c, tc, tc_l, idxs_inv) = self.tokenizer.batchify_target(
                             stream_info,
                             self.sampling_rate_target,
                             readerdata_to_torch(rdata),
                             (time_win_target.start, time_win_target.end),
                         )
 
-                        stream_data.add_target(fstep, tt_cells, tc, tt_c, tt_t)
+                        stream_data.add_target(fstep, tt_cells, tc, tc_l, tt_c, tt_t, idxs_inv)
 
                     # merge inputs for sources and targets for current stream
                     streams_data += [stream_data]

diff --git a/src/weathergen/datasets/stream_data.py b/src/weathergen/datasets/stream_data.py
@@ -57,12 +57,12 @@ def __init__(self, idx: int, forecast_steps: int, healpix_cells: int) -> None:
         self.target_tokens_lens = [
             torch.tensor([0 for _ in range(self.healpix_cells)]) for _ in range(forecast_steps + 1)
         ]
+        self.idxs_inv = [torch.tensor([], dtype=torch.int64) for _ in range(forecast_steps + 1)]
 
         # source tokens per cell
         self.source_tokens_cells = []
         # length of source tokens per cell (without padding)
         self.source_tokens_lens = []
-        self.source_centroids = []
         # unprocessed source (for logging)
         self.source_raw = []
         # auxiliary data for scatter operation that changes from stream-centric to cell-centric
@@ -85,12 +85,10 @@ def to_device(self, device: str) -> None:
         """
 
         self.source_tokens_cells = self.source_tokens_cells.to(device, non_blocking=True)
-        self.source_centroids = self.source_centroids.to(device, non_blocking=True)
         self.source_tokens_lens = self.source_tokens_lens.to(device, non_blocking=True)
 
         self.target_coords = [t.to(device, non_blocking=True) for t in self.target_coords]
         self.target_tokens = [t.to(device, non_blocking=True) for t in self.target_tokens]
-        self.target_tokens_lens = [t.to(device, non_blocking=True) for t in self.target_tokens_lens]
 
         self.source_idxs_embed = self.source_idxs_embed.to(device, non_blocking=True)
         self.source_idxs_embed_pe = self.source_idxs_embed_pe.to(device, non_blocking=True)
@@ -114,7 +112,6 @@ def add_empty_source(self, source: IOReaderData) -> None:
         self.source_raw += [source]
         self.source_tokens_lens += [torch.ones([self.healpix_cells], dtype=torch.int32)]
         self.source_tokens_cells += [torch.tensor([])]
-        self.source_centroids += [torch.tensor([])]
 
     def add_empty_target(self, fstep: int) -> None:
         """
@@ -131,17 +128,14 @@ def add_empty_target(self, fstep: int) -> None:
         """
 
         self.target_tokens[fstep] += [torch.tensor([], dtype=torch.int32)]
-        self.target_tokens_lens[fstep] += [torch.zeros([self.healpix_cells], dtype=torch.int32)]
         self.target_coords[fstep] += [torch.zeros((0, 105)) for _ in range(self.healpix_cells)]
         self.target_coords_lens[fstep] += [torch.zeros([self.healpix_cells], dtype=torch.int32)]
         self.target_coords_raw[fstep] += [torch.tensor([]) for _ in range(self.healpix_cells)]
         self.target_times_raw[fstep] += [
             np.array([], dtype="datetime64[ns]") for _ in range(self.healpix_cells)
         ]
 
-    def add_source(
-        self, ss_raw: IOReaderData, ss_lens: torch.tensor, ss_cells: list, ss_centroids: list
-    ) -> None:
+    def add_source(self, ss_raw: IOReaderData, ss_lens: torch.tensor, ss_cells: list) -> None:
         """
         Add data for source for one input.
 
@@ -151,8 +145,6 @@ def add_source(
         ss_lens : torch.tensor( number of healpix cells )
         ss_cells : list( number of healpix cells )
             [ torch.tensor( tokens per cell, token size, number of channels) ]
-        ss_centroids : list(number of healpix cells )
-            [ torch.tensor( for source , 5) ]
 
         Returns
         -------
@@ -161,8 +153,7 @@ def add_source(
 
         self.source_raw = ss_raw
         self.source_tokens_lens = ss_lens
-        self.source_tokens_cells = torch.cat(ss_cells)
-        self.source_centroids = torch.cat(ss_centroids)
+        self.source_tokens_cells = torch.stack(ss_cells)
 
         idx = torch.isnan(self.source_tokens_cells)
         self.source_tokens_cells[idx] = self.mask_value
@@ -172,8 +163,10 @@ def add_target(
         fstep: int,
         targets: list,
         target_coords: torch.tensor,
+        target_coords_per_cell: torch.tensor,
         target_coords_raw: torch.tensor,
         times_raw: torch.tensor,
+        idxs_inv: torch.tensor,
     ) -> None:
         """
         Add data for target for one input.
@@ -193,26 +186,20 @@ def add_target(
         target_times : list( number of healpix cells)
             [ torch.tensor( points per cell) ]
               absolute target times
+        idxs_inv:
+            Indices to reorder targets back to order in input
 
         Returns
         -------
         None
         """
 
-        self.target_tokens[fstep] = torch.cat(targets)
-        self.target_coords[fstep] = torch.cat(target_coords)
-        self.target_times_raw[fstep] = np.concatenate(times_raw)
-        self.target_coords_raw[fstep] = torch.cat(target_coords_raw)
-
-        tc = target_coords
-        self.target_coords_lens[fstep] = torch.tensor(
-            [len(f) for f in tc] if len(tc) > 1 else self.target_coords_lens[fstep],
-            dtype=torch.int,
-        )
-        self.target_tokens_lens[fstep] = torch.tensor(
-            [len(f) for f in targets] if len(targets) > 1 else self.target_tokens_lens[fstep],
-            dtype=torch.int,
-        )
+        self.target_tokens[fstep] = targets
+        self.target_coords[fstep] = target_coords
+        self.target_coords_lens[fstep] = target_coords_per_cell
+        self.target_times_raw[fstep] = times_raw
+        self.target_coords_raw[fstep] = target_coords_raw
+        self.idxs_inv[fstep] = idxs_inv
 
     def target_empty(self) -> bool:
         """
@@ -229,7 +216,7 @@ def target_empty(self) -> bool:
         """
 
         # cat over forecast steps
-        return torch.cat(self.target_tokens_lens).sum() == 0
+        return torch.cat(self.target_coords_lens).sum() == 0
 
     def source_empty(self) -> bool:
         """