|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Merge ETOPO NetCDF Output Files |
| 4 | +
|
| 5 | +This script merges all chunked NetCDF outputs from the ETOPO processing into a single file, |
| 6 | +ensuring that: |
| 7 | +1. All cell IDs (groups) are represented in the merged file |
| 8 | +2. Each cell has an 'is_land' attribute |
| 9 | +3. Missing cells are filled with ocean placeholders (is_land=0) |
| 10 | +""" |
| 11 | + |
| 12 | +import netCDF4 |
| 13 | +import numpy as np |
| 14 | +from pathlib import Path |
| 15 | +from tqdm import tqdm |
| 16 | +import sys |
| 17 | + |
def get_expected_cell_range(files):
    """
    Determine the expected cell-ID range from chunk filenames.

    Each chunk file's stem is assumed to end in a ``_``-separated
    ``<start>-<end>`` token, e.g. ``icon_etopo_global_cells_00000-00099.nc``.

    Parameters
    ----------
    files : list of Path
        List of chunked NetCDF files.

    Returns
    -------
    tuple
        (min_cell, max_cell) expected in the dataset.

    Raises
    ------
    ValueError
        If *files* is empty (no range can be determined), or a filename
        does not end in a parsable ``start-end`` token.
    """
    if not files:
        # The original inf-tracking version raised a confusing
        # OverflowError on int(float('inf')) here; fail explicitly.
        raise ValueError("cannot determine cell range from an empty file list")

    starts = []
    ends = []
    for f in files:
        # Last '_'-separated token of the stem encodes the chunk range,
        # e.g. '00000-00099'.
        range_part = f.stem.split('_')[-1]
        start, end = map(int, range_part.split('-'))
        starts.append(start)
        ends.append(end)

    return min(starts), max(ends)
| 43 | + |
| 44 | + |
def collect_all_cells(files):
    """
    Collect all cell data from chunked NetCDF files.

    If the same cell ID appears in more than one file, the entry from the
    later file in *files* wins (last-writer semantics of the dict).

    Parameters
    ----------
    files : list of Path
        List of NetCDF files to merge.

    Returns
    -------
    dict
        Dictionary mapping cell_id (int) to a cell-data dict containing:
        - is_land: int (0 or 1)
        - clat: float (radians)
        - clon: float (radians)
        - analysis: dict of arrays (only for land cells)
    """
    cell_data = {}

    print("Reading cell data from NetCDF files...")
    for nc_file in tqdm(files, desc="Processing files"):
        try:
            # Context manager guarantees the dataset handle is closed even
            # when a malformed group raises mid-loop (the previous explicit
            # close() was skipped on error and leaked the handle).
            with netCDF4.Dataset(nc_file, 'r') as nc:
                # Each group in the file is one cell, named by its cell ID.
                for group_name in nc.groups.keys():
                    cell_id = int(group_name)
                    group = nc.groups[group_name]

                    # Scalar variables always present on a cell group.
                    is_land = int(group.variables['is_land'][:])
                    clat = float(group.variables['clat'][:])
                    clon = float(group.variables['clon'][:])

                    cell_info = {
                        'is_land': is_land,
                        'clat': clat,
                        'clon': clon,
                    }

                    # Land cells carry additional per-variable analysis arrays.
                    if is_land == 1:
                        cell_info['analysis'] = {
                            var_name: group.variables[var_name][:]
                            for var_name in group.variables.keys()
                            if var_name not in ('is_land', 'clat', 'clon')
                        }

                    cell_data[cell_id] = cell_info

        except Exception as e:
            # Best-effort merge: report the bad file and keep going.
            print(f"Error reading {nc_file.name}: {e}")
            continue

    return cell_data
| 102 | + |
| 103 | + |
def _write_analysis_variables(grp, analysis):
    """Write per-variable analysis arrays into group *grp*.

    Supports 0-D (scalar), 1-D, and 2-D arrays; per-variable dimensions are
    created on demand. Higher-rank arrays are skipped with a warning.
    """
    for var_name, var_data in analysis.items():
        if var_data.ndim == 0:
            # Scalar variable (0-dimensional): no dimension needed.
            var = grp.createVariable(var_name, var_data.dtype)
            var[:] = var_data
        elif var_data.ndim == 1:
            dim_name = f"dim_{var_name}"
            grp.createDimension(dim_name, var_data.shape[0])
            var = grp.createVariable(var_name, var_data.dtype, (dim_name,))
            var[:] = var_data
        elif var_data.ndim == 2:
            dim0_name = f"dim0_{var_name}"
            dim1_name = f"dim1_{var_name}"
            grp.createDimension(dim0_name, var_data.shape[0])
            grp.createDimension(dim1_name, var_data.shape[1])
            var = grp.createVariable(var_name, var_data.dtype, (dim0_name, dim1_name))
            var[:] = var_data
        else:
            print(f"Warning: Skipping variable {var_name} with unsupported dimensions: {var_data.ndim}")


def create_merged_netcdf(cell_data, output_path, expected_min, expected_max):
    """
    Create the merged NetCDF file with one group per cell.

    Every cell ID in [expected_min, expected_max] gets a group; IDs absent
    from *cell_data* are filled with an ocean placeholder (is_land=0,
    clat=clon=0.0).

    Parameters
    ----------
    cell_data : dict
        Dictionary of cell data from collect_all_cells().
    output_path : Path
        Output file path.
    expected_min : int
        Expected minimum cell ID.
    expected_max : int
        Expected maximum cell ID.
    """
    print(f"\nCreating merged NetCDF file: {output_path}")

    nc_out = netCDF4.Dataset(output_path, 'w', format='NETCDF4')
    try:
        # Global attributes describing the merged product.
        nc_out.title = "ICON ETOPO Global Topography - Merged Output"
        nc_out.description = "Merged spectral analysis of ETOPO topography on ICON grid"
        nc_out.source = "pycsa spectral approximation framework"

        # Statistics counters
        land_cells = 0
        ocean_cells = 0
        missing_cells = 0

        print(f"Writing cells {expected_min} to {expected_max}...")

        # Iterate through all expected cells so the output is gap-free.
        for cell_id in tqdm(range(expected_min, expected_max + 1), desc="Writing cells"):
            grp = nc_out.createGroup(str(cell_id))

            if cell_id in cell_data:
                cell = cell_data[cell_id]
                is_land = cell['is_land']
                clat = cell['clat']
                clon = cell['clon']

                if is_land:
                    land_cells += 1
                else:
                    ocean_cells += 1
            else:
                # Missing cell - create ocean placeholder.
                print(f"Warning: Cell {cell_id} missing, creating ocean placeholder")
                is_land = 0
                clat = 0.0  # Placeholder
                clon = 0.0  # Placeholder
                missing_cells += 1
                ocean_cells += 1

            # Basic cell attributes (always present).
            var_is_land = grp.createVariable('is_land', 'i4')
            var_is_land[:] = is_land

            var_clat = grp.createVariable('clat', 'f8')
            var_clat[:] = clat
            var_clat.units = "radians"
            var_clat.long_name = "cell center latitude"

            var_clon = grp.createVariable('clon', 'f8')
            var_clon[:] = clon
            var_clon.units = "radians"
            var_clon.long_name = "cell center longitude"

            # Analysis arrays only exist for land cells we actually read.
            if is_land and cell_id in cell_data:
                _write_analysis_variables(grp, cell_data[cell_id]['analysis'])
    finally:
        # Close in all cases so a mid-write failure does not leak the handle
        # (the original only closed on the success path).
        nc_out.close()

    # Print statistics
    total = land_cells + ocean_cells
    print("\n" + "=" * 80)
    print("MERGE COMPLETE")
    print("=" * 80)
    print(f"Output file: {output_path}")
    print(f"Total cells: {expected_max - expected_min + 1}")
    print(f"  Land cells (is_land=1): {land_cells}")
    print(f"  Ocean cells (is_land=0): {ocean_cells}")
    if missing_cells > 0:
        print(f"  Missing cells (filled with ocean): {missing_cells}")
    # Guard the divisions; the old one-liner printed a stray blank line
    # whenever ocean_cells was 0.
    if ocean_cells > 0:
        print(f"\nLand/Ocean ratio: {land_cells}/{ocean_cells} = {land_cells/ocean_cells:.3f}")
    if total > 0:
        print(f"Land percentage: {100*land_cells/total:.2f}%")
    print("=" * 80)
| 216 | + |
| 217 | + |
def verify_merged_file(output_path, expected_min, expected_max):
    """
    Verify the merged NetCDF file has all cells with an is_land variable.

    Parameters
    ----------
    output_path : Path
        Path to merged NetCDF file.
    expected_min : int
        Expected minimum cell ID.
    expected_max : int
        Expected maximum cell ID.

    Returns
    -------
    bool
        True if verification passes.
    """
    print(f"\nVerifying merged file: {output_path}")

    nc = netCDF4.Dataset(output_path, 'r')
    try:
        expected_cells = set(range(expected_min, expected_max + 1))
        found_cells = set(int(g) for g in nc.groups.keys())

        # All expected cells must be present.
        missing = expected_cells - found_cells
        if missing:
            print(f"ERROR: Missing cells: {sorted(missing)[:10]}... ({len(missing)} total)")
            return False

        # Extra cells are tolerated but reported.
        extra = found_cells - expected_cells
        if extra:
            print(f"Warning: Extra cells: {sorted(extra)[:10]}... ({len(extra)} total)")

        # Every group must carry is_land; tally land vs ocean while checking.
        cells_without_is_land = []
        land_count = 0
        ocean_count = 0
        for group_name in nc.groups.keys():
            group = nc.groups[group_name]
            if 'is_land' not in group.variables:
                cells_without_is_land.append(group_name)
            elif int(group.variables['is_land'][:]) == 1:
                land_count += 1
            else:
                ocean_count += 1

        if cells_without_is_land:
            print(f"ERROR: Cells without is_land attribute: {cells_without_is_land[:10]}... ({len(cells_without_is_land)} total)")
            return False
    finally:
        # Single close point; the original repeated close() on every exit
        # path and leaked the handle if an exception escaped mid-check.
        nc.close()

    print("✓ Verification PASSED")
    print(f"  All {len(expected_cells)} cells present")
    print(f"  All cells have 'is_land' attribute")
    print(f"  Land cells (is_land=1): {land_count}")
    print(f"  Ocean cells (is_land=0): {ocean_count}")
    total = land_count + ocean_count
    if total > 0:  # guard against division by zero on a degenerate file
        print(f"  Land percentage: {100*land_count/total:.2f}%")

    return True
| 285 | + |
| 286 | + |
def main():
    """Driver: find chunk files, merge them, and verify the result.

    Exits with status 1 when no input files are found or verification fails.
    """
    # Configuration
    input_dir = Path("datasets")
    output_dir = Path("datasets")
    output_filename = "icon_etopo_global_merged.nc"

    # Find all chunked input files produced by the ETOPO processing.
    input_files = sorted(input_dir.glob("icon_etopo_global_cells_*.nc"))

    if not input_files:
        print(f"ERROR: No NetCDF files found in {input_dir}")
        sys.exit(1)

    print(f"Found {len(input_files)} NetCDF files to merge")

    # Determine expected cell range from the filenames.
    expected_min, expected_max = get_expected_cell_range(input_files)
    print(f"Expected cell range: {expected_min} to {expected_max} ({expected_max - expected_min + 1} cells)")

    # Collect all cell data into memory.
    cell_data = collect_all_cells(input_files)
    print(f"Collected data for {len(cell_data)} cells")

    # Create merged file.
    output_path = output_dir / output_filename
    create_merged_netcdf(cell_data, output_path, expected_min, expected_max)

    # Verify merged file before declaring success.
    if verify_merged_file(output_path, expected_min, expected_max):
        print(f"\n✓ Successfully created merged file: {output_path}")
        print(f"  Size: {output_path.stat().st_size / (1024**2):.1f} MB")
    else:
        print(f"\n✗ Verification failed for: {output_path}")
        sys.exit(1)


if __name__ == '__main__':
    main()
0 commit comments