Skip to content

Commit 84ef0a3

Browse files
committed
(#13) Added outputs merge and verification
The idea of the land-ocean ratio verification is to make sure that the NetCDF outputs have roughly as many land and ocean cells as we expect. Verification script with etopo_cg=40: Land cells: 8509 Ocean cells: 11971 Merging and verification scripts with outputs from etopo_cg=4: Land cells (is_land=1): 8622 Ocean cells (is_land=0): 11858 Both scripts are consistent. Furthermore, the plots are sensible.
1 parent e6a6a26 commit 84ef0a3

File tree

2 files changed

+856
-0
lines changed

2 files changed

+856
-0
lines changed
Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Merge ETOPO NetCDF Output Files
4+
5+
This script merges all chunked NetCDF outputs from the ETOPO processing into a single file,
6+
ensuring that:
7+
1. All cell IDs (groups) are represented in the merged file
8+
2. Each cell has an 'is_land' attribute
9+
3. Missing cells are filled with ocean placeholders (is_land=0)
10+
"""
11+
12+
import netCDF4
13+
import numpy as np
14+
from pathlib import Path
15+
from tqdm import tqdm
16+
import sys
17+
18+
def get_expected_cell_range(files):
    """
    Determine the expected cell range from filenames.

    Filenames are expected to end in a zero-padded range suffix after the
    last underscore, e.g. ``icon_etopo_global_cells_00000-00099.nc``.

    Parameters
    ----------
    files : list of Path
        List of NetCDF files

    Returns
    -------
    tuple
        (min_cell, max_cell) expected in the dataset

    Raises
    ------
    ValueError
        If *files* is empty, or a filename lacks a parseable
        ``start-end`` range suffix.
    """
    if not files:
        # Without this guard, int(float('inf')) below raised an opaque
        # OverflowError for an empty input list.
        raise ValueError("No input files given; cannot determine cell range")

    starts = []
    ends = []
    for f in files:
        range_part = f.stem.split('_')[-1]  # e.g., '00000-00099'
        try:
            start, end = map(int, range_part.split('-'))
        except ValueError as err:
            raise ValueError(
                f"Cannot parse cell range from filename: {f.name}"
            ) from err
        starts.append(start)
        ends.append(end)

    return min(starts), max(ends)
43+
44+
45+
def collect_all_cells(files):
    """
    Collect all cell data from chunked NetCDF files.

    Parameters
    ----------
    files : list of Path
        List of NetCDF files to merge

    Returns
    -------
    dict
        Dictionary mapping cell_id (int) to cell data dict containing:
        - is_land: int (0 or 1)
        - clat: float (radians)
        - clon: float (radians)
        - analysis: dict of arrays (only for land cells)

    Notes
    -----
    Files that fail to open or parse are reported and skipped, so one
    corrupt chunk does not abort the whole merge.
    """
    cell_data = {}

    print("Reading cell data from NetCDF files...")
    for nc_file in tqdm(files, desc="Processing files"):
        try:
            nc = netCDF4.Dataset(nc_file, 'r')
            try:
                # Iterate over all groups (cell IDs) in this file
                for group_name in nc.groups.keys():
                    cell_id = int(group_name)
                    group = nc.groups[group_name]

                    # Extract cell data
                    is_land = int(group.variables['is_land'][:])
                    clat = float(group.variables['clat'][:])
                    clon = float(group.variables['clon'][:])

                    cell_info = {
                        'is_land': is_land,
                        'clat': clat,
                        'clon': clon,
                    }

                    # For land cells, also extract analysis data
                    if is_land == 1:
                        cell_info['analysis'] = {}
                        for var_name in group.variables.keys():
                            if var_name not in ['is_land', 'clat', 'clon']:
                                cell_info['analysis'][var_name] = group.variables[var_name][:]

                    cell_data[cell_id] = cell_info
            finally:
                # Close even when a group fails to parse — the original
                # code leaked the file handle on any mid-file error.
                nc.close()

        except Exception as e:
            print(f"Error reading {nc_file.name}: {e}")
            continue

    return cell_data
102+
103+
104+
def create_merged_netcdf(cell_data, output_path, expected_min, expected_max):
    """
    Create merged NetCDF file with all cells.

    Every cell ID in [expected_min, expected_max] gets a group; IDs absent
    from *cell_data* are written as ocean placeholders (is_land=0, with
    clat/clon set to 0.0).

    Parameters
    ----------
    cell_data : dict
        Dictionary of cell data from collect_all_cells()
    output_path : Path
        Output file path
    expected_min : int
        Expected minimum cell ID
    expected_max : int
        Expected maximum cell ID
    """
    print(f"\nCreating merged NetCDF file: {output_path}")

    # Create new NetCDF file
    nc_out = netCDF4.Dataset(output_path, 'w', format='NETCDF4')

    # Statistics counters
    land_cells = 0
    ocean_cells = 0
    missing_cells = 0

    try:
        # Set global attributes
        nc_out.title = "ICON ETOPO Global Topography - Merged Output"
        nc_out.description = "Merged spectral analysis of ETOPO topography on ICON grid"
        nc_out.source = "pycsa spectral approximation framework"

        print(f"Writing cells {expected_min} to {expected_max}...")

        # Iterate through all expected cells
        for cell_id in tqdm(range(expected_min, expected_max + 1), desc="Writing cells"):
            # Create group for this cell
            grp = nc_out.createGroup(str(cell_id))

            if cell_id in cell_data:
                # Cell exists in data
                cell = cell_data[cell_id]
                is_land = cell['is_land']
                clat = cell['clat']
                clon = cell['clon']

                if is_land:
                    land_cells += 1
                else:
                    ocean_cells += 1

            else:
                # Missing cell - create ocean placeholder
                print(f"Warning: Cell {cell_id} missing, creating ocean placeholder")
                is_land = 0
                clat = 0.0  # Placeholder
                clon = 0.0  # Placeholder
                missing_cells += 1
                ocean_cells += 1

            # Write basic cell attributes (always present)
            var_is_land = grp.createVariable('is_land', 'i4')
            var_is_land[:] = is_land

            var_clat = grp.createVariable('clat', 'f8')
            var_clat[:] = clat
            var_clat.units = "radians"
            var_clat.long_name = "cell center latitude"

            var_clon = grp.createVariable('clon', 'f8')
            var_clon[:] = clon
            var_clon.units = "radians"
            var_clon.long_name = "cell center longitude"

            # Write analysis data for land cells
            if is_land and cell_id in cell_data:
                analysis = cell_data[cell_id]['analysis']
                for var_name, var_data in analysis.items():
                    if var_data.ndim == 0:
                        # Scalar variable (0-dimensional)
                        var = grp.createVariable(var_name, var_data.dtype)
                    else:
                        # General N-dimensional case (the original only
                        # handled ndim 1 and 2 and skipped anything else).
                        # Dimension naming matches the original scheme:
                        # ndim==1 -> 'dim_<name>'; ndim>=2 -> 'dim0_<name>', ...
                        dim_names = []
                        for axis, size in enumerate(var_data.shape):
                            if var_data.ndim == 1:
                                dim_name = f"dim_{var_name}"
                            else:
                                dim_name = f"dim{axis}_{var_name}"
                            grp.createDimension(dim_name, size)
                            dim_names.append(dim_name)
                        var = grp.createVariable(var_name, var_data.dtype, tuple(dim_names))
                    var[:] = var_data
    finally:
        # Close on error too — the original leaked the handle (and could
        # leave a corrupt, unclosed file) if any write raised.
        nc_out.close()

    # Print statistics
    total_written = land_cells + ocean_cells
    print("\n" + "="*80)
    print("MERGE COMPLETE")
    print("="*80)
    print(f"Output file: {output_path}")
    print(f"Total cells: {expected_max - expected_min + 1}")
    print(f" Land cells (is_land=1): {land_cells}")
    print(f" Ocean cells (is_land=0): {ocean_cells}")
    if missing_cells > 0:
        print(f" Missing cells (filled with ocean): {missing_cells}")
    if ocean_cells > 0:
        print(f"\nLand/Ocean ratio: {land_cells}/{ocean_cells} = {land_cells/ocean_cells:.3f}")
    if total_written > 0:
        # Guard: the original divided by (land+ocean) unconditionally and
        # crashed with ZeroDivisionError for an empty cell range.
        print(f"Land percentage: {100*land_cells/total_written:.2f}%")
    print("="*80)
216+
217+
218+
def verify_merged_file(output_path, expected_min, expected_max):
    """
    Verify the merged NetCDF file has all cells with is_land attribute.

    Checks that every cell ID in [expected_min, expected_max] is present
    as a group, warns about extra groups, and confirms each group carries
    an 'is_land' variable, tallying land vs ocean counts.

    Parameters
    ----------
    output_path : Path
        Path to merged NetCDF file
    expected_min : int
        Expected minimum cell ID
    expected_max : int
        Expected maximum cell ID

    Returns
    -------
    bool
        True if verification passes
    """
    print(f"\nVerifying merged file: {output_path}")

    nc = netCDF4.Dataset(output_path, 'r')
    try:
        expected_cells = set(range(expected_min, expected_max + 1))
        found_cells = set(int(g) for g in nc.groups.keys())

        # Check all cells present
        missing = expected_cells - found_cells
        if missing:
            print(f"ERROR: Missing cells: {sorted(missing)[:10]}... ({len(missing)} total)")
            return False

        # Check extra cells
        extra = found_cells - expected_cells
        if extra:
            print(f"Warning: Extra cells: {sorted(extra)[:10]}... ({len(extra)} total)")

        # Check is_land attribute and count land vs ocean
        cells_without_is_land = []
        land_count = 0
        ocean_count = 0
        for group_name in nc.groups.keys():
            group = nc.groups[group_name]
            if 'is_land' not in group.variables:
                cells_without_is_land.append(group_name)
            else:
                is_land_val = int(group.variables['is_land'][:])
                if is_land_val == 1:
                    land_count += 1
                else:
                    ocean_count += 1

        if cells_without_is_land:
            print(f"ERROR: Cells without is_land attribute: {cells_without_is_land[:10]}... ({len(cells_without_is_land)} total)")
            return False
    finally:
        # Single close point — the original closed on each branch and
        # leaked the handle if an exception escaped mid-check.
        nc.close()

    print("✓ Verification PASSED")
    print(f" All {len(expected_cells)} cells present")
    print(" All cells have 'is_land' attribute")
    print(f" Land cells (is_land=1): {land_count}")
    print(f" Ocean cells (is_land=0): {ocean_count}")
    counted = land_count + ocean_count
    if counted > 0:
        # Guard: the original raised ZeroDivisionError for a file with no groups.
        print(f" Land percentage: {100*land_count/counted:.2f}%")

    return True
285+
286+
287+
if __name__ == '__main__':
    # Configuration
    src_dir = Path("datasets")
    dst_dir = Path("datasets")
    merged_name = "icon_etopo_global_merged.nc"

    # Locate the chunked NetCDF outputs to merge
    chunk_files = sorted(src_dir.glob("icon_etopo_global_cells_*.nc"))

    if not chunk_files:
        print(f"ERROR: No NetCDF files found in {src_dir}")
        sys.exit(1)

    print(f"Found {len(chunk_files)} NetCDF files to merge")

    # Expected cell range, derived from the filename suffixes
    lo_cell, hi_cell = get_expected_cell_range(chunk_files)
    print(f"Expected cell range: {lo_cell} to {hi_cell} ({hi_cell - lo_cell + 1} cells)")

    # Pull every cell out of the chunk files
    cells = collect_all_cells(chunk_files)
    print(f"Collected data for {len(cells)} cells")

    # Write the single merged file
    merged_path = dst_dir / merged_name
    create_merged_netcdf(cells, merged_path, lo_cell, hi_cell)

    # Verify the result; exit non-zero on failure so pipelines notice
    if not verify_merged_file(merged_path, lo_cell, hi_cell):
        print(f"\n✗ Verification failed for: {merged_path}")
        sys.exit(1)

    print(f"\n✓ Successfully created merged file: {merged_path}")
    print(f" Size: {merged_path.stat().st_size / (1024**2):.1f} MB")

0 commit comments

Comments
 (0)