4 changes: 4 additions & 0 deletions src/climatebenchpress/data_loader/datasets/cams.py
@@ -34,6 +34,10 @@ def download(download_path: Path, progress: bool = True):
     @staticmethod
     def open(download_path: Path) -> xr.Dataset:
         ds = xr.open_dataset(download_path / Path(NO2_FILE).name)
+
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = ds.sel(valid_time=slice("2023-06-15", "2023-06-15")).chunk(-1)
         # Needed to make the dataset CF-compliant.
         ds.longitude.attrs["axis"] = "X"
         ds.latitude.attrs["axis"] = "Y"
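For reference, a minimal self-contained sketch of what the new selection in `open()` does (toy data; the variable name `no2` and the grid sizes are made up for the example): label-based slicing keeps every timestamp on the chosen calendar day, and `.chunk(-1)` collapses the result into a single dask chunk.

```python
import numpy as np
import pandas as pd
import xarray as xr

ds = xr.Dataset(
    {"no2": (("valid_time", "latitude", "longitude"), np.zeros((48, 4, 8)))},
    coords={
        "valid_time": pd.date_range("2023-06-14", periods=48, freq="h"),
        "latitude": np.linspace(60.0, 30.0, 4),
        "longitude": np.linspace(-10.0, 25.0, 8),
    },
)

# slice("2023-06-15", "2023-06-15") keeps all 24 hourly steps of that day;
# chunk(-1) turns each dimension into a single chunk.
day = ds.sel(valid_time=slice("2023-06-15", "2023-06-15")).chunk(-1)
assert day.sizes["valid_time"] == 24
assert day.no2.data.numblocks == (1, 1, 1)

# The axis attributes that make the coordinates CF-identifiable.
day.longitude.attrs["axis"] = "X"
day.latitude.attrs["axis"] = "Y"
```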
5 changes: 5 additions & 0 deletions src/climatebenchpress/data_loader/datasets/cmip6/abc.py
@@ -48,8 +48,13 @@ def download_with(
         zstore = zstore.replace("gs://", "https://storage.googleapis.com/")

         ds = xr.open_zarr(fsspec.get_mapper(zstore), consolidated=True)
+        # Only select the year 2020 for the dataset. The exact choice of this
+        # year is arbitrary.
+        # .chunk(-1) ensures that we only use a single chunk for the entire dataset.
+        ds = ds.sel(time=slice("2020", "2020")).chunk(-1)
         if variable_selector is not None:
             ds = ds[variable_selector]

         with monitor.progress_bar(progress):
             ds.to_zarr(downloadfile, mode="w", encoding=dict(), compute=False).compute()
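The year-based slice works because xarray forwards string slices to pandas' partial string indexing, so `slice("2020", "2020")` expands to the whole calendar year. A hedged sketch of both patterns, using synthetic times and a hypothetical output path:

```python
import numpy as np
import pandas as pd
import xarray as xr

ds = xr.Dataset(
    {"tas": ("time", np.arange(4.0))},
    coords={
        "time": pd.to_datetime(
            ["2019-12-31", "2020-01-01", "2020-12-31", "2021-01-01"]
        )
    },
)

# The bare year strings select everything from 2020-01-01 to 2020-12-31.
year = ds.sel(time=slice("2020", "2020")).chunk(-1)
assert list(year.time.dt.year.values) == [2020, 2020]

# Same deferred-write pattern as download_with(): build the task graph
# first, then execute it once (here is where the progress monitor wraps it).
delayed = year.to_zarr("tas_2020.zarr", mode="w", encoding=dict(), compute=False)
delayed.compute()
```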
6 changes: 4 additions & 2 deletions src/climatebenchpress/data_loader/datasets/era5.py
@@ -26,13 +26,15 @@ def download(download_path: Path, progress: bool = True):

         era5 = xr.open_zarr(ERA5_GCP_PATH, chunks={"time": 48}, consolidated=True)

-        ds = era5.sel(time=slice("2020-03-01", "2020-03-07"))[
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = era5.sel(time=slice("2020-03-01", "2020-03-01"))[
             [
                 "mean_sea_level_pressure",
                 "10m_u_component_of_wind",
                 "10m_v_component_of_wind",
             ]
-        ]
+        ].chunk(-1)
         # Needed to make the dataset CF-compliant.
         ds.time.attrs["standard_name"] = "time"
         ds.longitude.attrs["axis"] = "X"
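One detail worth noting in the ERA5 change: the store is opened with `chunks={"time": 48}`, so the trailing `.chunk(-1)` is what consolidates the one-day selection into a single block. A toy sketch (fake variable shape, real API calls):

```python
import numpy as np
import pandas as pd
import xarray as xr

era5_like = xr.Dataset(
    {"mean_sea_level_pressure": (("time", "latitude"), np.zeros((168, 3)))},
    coords={
        "time": pd.date_range("2020-03-01", periods=168, freq="h"),
        "latitude": [10.0, 0.0, -10.0],
    },
).chunk({"time": 48})  # mimic the chunked open_zarr() result

# A single calendar day of hourly data, consolidated into one chunk.
day = era5_like.sel(time=slice("2020-03-01", "2020-03-01")).chunk(-1)
assert day.sizes["time"] == 24
assert day.mean_sea_level_pressure.data.numblocks == (1, 1)
```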
49 changes: 32 additions & 17 deletions src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py
@@ -13,29 +13,28 @@
 from .abc import Dataset

 NUM_RETRIES = 3
-# Bounding box for an area in mainland France
-FRANCE_BBOX = {"T": slice(0, 1), "X": slice(202531, 207531), "Y": slice(35469, 40469)}

+# Define rough bounding box coordinates for mainland France.
+# Format: [min_longitude, min_latitude, max_longitude, max_latitude].
+FRANCE_BBOX = [-5.5, 42.3, 9.6, 51.1]
+
+# Biomass estimate for the year 2020.
+BIOMASS_URL = "https://dap.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v5.01/netcdf/ESACCI-BIOMASS-L4-AGB-MERGED-100m-2020-fv5.01.nc"


 class EsaBiomassCciDataset(Dataset):
     name = "esa-biomass-cci"

     @staticmethod
     def download(download_path: Path, progress: bool = True):
-        urls = [
-            f"https://dap.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v5.01/netcdf/ESACCI-BIOMASS-L4-AGB-MERGED-100m-{year}-fv5.01.nc"
-            # Restrict to 2 years for now for smaller download.
-            for year in [2010, 2015]
-        ]
-        for url in urls:
-            output_path = download_path / Path(url).name
-            for _ in range(NUM_RETRIES):
-                success = _download_netcdf(url, output_path, progress)
-                if success:
-                    break
-            if not success:
-                logging.info(f"Failed to download {url}")
-                return
+        output_path = download_path / Path(BIOMASS_URL).name
+        for _ in range(NUM_RETRIES):
+            success = _download_netcdf(BIOMASS_URL, output_path, progress)
+            if success:
+                break
+        if not success:
+            logging.info(f"Failed to download {BIOMASS_URL}")
+            return

     @staticmethod
     def open(download_path: Path) -> xr.Dataset:
@@ -44,12 +43,28 @@ def open(download_path: Path) -> xr.Dataset:
         # Needed to make the dataset CF-compliant.
         ds.lon.attrs["axis"] = "X"
         ds.lat.attrs["axis"] = "Y"
+        # We are constraining the dataset to mainland France to reduce its overall size.
+        # The global snapshot would be around 20 GB, which is too large for our use case.
+        # We chose France because it should have a fairly diverse set of biomass
+        # estimates, but the choice is overall somewhat arbitrary.
+        ds = ds.sel(
+            lon=slice(FRANCE_BBOX[0], FRANCE_BBOX[2]),
+            lat=slice(FRANCE_BBOX[3], FRANCE_BBOX[1]),
+        ).chunk(-1)
         return ds[["agb"]]


 if __name__ == "__main__":
     ds = open_downloaded_canonicalized_dataset(EsaBiomassCciDataset)
-    open_downloaded_tiny_canonicalized_dataset(EsaBiomassCciDataset, slices=FRANCE_BBOX)
+    num_lon, num_lat = ds.lon.size, ds.lat.size
+    open_downloaded_tiny_canonicalized_dataset(
+        EsaBiomassCciDataset,
+        # Use a smaller spatial subset for the tiny dataset.
+        slices={
+            "X": slice(num_lon // 2, (num_lon // 2) + 500),
+            "Y": slice(num_lat // 2, (num_lat // 2) + 500),
+        },
+    )

     for v, da in ds.items():
         print(f"- {v}: {da.dims}")
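The reversed latitude bounds in `open()` (`slice(FRANCE_BBOX[3], FRANCE_BBOX[1])`) are deliberate: label-based slicing follows the stored coordinate order, and the Biomass CCI grid stores latitude descending, north to south. A small sketch on a 1-degree toy grid:

```python
import numpy as np
import xarray as xr

lat = np.linspace(90.0, -90.0, 181)  # descending, like the CCI grid
da = xr.DataArray(np.arange(181.0), coords={"lat": lat}, dims="lat")

# max_latitude first, min_latitude second, matching the descending order.
sub = da.sel(lat=slice(51.1, 42.3))
assert sub.lat.values[0] == 51.0 and sub.lat.values[-1] == 43.0

# With the bounds the other way around, the selection would be empty.
assert da.sel(lat=slice(42.3, 51.1)).sizes["lat"] == 0
```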
15 changes: 9 additions & 6 deletions src/climatebenchpress/data_loader/datasets/nextgems.py
@@ -44,16 +44,19 @@ def download(download_path: Path, progress: bool = True):
             zoom=ZOOM, time=TIME_RESOLUTION, chunks=dict()
         ).to_dask()

-        ds = icon[[PRECIP_KEY, OLR_KEY]].sel(time=slice("2020-03-01", "2020-03-07"))
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = icon[[PRECIP_KEY, OLR_KEY]].sel(time=slice("2020-03-01", "2020-03-01"))
         # Regrid the data to 0.125 degree resolution.
-        # NOTE: This is using nearest neighbour interpolation. We need to do some
-        #       quality checks to ensure we don't get any significant aliasing
-        #       artifacts as the result of interpolation. For more details:
-        #       https://easy.gems.dkrz.de/Processing/healpix/lonlat_remap.html.
+        # NOTE:
+        # This is using nearest neighbour interpolation. Different interpolation
+        # methods should not have a drastic effect on the intercomparison of
+        # different compressors. However, this should be studied in more detail
+        # because re-gridding can often have unforeseen consequences.
         idx = _get_nn_lon_lat_index(
             2**ZOOM, np.linspace(-180, 180, NUM_LON), np.linspace(-90, 90, NUM_LAT)
         )
-        ds = ds.isel(cell=idx).chunk({"time": 1, "lat": NUM_LAT, "lon": NUM_LON})
+        ds = ds.isel(cell=idx).chunk(-1)
         ds.lon.attrs["axis"] = "X"
         ds.lat.attrs["axis"] = "Y"

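For readers unfamiliar with the regridding step: `_get_nn_lon_lat_index` (not shown in this diff) maps each point of the target lon/lat grid to its nearest HEALPix cell, which is what makes `ds.isel(cell=idx)` a nearest-neighbour remap. A plausible shape for such a helper, assuming `healpy` and nested pixel ordering; the repo's actual implementation may differ:

```python
import healpy as hp
import numpy as np
import xarray as xr

def nn_lon_lat_index(nside: int, lon: np.ndarray, lat: np.ndarray) -> xr.DataArray:
    """Nearest HEALPix cell index for every point of a regular lon/lat grid."""
    lon2, lat2 = np.meshgrid(lon, lat)
    return xr.DataArray(
        # lonlat=True lets ang2pix take degrees directly; nest=True matches
        # the nested ordering commonly used for HEALPix-gridded ICON output.
        hp.ang2pix(nside, lon2, lat2, nest=True, lonlat=True),
        coords={"lat": lat, "lon": lon},
        dims=("lat", "lon"),
    )
```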