Skip to content

Commit

Permalink
Merge pull request #93 from ACCESS-NRI/91/update-esm-netcdf-names
Browse files Browse the repository at this point in the history
Update ESM1.5 driver file naming.
  • Loading branch information
blimlim authored Sep 23, 2024
2 parents 9acda2f + b014029 commit 0ee303a
Show file tree
Hide file tree
Showing 2 changed files with 175 additions and 57 deletions.
107 changes: 80 additions & 27 deletions test/test_conversion_driver_esm1p5.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,75 @@ def test_get_esm1p5_fields_file_pattern_wrong_id_length(run_id):
esm1p5_convert.get_esm1p5_fields_file_pattern(run_id)


def test_get_nc_write_path():
fields_file_path = Path("/test/path/fields_123.file")
nc_write_dir = Path("/test/path/NetCDF")
@pytest.mark.parametrize("ff_path,ff_date,nc_write_dir,expected",
[
(
Path("/test/aiihca.paa1feb"),
(101, 2, 1),
Path("/test/netCDF"),
Path("/test/netCDF/aiihca.pa-010102_mon.nc")
),
(
Path("aiihca.pe50dec"),
(1850, 12, 21),
Path("netCDF"),
Path("netCDF/aiihca.pe-185012_dai.nc")
),
(
Path("abc/aiihca.pi87jun"),
(1887, 6, 12),
Path("./netCDF"),
Path("./netCDF/aiihca.pi-188706_3hr.nc")
),
(
Path("abc/aiihca.pjc0jan"),
(120, 1, 7),
Path("./netCDF"),
Path("./netCDF/aiihca.pj-012001_6hr.nc")
),
(
Path("abc/aiihca.pjc0jan"),
None,
Path("./netCDF"),
Path("./netCDF/aiihca.pjc0jan.nc")
),
])
def test_get_nc_write_path_recognized_unit(ff_path, ff_date,
nc_write_dir, expected):
"""
Check that netCDF file naming produces expected file paths for various
expected unit keys.
"""
nc_write_path = esm1p5_convert.get_nc_write_path(
ff_path,
nc_write_dir,
ff_date
)

assert nc_write_path == expected


def test_get_nc_write_path_unrecognized_unit():
"""
Check that netCDF file naming falls back to simpler naming scheme
when unit key in fields file name not recognized.
"""
unknown_key = "w"
assert unknown_key not in esm1p5_convert.FF_UNIT_SUFFIX.keys()

ff_name = f"aiihca.p{unknown_key}abcd"
ff_year = 50
ff_month = 7
nc_write_dir = Path("netCDF")
expected_nc_write_path = nc_write_dir / f"aiihca.p{unknown_key}-005007.nc"

nc_write_path = esm1p5_convert.get_nc_write_path(
fields_file_path, nc_write_dir)
Path(ff_name),
nc_write_dir,
(ff_year, ff_month, 1)
)

assert nc_write_path == Path("/test/path/NetCDF/fields_123.file.nc")
assert nc_write_path == expected_nc_write_path


def test_find_matching_fields_files():
Expand Down Expand Up @@ -109,43 +170,35 @@ def _mock_process_with_exception(error_type, error_message):


@pytest.mark.parametrize(
"input_list", [[], ["fake_file"], [
"fake_file_1", "fake_file_2", "fake_file_3"]]
"input_output_list", [[],
[("fake_file", "fake_file.nc")],
[("fake_file_1", "fake_file_1.nc"),
("fake_file_2", "fake_file_2.nc"),
("fake_file_3", "fake_file_3.nc")]]
)
def test_convert_fields_file_list_success(mock_process,
input_list):
input_output_list):
"""
Test that process is called for each input.
"""
input_list_paths = [Path(p) for p in input_list]
input_output_paths = [(Path(p1), Path(p2)) for p1, p2 in input_output_list]

succeeded, _ = esm1p5_convert.convert_fields_file_list(
input_list_paths, "fake_nc_write_dir")
succeeded, _ = esm1p5_convert.convert_fields_file_list(input_output_paths)

assert mock_process.call_count == len(input_list)
assert mock_process.call_count == len(input_output_list)

successful_input_paths = [successful_path_pair[0] for
successful_path_pair in succeeded]

assert input_list_paths == successful_input_paths
assert succeeded == input_output_paths


def test_convert_fields_file_list_fail_excepted(mock_process_with_exception):
# Test that UnsupportedTimeSeriesErrors are excepted.

# TODO: This test will not catch changes to the exceptions in um2nc. E.g.
# if decided to instead raise a RuntimeError in um2nc when timeseries
# are encountered, the conversion of ESM1.5 outputs would undesirably
# crash, but this test would say everything is fine, since we're
# prescribing the error that is raised.
mock_process_with_exception(um2nc.UnsupportedTimeSeriesError,
"timeseries error")
fake_file_path = Path("fake_file")
fake_input_output_paths = [(Path("fake_file"), Path("fake_file.nc"))]

_, failed = esm1p5_convert.convert_fields_file_list(
[fake_file_path], "fake_nc_write_dir")
fake_input_output_paths)

assert failed[0][0] == fake_file_path
assert failed[0][0] == fake_input_output_paths[0][0]


def test_convert_fields_file_list_fail_critical(mock_process_with_exception):
Expand All @@ -155,7 +208,7 @@ def test_convert_fields_file_list_fail_critical(mock_process_with_exception):
mock_process_with_exception(Exception, generic_error_message)
with pytest.raises(Exception) as exc_info:
esm1p5_convert.convert_fields_file_list(
["fake_file"], "fake_nc_write_dir")
[("fake_file", "fake_file.nc")])

assert str(exc_info.value) == generic_error_message

Expand Down
125 changes: 95 additions & 30 deletions umpost/conversion_driver_esm1p5.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
ESM1.5 conversion driver
Wrapper script for automated fields file to NetCDF conversion
Wrapper script for automated fields file to netCDF conversion
during ESM1.5 simulations. Runs conversion module
on each atmospheric output in a specified directory.
Expand All @@ -21,6 +21,7 @@
import errno
from pathlib import Path
from umpost import um2netcdf
import mule


# TODO: um2netcdf will update the way arguments are fed to `process`.
Expand All @@ -36,6 +37,18 @@
ARG_VALS = ARG_NAMES(3, 4, True, False, 0.5, False, None, None, False, False)


# Character in filenames specifying the unit key
FF_UNIT_INDEX = 8
# Output file suffix for each type of unit. Assume's
# ESM1.5's unit definitions are being used.
FF_UNIT_SUFFIX = {
"a": "mon",
"e": "dai",
"i": "3hr",
"j": "6hr",
}


def get_esm1p5_fields_file_pattern(run_id: str):
"""
Generate regex pattern for finding current experiment's UM outputs.
Expand All @@ -46,7 +59,7 @@ def get_esm1p5_fields_file_pattern(run_id: str):
Returns
-------
fields_file_name_pattern: Regex pattern for matching fields file names
fields_file_name_pattern: Regex pattern for matching fields file names.
"""

# For ESM1.5 simulations, files start with run_id + 'a' (atmosphere) +
Expand All @@ -63,14 +76,15 @@ def get_esm1p5_fields_file_pattern(run_id: str):
return fields_file_name_pattern


def get_nc_write_path(fields_file_path, nc_write_dir):
def get_nc_write_path(fields_file_path, nc_write_dir, date=None):
"""
Get filepath for writing NetCDF to based on fields file name.
Get filepath for writing netCDF to based on fields file name and date.
Parameters
----------
fields_file_path : path to single UM fields file to be converted.
nc_write_dir : path to target directory for writing NetCDF files.
nc_write_dir : path to target directory for writing netCDF files.
date : tuple of form (year, month, day) associated with fields file data.
Returns
-------
Expand All @@ -79,11 +93,9 @@ def get_nc_write_path(fields_file_path, nc_write_dir):
fields_file_path = Path(fields_file_path)
nc_write_dir = Path(nc_write_dir)

fields_file_name = fields_file_path.name
nc_file_name = fields_file_name + ".nc"
nc_file_write_path = nc_write_dir / nc_file_name
nc_file_name = get_nc_filename(fields_file_path.name, date)

return nc_file_write_path
return nc_write_dir / nc_file_name


def find_matching_fields_files(dir_contents, fields_file_name_pattern):
Expand All @@ -110,14 +122,72 @@ def find_matching_fields_files(dir_contents, fields_file_name_pattern):
return fields_file_paths


def convert_fields_file_list(fields_file_paths, nc_write_dir):
def get_nc_filename(fields_file_name, date=None):
"""
Format a netCDF output filename based on the input fields file name and
its date. Assumes fields_file_name follows ESM1.5's naming convention
'{5 char run_id}.pa{unit}{date encoding}`.
Parameters
----------
fields_file_name: name of fields file to be converted.
date: tuple of form (year, month, day) associated with fields file data,
or None. If None, ".nc" will be concatenated to the original fields
file name.
Returns
-------
name: formated netCDF filename for writing output.
"""
if date is None:
return f"{fields_file_name}.nc"

# TODO: Use regex to extract stem and unit from filename to improve
# clarity, and for better handling of unexpected filenames.
stem = fields_file_name[0:FF_UNIT_INDEX + 1]
unit = fields_file_name[FF_UNIT_INDEX]

try:
suffix = f"_{FF_UNIT_SUFFIX[unit]}"
except KeyError:
warnings.warn(
f"Unit code '{unit}' from filename f{fields_file_name} "
"not recognized. Frequency information will not be added "
"to the netCDF filename."
)
suffix = ""

year, month, _ = date
return f"{stem}-{year:04d}{month:02d}{suffix}.nc"


def get_ff_date(fields_file_path):
"""
Convert group of fields files to NetCDF, writing output in nc_write_dir.
Get the date from a fields file. To be used for naming output files.
Parameters
----------
fields_file_paths : list of paths to fields files for conversion.
nc_write_dir : directory to save NetCDF files into.
fields_file_path : path to single fields file.
Returns
-------
date_tuple : tuple of integers (yyyy,mm,dd) containing the fields
file's date.
"""
header = mule.FixedLengthHeader.from_file(
str(fields_file_path))

return header.t2_year, header.t2_month, header.t2_day # pylint: disable=no-member


def convert_fields_file_list(input_output_paths):
"""
Convert group of fields files to netCDF, writing output in nc_write_dir.
Parameters
----------
input_output_paths : list of tuples of form (input_path, output_path). Fields file
at input_path will be written to netCDF at ouput_path.
Returns
-------
Expand All @@ -129,18 +199,13 @@ def convert_fields_file_list(fields_file_paths, nc_write_dir):
succeeded = []
failed = []

fields_file_paths = [Path(p) for p in fields_file_paths]

for fields_file_path in fields_file_paths:

nc_write_path = get_nc_write_path(fields_file_path, nc_write_dir)

for ff_path, nc_path in input_output_paths:
try:
um2netcdf.process(fields_file_path, nc_write_path, ARG_VALS)
succeeded.append((fields_file_path, nc_write_path))
um2netcdf.process(ff_path, nc_path, ARG_VALS)
succeeded.append((ff_path, nc_path))

except um2netcdf.UnsupportedTimeSeriesError as exc:
failed.append((fields_file_path, exc))
failed.append((ff_path, exc))

# Any unexpected errors will be raised

Expand Down Expand Up @@ -211,7 +276,7 @@ def convert_esm1p5_output_dir(esm1p5_output_dir):
----------
esm1p5_output_dir: an "outputXYZ" directory produced by an ESM1.5 simulation.
Fields files in the "atmosphere" subdirectory will be
converted to NetCDF.
converted to netCDF.
Returns
-------
Expand All @@ -230,8 +295,8 @@ def convert_esm1p5_output_dir(esm1p5_output_dir):
errno.ENOENT, os.strerror(errno.ENOENT), current_atm_output_dir
)

# Create a directory for writing NetCDF files
nc_write_dir = current_atm_output_dir / "NetCDF"
# Create a directory for writing netCDF files
nc_write_dir = current_atm_output_dir / "netCDF"
nc_write_dir.mkdir(exist_ok=True)

# Find fields file outputs to be converted
Expand All @@ -249,15 +314,15 @@ def convert_esm1p5_output_dir(esm1p5_output_dir):
warnings.warn(
f"No files matching pattern '{fields_file_name_pattern}' "
f"found in {current_atm_output_dir.resolve()}. No files will be "
"converted to NetCDF."
"converted to netCDF."
)

return [], [] # Don't try to run the conversion

succeeded, failed = convert_fields_file_list(
atm_dir_fields_files,
nc_write_dir
)
output_paths = [get_nc_write_path(path, nc_write_dir, get_ff_date(path)) for path in atm_dir_fields_files]
input_output_pairs = zip(atm_dir_fields_files, output_paths)

succeeded, failed = convert_fields_file_list(input_output_pairs)

return succeeded, failed

Expand Down

0 comments on commit 0ee303a

Please sign in to comment.