Skip to content

Commit

Permalink
add a further cleaning of dataset ids
Browse files Browse the repository at this point in the history
These cleaning steps collapse the various forms of dataset names into a simpler set.
  • Loading branch information
Jonathan Eisenhamer committed Feb 27, 2023
1 parent cd7843d commit a2934e8
Showing 1 changed file with 54 additions and 3 deletions.
57 changes: 54 additions & 3 deletions crds/bestrefs/stale.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,9 +477,9 @@ def archive_state_exposures(self, exposures, reset=True):
self.exposures = vstack([self.exposures, exposures])

# First result: List of uncalibrated exposures.
uncalibrated_datasets = {filename_to_datasetid(exposure)
uncalibrated_datasets = clean_datasets({filename_to_datasetid(exposure)
for exposure, context in exposures['filename', 'crds_ctx']
if isinstance(context, MaskedConstant)}
if isinstance(context, MaskedConstant)})
logger.info('\t# uncalibrated datasets: %d', len(uncalibrated_datasets))

# Start filtering down to find the stale exposures.
Expand All @@ -496,7 +496,7 @@ def archive_state_exposures(self, exposures, reset=True):
current_exposures = old_exposures[context_mask]

# Create list of formal dataset ids.
datasets = {filename_to_datasetid(filename) for filename in current_exposures['filename']}
datasets = clean_datasets({filename_to_datasetid(filename) for filename in current_exposures['filename']})
self.datasets.update(datasets)

# Check against affected datasets
Expand Down Expand Up @@ -547,6 +547,57 @@ def reset(self):
# #########
# Utilities
# #########
def clean_datasets(datasets):
    """Normalize dataset names and drop anything that is not a string

    Each string entry is passed through three rewrites, in this order:

    - A trailing '_trapsfilled' is stripped.
    - A '_sNNNNN' token (underscore, 's', five digits) is removed when
      at least one more character follows it.
    - A '-segNNN' token (one or more digits) is removed when it is
      directly followed by a '.'-prefixed suffix.

    Names matching none of the above pass through unchanged. Entries
    that are not `str` instances are left out of the result.

    Parameters
    ----------
    datasets : [str(,...)] or {str(,...)}
        Iterable of dataset names to clean.

    Returns
    -------
    cleaned : {str(,...)}
        Set of cleaned names. Names that become identical after
        cleaning collapse into a single entry.

    Examples
    --------
    >>> clean_datasets({'jw01210001003_03201_00003.nrca2'})
    {'jw01210001003_03201_00003.nrca2'}
    >>> clean_datasets({'jw02609011002_02101_00004.nrca2_trapsfilled'})
    {'jw02609011002_02101_00004.nrca2'}
    >>> clean_datasets({'jw01185017001_04103_00001-seg006.nrca3'})
    {'jw01185017001_04103_00001.nrca3'}
    >>> clean_datasets({'jw01345-o015_s00802_nircam'})
    {'jw01345-o015_nircam'}
    """
    # Order matters: the '_trapsfilled' suffix is stripped first, then
    # the source id, then the segment id. Every capture group holds text
    # to keep, so a cleaned name is the concatenation of a match's groups.
    rewrites = (
        re.compile(r'(.+)_trapsfilled$'),
        re.compile(r'(.+)_s\d{5}(.+)'),
        re.compile(r'(.+)-seg\d+(\..+)'),
    )

    cleaned = set()
    for name in datasets:
        if not isinstance(name, str):
            continue
        for pattern in rewrites:
            found = pattern.search(name)
            if found:
                name = ''.join(found.groups())
        cleaned.add(name)
    return cleaned


def env_override(envvar, override=None):
"""Use environment variable unless override is specified
Expand Down

0 comments on commit a2934e8

Please sign in to comment.