Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 43 additions & 13 deletions src/satchip/chip_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import glob
from collections import Counter
from datetime import datetime
from pathlib import Path
Expand Down Expand Up @@ -74,8 +75,7 @@
def create_chips(
label_paths: list[Path],
platform: str,
date_start: datetime,
date_end: datetime,
dates: utils.DateRange | list[datetime],
strategy: str,
max_cloud_pct: int,
chip_dir: Path,
Expand All @@ -84,7 +84,7 @@
platform_dir = chip_dir / platform
platform_dir.mkdir(parents=True, exist_ok=True)

opts: utils.ChipDataOpts = {'strategy': strategy, 'date_start': date_start, 'date_end': date_end}
opts: utils.ChipDataOpts = {'strategy': strategy, 'dates': dates}
if platform in ['S2L2A', 'HLS']:
opts['max_cloud_pct'] = max_cloud_pct

Expand All @@ -94,9 +94,11 @@
duplicates = [name for name, count in Counter(chip_names).items() if count > 1]
msg = f'Duplicate sample locations not supported. Duplicate chips: {", ".join(duplicates)}'
raise NotImplementedError(msg)

chip_paths = [
platform_dir / (x.with_suffix('').with_suffix('').name + f'_{platform}.zarr.zip') for x in label_paths
]

if platform == 'HYP3S1RTC':
rtc_paths_for_chips = get_rtc_paths_for_chips(chips, image_dir, opts)
opts['local_hyp3_paths'] = rtc_paths_for_chips
Expand All @@ -109,11 +111,26 @@

def main() -> None:
parser = argparse.ArgumentParser(description='Chip a label image')
parser.add_argument('labelpath', type=Path, help='Path to the label directory')

parser.add_argument('labels', type=str, help='Path or Glob pattern for label chips')

parser.add_argument(
'platform', choices=['S1RTC', 'S2L2A', 'HLS', 'HYP3S1RTC'], type=str, help='Dataset to create chips for'
)
parser.add_argument('daterange', type=str, help='Inclusive date range to search for data in the format Ymd-Ymd')

date_group = parser.add_mutually_exclusive_group(required=True)

date_group.add_argument(
'--daterange',
nargs=2,
type=str,
metavar=('START', 'END'),
help='Inclusive date range in the format YYYY-mm-dd YYYY-mm-dd',
)
date_group.add_argument(
'--dates', nargs='+', type=str, help='Space-separated list of specific dates in format YYYY-mm-dd'
)

parser.add_argument('--maxcloudpct', default=100, type=int, help='Maximum percent cloud cover for a data chip')
parser.add_argument('--chipdir', default='.', type=Path, help='Output directory for the chips')
parser.add_argument(
Expand All @@ -122,24 +139,37 @@
parser.add_argument(
'--strategy',
default='BEST',
choices=['BEST', 'ALL'],
choices=['BEST', 'ALL', 'SPECIFIC'],
type=str,
help='Strategy to use when multiple scenes are found (default: BEST)',
)
args = parser.parse_args()
args.platform = args.platform.upper()

assert 0 <= args.maxcloudpct <= 100, 'maxcloudpct must be between 0 and 100'
date_start, date_end = [datetime.strptime(d, '%Y%m%d') for d in args.daterange.split('-')]
assert date_start < date_end, 'start date must be before end date'
label_paths = list(args.labelpath.glob('*.zarr.zip'))
assert len(label_paths) > 0, f'No label files found in {args.labelpath}'

dates: utils.DateRange | list[datetime]
if args.daterange:
start_date = datetime.strptime(args.daterange[0], '%Y-%m-%d')
end_date = datetime.strptime(args.daterange[1], '%Y-%m-%d')
assert start_date < end_date, 'start date must be before end date'

dates = utils.DateRange(start_date, end_date)
else:
dates = [datetime.strptime(d, '%Y-%m-%d') for d in args.dates]
args.strategy = 'SPECIFIC'

if '*' in args.labels or '?' in args.labels or '[' in args.labels:
label_paths = [Path(p) for p in glob.glob(args.labels)]

Check failure on line 163 in src/satchip/chip_data.py

View workflow job for this annotation

GitHub Actions / call-ruff-workflow / check-with-ruff

Ruff (PTH207)

src/satchip/chip_data.py:163:41: PTH207 Replace `glob` with `Path.glob` or `Path.rglob`
else:
label_paths = list(Path(args.labels).glob('*.zarr.zip'))

assert len(label_paths) > 0, f'No label files found in {args.labels}'

if args.imagedir is None:
args.imagedir = args.chipdir / 'IMAGES'

create_chips(
label_paths, args.platform, date_start, date_end, args.strategy, args.maxcloudpct, args.chipdir, args.imagedir
)
create_chips(label_paths, args.platform, dates, args.strategy, args.maxcloudpct, args.chipdir, args.imagedir)


if __name__ == '__main__':
Expand Down
46 changes: 37 additions & 9 deletions src/satchip/chip_hls.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from satchip.terra_mind_grid import TerraMindChip


earthaccess.login()

HLS_L_BANDS = OrderedDict(
{
'B01': 'COASTAL',
Expand Down Expand Up @@ -78,19 +80,29 @@ def get_scenes(
Returns:
The best HLS items.
"""
assert strategy in ['BEST', 'ALL'], 'Strategy must be either BEST or ALL'
assert strategy in ['BEST', 'ALL', 'SPECIFIC'], 'Strategy must be either BEST or ALL'
overlapping_items = [x for x in items if get_pct_intersect(x['umm'], roi) > 95]
best_first = sorted(overlapping_items, key=lambda x: (-get_pct_intersect(x['umm'], roi), get_date(x['umm'])))

valid_scenes = []

for item in best_first:
product_id = get_product_id(item['umm'])
n_products = len(list(image_dir.glob(f'{product_id}*')))

if n_products < 15:
print(f'Downloading {product_id}...')
breakpoint()
earthaccess.download([item], image_dir, pqdm_kwargs={'disable': True})
print('Done Downloading...')

fmask_path = image_dir / f'{product_id}.v2.0.Fmask.tif'

assert fmask_path.exists(), f'File not found: {fmask_path}'

qual_da = rioxarray.open_rasterio(fmask_path).rio.clip_box(*roi.bounds, crs='EPSG:4326') # type: ignore
bit_masks = np.unpackbits(qual_da.data[0][..., np.newaxis], axis=-1)

# Looks for a 1 in the 4th, 6th and 7th bit of the Fmask (reverse order). See table 9 and appendix A of:
# https://lpdaac.usgs.gov/documents/1698/HLS_User_Guide_V2.pdf
bad_pixels = (bit_masks[..., 4] == 1) | (bit_masks[..., 6] == 1) | (bit_masks[..., 7] == 1)
Expand All @@ -104,22 +116,38 @@ def get_scenes(
return valid_scenes


def search_for_data(dates: utils.DateRange | list[datetime], bounds: utils.Bounds) -> list:
results = []
if isinstance(dates, utils.DateRange):
results = earthaccess.search_data(
short_name=['HLSL30', 'HLSS30'], bounding_box=bounds, temporal=(dates.start, dates.end + timedelta(days=1))
)
else:
for date in dates:
day_results = earthaccess.search_data(
short_name=['HLSL30', 'HLSS30'], bounding_box=bounds, temporal=(date, date + timedelta(days=1))
)
results.extend(day_results)

return results


def get_hls_data(chip: TerraMindChip, image_dir: Path, opts: utils.ChipDataOpts) -> xr.Dataset:
"""Returns XArray DataArray of a Harmonized Landsat Sentinel-2 image for the given bounds and
closest collection after date.
"""
date_start = opts['date_start']
date_end = opts['date_end'] + timedelta(days=1) # inclusive end
earthaccess.login()
results = earthaccess.search_data(
short_name=['HLSL30', 'HLSS30'], bounding_box=chip.bounds, temporal=(date_start, date_end)
)
assert len(results) > 0, f'No HLS scenes found for chip {chip.name} between {date_start} and {date_end}.'
dates = opts['dates']

results = search_for_data(dates, utils.Bounds(*chip.bounds))
assert len(results) > 0, f'No HLS scenes found for chip {chip.name} {utils.dates_error_msg(dates)}.'

roi = shapely.box(*chip.bounds)
roi_buffered = roi.buffer(0.01)
max_cloud_pct = opts.get('max_cloud_pct', 100)
strategy = opts.get('strategy', 'BEST').upper()
timesteps = get_scenes(results, roi, max_cloud_pct, strategy, image_dir)

timesteps = get_scenes(results, roi_buffered, max_cloud_pct, strategy, image_dir)

template = create_template_da(chip)
timestep_arrays = []
for scene in timesteps:
Expand Down
47 changes: 38 additions & 9 deletions src/satchip/chip_operas1rtc.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,25 +87,54 @@ def get_scenes(groups: list[RTCGroup], roi: shapely.geometry.Polygon, strategy:
return intersecting[:1]
elif strategy == 'ALL':
return intersecting
elif strategy == 'SPECIFIC':
return intersecting
else:
raise ValueError(f'Strategy must be either BEST or ALL. Got {strategy}')


def search_for_data(dates: utils.DateRange | list[datetime], bounds: utils.Bounds) -> list:
earthaccess.login()
results = []

if isinstance(dates, utils.DateRange):
results = earthaccess.search_data(
short_name=['OPERA_L2_RTC-S1_V1'],
bounding_box=bounds,
temporal=(dates.start, dates.end + timedelta(days=1)),
)
else:
for date in dates:
day_results = earthaccess.search_data(
short_name=['OPERA_L2_RTC-S1_V1'], bounding_box=bounds, temporal=(date, date + timedelta(days=1))
)
results.extend(day_results)

return results


def get_operartc_data(chip: TerraMindChip, image_dir: Path, opts: utils.ChipDataOpts) -> xr.Dataset:
"""Returns XArray DataArray of a OPERA S1-RTC for the given chip and selection startegy."""
date_start = opts['date_start']
date_end = opts['date_end'] + timedelta(days=1) # inclusive end
earthaccess.login()
results = earthaccess.search_data(
short_name=['OPERA_L2_RTC-S1_V1'], bounding_box=chip.bounds, temporal=(date_start, date_end)
)
results = filter_to_dualpol(results)
rtc_groups = group_rtcs(results)

dates = opts['dates']

roi = shapely.box(*chip.bounds)
roi_buffered = roi.buffer(0.01)

results = search_for_data(dates, utils.Bounds(*roi_buffered.bounds))
dualpol = filter_to_dualpol(results)

rtc_groups = group_rtcs(dualpol)
strategy = opts.get('strategy', 'BEST').upper()
timesteps = get_scenes(rtc_groups, roi_buffered, strategy)
assert len(timesteps) > 0, f'No OPERA RTC scenes found for chip {chip.name} between {date_start} and {date_end}.'

if len(timesteps) <= 0:
breakpoint()
f'No OPERA RTC scenes found for chip {chip.name} {utils.dates_error_msg(dates)}'

if isinstance(dates, list):
assert len(timesteps) == len(dates)

vrts = [timestep.download(image_dir) for timestep in timesteps]
template = create_template_da(chip)
timestep_arrays = []
Expand Down
58 changes: 40 additions & 18 deletions src/satchip/chip_sentinel2.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def get_scenes(
items = [item for item in items if get_pct_intersect(item.geometry, roi) > 0.95]
best_first = sorted(items, key=lambda x: (-get_pct_intersect(x.geometry, roi), x.datetime))
valid_scenes = []

for item in best_first:
scl_href = item.assets['scl'].href
local_path = fetch_s3_file(scl_href, image_dir)
Expand Down Expand Up @@ -131,6 +132,36 @@ def get_s2_version(item: Item) -> int:
return latest_items


def search_for_data(dates: utils.DateRange | list[datetime], roi: shapely.Polygon) -> list:
results = []
client = Client.open('https://earth-search.aws.element84.com/v1')

if isinstance(dates, utils.DateRange):
date_end = dates.end + timedelta(days=1)
date_range = f'{datetime.strftime(dates.start, "%Y-%m-%d")}/{datetime.strftime(date_end, "%Y-%m-%d")}'
search = client.search(
collections=['sentinel-2-l2a'],
intersects=roi,
datetime=date_range,
max_items=1000,
)

results = list(search.item_collection())
else:
for date in dates:
search = client.search(
collections=['sentinel-2-l2a'],
intersects=roi,
datetime=datetime.strftime(date, '%Y-%m-%d'),
max_items=1000,
)
results.extend(search.item_collection())

results = get_latest_image_versions(results)

return results


def get_s2l2a_data(chip: TerraMindChip, image_dir: Path, opts: utils.ChipDataOpts) -> xr.Dataset:
"""Get XArray DataArray of Sentinel-2 L2A image for the given bounds and best collection parameters.

Expand All @@ -146,28 +177,19 @@ def get_s2l2a_data(chip: TerraMindChip, image_dir: Path, opts: utils.ChipDataOpt
Returns:
XArray Dataset containing the Sentinel-2 L2A image data.
"""
date_start = opts['date_start']
date_end = opts['date_end'] + timedelta(days=1) # inclusive end
date_range = f'{datetime.strftime(date_start, "%Y-%m-%d")}/{datetime.strftime(date_end, "%Y-%m-%d")}'
dates = opts['dates']

roi = shapely.box(*chip.bounds)
roi_buffered = roi.buffer(0.01)
client = Client.open('https://earth-search.aws.element84.com/v1')
search = client.search(
collections=['sentinel-2-l2a'],
intersects=roi,
datetime=date_range,
max_items=1000,
)
assert len(search.item_collection()) > 0, (
f'No Sentinel-2 L2A scenes found for chip {chip.name} between {date_start} and {date_end}.'
)
assert len(search.item_collection()) < 1000, (
'Too many Sentinel-2 L2A scenes found for chip. Please narrow the date range.'
)
items = list(search.item_collection())
items = get_latest_image_versions(items)

items = search_for_data(dates, roi)
max_cloud_pct = opts.get('max_cloud_pct', 100)
strategy = opts.get('strategy', 'BEST')

assert len(items) > 0, (
f'No Sentinel-2 L2A scenes found for chip {chip.name} between {utils.dates_error_msg(dates)}.'
)
assert len(items) < 1000, 'Too many Sentinel-2 L2A scenes found for chip. Please narrow the date range.'
timesteps = get_scenes(items, roi, strategy, max_cloud_pct, image_dir)

urls = [item.assets[band.lower()].href for item in timesteps for band in S2_BANDS.values()]
Expand Down
12 changes: 10 additions & 2 deletions src/satchip/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,25 @@ class Bounds(NamedTuple):
maxy: float


class DateRange(NamedTuple):
start: datetime.datetime
end: datetime.datetime


class ChipDataRequiredOpts(TypedDict):
strategy: str
date_start: datetime.datetime
date_end: datetime.datetime
dates: DateRange | list[datetime.datetime]


class ChipDataOpts(ChipDataRequiredOpts, total=False):
max_cloud_pct: int
local_hyp3_paths: dict[str, list[RtcImageSet]]


def dates_error_msg(dates: DateRange | list[datetime.datetime]) -> str:
return f'between {dates.start} and {dates.end}' if isinstance(dates, DateRange) else f'for dates {dates}'


def get_overall_bounds(bounds: list) -> Bounds:
minx = min([b[0] for b in bounds])
miny = min([b[1] for b in bounds])
Expand Down
Loading