Skip to content

Commit b67aa67

Browse files
authored
feat: Checkout stage with STAC catalog json (#138)
* feat: reduce pystac length by keeping only data asset * chore: update version * feat: pagination for cumulus granules search * feat: add pagination for CMR + bugfix in Cumulus pagination * chore: update version * fix: add filter stac items to assets only * feat: upload by stac catalog * chore: update version
1 parent 2e2fee1 commit b67aa67

File tree

8 files changed

+540
-6
lines changed

8 files changed

+540
-6
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [3.5.0] - 2023-04-18
9+
### Changed
10+
- [#138](https://github.com/unity-sds/unity-data-services/pull/138) feat: Checkout stage with STAC catalog json
11+
812
## [3.4.0] - 2023-04-17
913
### Changed
1014
- [#132](https://github.com/unity-sds/unity-data-services/pull/132) feat: add DAAC download logic
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from pystac import Catalog, Item, Asset
2+
3+
from cumulus_lambda_functions.lib.utils.file_utils import FileUtils
4+
5+
6+
class GranulesCatalog:
    """Helpers for reading and updating granule STAC catalogs/items on local disk.

    Thin wrapper around ``pystac`` objects; JSON files are loaded via the
    project's ``FileUtils`` helper.
    """

    def get_child_link_hrefs(self, catalog_file_path: str):
        """Return the href of every ``rel='child'`` link in a STAC catalog JSON file.

        Raises ValueError when the file does not exist.
        """
        if not FileUtils.file_exist(catalog_file_path):
            raise ValueError(f'missing file: {catalog_file_path}')
        parsed_catalog = Catalog.from_dict(FileUtils.read_json(catalog_file_path))
        return [child_link.href for child_link in parsed_catalog.get_links(rel='child')]

    def get_granules_item(self, granule_stac_json) -> Item:
        """Load a single granule STAC Item from a JSON file path.

        Raises ValueError when the file does not exist.
        """
        if not FileUtils.file_exist(granule_stac_json):
            raise ValueError(f'missing file: {granule_stac_json}')
        return Item.from_dict(FileUtils.read_json(granule_stac_json))

    def extract_assets_href(self, granules_stac: Item) -> dict:
        """Return a mapping of asset key -> asset href for all assets of the item."""
        return {asset_key: asset.href for asset_key, asset in granules_stac.get_assets().items()}

    def update_assets_href(self, granules_stac: Item, new_assets: dict):
        """Overwrite (or add) asset hrefs on the item from ``new_assets`` (key -> href).

        Existing assets keep all other fields; unknown keys become new assets.
        Returns ``self`` for chaining.
        """
        for asset_key, new_href in new_assets.items():
            if asset_key in granules_stac.assets:
                granules_stac.assets.get(asset_key).href = new_href
            else:
                # NOTE(review): Asset(href, title) — the asset key is also stored
                # as the asset title here; confirm that is intended.
                granules_stac.add_asset(asset_key, Asset(new_href, asset_key))
        return self
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import json
2+
3+
from cumulus_lambda_functions.cumulus_stac.granules_catalog import GranulesCatalog
4+
from cumulus_lambda_functions.stage_in_out.search_collections_factory import SearchCollectionsFactory
5+
from cumulus_lambda_functions.stage_in_out.upload_granules_abstract import UploadGranulesAbstract
6+
import logging
7+
import os
8+
import re
9+
from cumulus_lambda_functions.lib.aws.aws_s3 import AwsS3
10+
11+
LOGGER = logging.getLogger(__name__)
12+
13+
14+
class UploadGranulesByCatalogS3(UploadGranulesAbstract):
    """Upload granule files described by a local STAC catalog to a staging S3 bucket.

    Reads a STAC catalog file (env ``CATALOG_FILE``), walks its child items,
    uploads each item's assets to ``STAGING_BUCKET`` under
    ``<collection_id>:<granule_id>``, rewrites the asset hrefs to the S3 URLs,
    and returns a list of granule dicts ready for a DAPA ingest body.
    """

    # Environment-variable key names used by __set_props_from_env / upload.
    CATALOG_FILE = 'CATALOG_FILE'
    COLLECTION_ID_KEY = 'COLLECTION_ID'
    STAGING_BUCKET_KEY = 'STAGING_BUCKET'
    GRANULES_SEARCH_DOMAIN = 'GRANULES_SEARCH_DOMAIN'

    VERIFY_SSL_KEY = 'VERIFY_SSL'
    DELETE_FILES_KEY = 'DELETE_FILES'

    def __init__(self) -> None:
        super().__init__()
        self.__gc = GranulesCatalog()
        self.__collection_id = ''
        self.__collection_details = {}
        self.__staging_bucket = ''
        self.__verify_ssl = True
        self.__delete_files = False
        self.__s3 = AwsS3()

    def __set_props_from_env(self):
        """Populate instance settings from the environment.

        Raises ValueError when any required key is absent. Returns ``self``.
        """
        missing_keys = [k for k in [self.CATALOG_FILE, self.COLLECTION_ID_KEY, self.GRANULES_SEARCH_DOMAIN, self.STAGING_BUCKET_KEY] if k not in os.environ]
        if len(missing_keys) > 0:
            raise ValueError(f'missing environment keys: {missing_keys}')

        self.__collection_id = os.environ.get(self.COLLECTION_ID_KEY)
        self.__staging_bucket = os.environ.get(self.STAGING_BUCKET_KEY)

        # Flags default to VERIFY_SSL=TRUE and DELETE_FILES=FALSE; comparison is
        # case/whitespace-insensitive.
        self.__verify_ssl = os.environ.get(self.VERIFY_SSL_KEY, 'TRUE').strip().upper() == 'TRUE'
        self.__delete_files = os.environ.get(self.DELETE_FILES_KEY, 'FALSE').strip().upper() == 'TRUE'
        return self

    def upload(self, **kwargs) -> list:
        """Upload all granules from the catalog and return the DAPA granule list.

        Per-granule failures are logged and collected; they do not abort the run.
        """
        self.__set_props_from_env()
        # Fix: use the class constant rather than a hard-coded key with a bogus
        # 'MISSING_...' default — __set_props_from_env already guarantees the
        # variable is present.
        self.__collection_details = SearchCollectionsFactory().get_class(os.environ.get(self.GRANULES_SEARCH_DOMAIN)).search()
        self.__collection_details = json.loads(self.__collection_details)

        # Regex used to derive the granule id from the data asset's filename.
        granule_id_extraction = self.__collection_details['summaries']['granuleIdExtraction'][0]
        child_links = self.__gc.get_child_link_hrefs(os.environ.get(self.CATALOG_FILE))
        errors = []
        dapa_body_granules = []
        for each_child in child_links:
            try:
                current_granule_stac = self.__gc.get_granules_item(each_child)
                current_assets = self.__gc.extract_assets_href(current_granule_stac)
                if 'data' not in current_assets:  # only granules with a 'data' asset are uploadable
                    LOGGER.warning(f'skipping {each_child}. no data in {current_assets}')
                    continue

                current_granule_id = re.findall(granule_id_extraction, os.path.basename(current_assets['data']))
                if len(current_granule_id) < 1:
                    LOGGER.warning(f'skipping {each_child}. cannot be matched to granule_id: {current_granule_id}')
                    continue
                current_granule_id = current_granule_id[0]

                updating_assets = {}
                uploading_current_granule_stac = None
                for asset_type, asset_href in current_assets.items():

                    LOGGER.debug(f'uploading {asset_type}, {asset_href}')
                    s3_url = self.__s3.upload(asset_href, self.__staging_bucket, f'{self.__collection_id}:{current_granule_id}', self.__delete_files)
                    if asset_href == each_child:
                        # The item's own STAC json is one of the assets; remember its
                        # S3 location so the updated metadata can be re-uploaded below.
                        uploading_current_granule_stac = s3_url
                    updating_assets[asset_type] = s3_url
                self.__gc.update_assets_href(current_granule_stac, updating_assets)
                current_granule_stac.id = current_granule_id
                current_granule_stac.collection_id = self.__collection_id
                if uploading_current_granule_stac is not None:  # upload metadata file again
                    self.__s3.set_s3_url(uploading_current_granule_stac)
                    self.__s3.upload_bytes(json.dumps(current_granule_stac.to_dict(False, False)).encode())
                dapa_body_granules.append({
                    'id': f'{self.__collection_id}:{current_granule_id}',
                    'collection': self.__collection_id,
                    'assets': {k: v.to_dict() for k, v in current_granule_stac.assets.items()},
                })
            except Exception as e:
                LOGGER.exception(f'error while processing: {each_child}')
                errors.append({'href': each_child, 'error': str(e)})

        if len(errors) > 0:
            LOGGER.error(f'some errors while uploading granules: {errors}')
        LOGGER.debug(f'dapa_body_granules: {dapa_body_granules}')
        return dapa_body_granules
Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1-
2-
31
class UploadGranulesFactory:
    """Factory that maps an upload-type string to a granule uploader instance."""

    # Supported upload types.
    S3 = 'S3'
    CATALOG_S3 = 'CATALOG_S3'

    def get_class(self, upload_type):
        """Return the uploader implementation for ``upload_type``.

        Imports are deferred so only the selected implementation's dependencies
        are loaded. Raises ValueError for an unrecognized type.
        """
        if upload_type == UploadGranulesFactory.S3:
            from cumulus_lambda_functions.stage_in_out.upload_granules_s3 import UploadGranulesS3
            return UploadGranulesS3()
        if upload_type == UploadGranulesFactory.CATALOG_S3:
            from cumulus_lambda_functions.stage_in_out.upload_granules_by_catalog_s3 import UploadGranulesByCatalogS3
            return UploadGranulesByCatalogS3()
        # Fix: message previously said 'unknown search_type' (copy-paste error).
        raise ValueError(f'unknown upload_type: {upload_type}')

docker/stage-in-stage-out/dc-003-upload.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ services:
2828
DELETE_FILES: 'FALSE'
2929

3030
GRANULES_SEARCH_DOMAIN: 'UNITY'
31-
GRANULES_UPLOAD_TYPE: 'S3'
32-
UPLOAD_DIR: '/etc/snpp_upload_test_1'
31+
GRANULES_UPLOAD_TYPE: 'S3 or CATALOG_S3'
32+
UPLOAD_DIR: '/etc/snpp_upload_test_1 or empty string'
33+
CATALOG_FILE: 'empty string or /path/to/stac/catalog file'
3334
LOG_LEVEL: '20'
3435
networks:
3536
- internal

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
setup(
2020
name="cumulus_lambda_functions",
21-
version="3.4.0",
21+
version="3.5.0",
2222
packages=find_packages(),
2323
install_requires=install_requires,
2424
tests_require=['mock', 'nose', 'sphinx', 'sphinx_rtd_theme', 'coverage', 'pystac', 'python-dotenv', 'jsonschema'],

0 commit comments

Comments
 (0)