Skip to content

Commit 8ecadd7

Browse files
authored
feat: Adding automated ingestion lambda (#368)
* feat: some steps for cnm ingester * feat: add default collection code * feat: create collection is done * feat: methods for automated ingester is done * feat: add lambda entry point * fix: merge conflicts * feat: update lambda config * fix: unquoting s3 url * fix: not triggering final step debugging * fix: maybe provider is missing * fix: add big upload test * chore: dummy commit * fix: get docker build working again * fix: get github action working again
1 parent 33310ae commit 8ecadd7

File tree

19 files changed

+918
-32
lines changed

19 files changed

+918
-32
lines changed

.github/workflows/makefile.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
with:
2020
python-version: '3.9'
2121
- run: |
22-
python3 "${GITHUB_WORKSPACE}/setup.py" install
22+
python3 -m pip install -r "${GITHUB_WORKSPACE}/requirements.txt"
2323
- run: |
2424
python3 "${GITHUB_WORKSPACE}/setup.py" install_lib
2525
- run: |

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
</div>
1212

13-
<pre align="center">This repository contains source code that handles data ingest, data catalog, data search and data access that complies to OGC DAPA and STAC specifications.</pre>
13+
<pre align="center">This repository contains source code that handles data ingest, data catalog, data search and data access that complies to OGC DAPA and STAC specifications</pre>
1414
<!-- ☝️ Replace with a single sentence describing the purpose of your repo / proj ☝️ -->
1515

1616
<!-- Header block for project -->

ci.cd/Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ upload_lambda:
2121
aws --profile saml-pub s3 cp cumulus_lambda_functions_deployment.zip s3://am-uds-dev-cumulus-tf-state/unity_cumulus_lambda/
2222

2323
upload_lambda_mcp_dev:
24-
aws s3 cp cumulus_lambda_functions_deployment.zip s3://uds-dev-cumulus-public/unity_cumulus_lambda/
24+
aws s3 cp tf-module/unity-cumulus/build/cumulus_lambda_functions_deployment.zip s3://uds-dev-cumulus-public/unity_cumulus_lambda/
2525
update_lambda_function_mcp_dev_6:
2626
aws lambda update-function-code --s3-key unity_cumulus_lambda/cumulus_lambda_functions_deployment.zip --s3-bucket uds-dev-cumulus-public --function-name arn:aws:lambda:us-west-2:237868187491:function:uds-dev-cumulus-metadata_s4pa_generate_cmr --publish &>/dev/null
2727
update_lambda_function_mcp_dev_7:
@@ -49,6 +49,9 @@ update_lambda_function_mcp_sbx_7:
4949
update_lambda_function_mcp_sbx_8:
5050
aws lambda update-function-code --s3-key unity_cumulus_lambda/cumulus_lambda_functions_deployment.zip --s3-bucket uds-dev-cumulus-public --function-name arn:aws:lambda:us-west-2:237868187491:function:uds-sbx-cumulus-granules_to_es --publish &>/dev/null
5151

52+
update_lambda_function_mcp_sbx_ingester:
53+
aws lambda update-function-code --s3-key unity_cumulus_lambda/cumulus_lambda_functions_deployment.zip --s3-bucket uds-dev-cumulus-public --function-name arn:aws:lambda:us-west-2:237868187491:function:uds-sbx-cumulus-granules_cnm_ingester --publish &>/dev/null
54+
5255
mcp_sbx: upload_lambda_mcp_dev update_lambda_function_mcp_sbx_7 update_lambda_function_mcp_sbx_8
5356

5457
mcp_sbx_fastapi: upload_lambda_mcp_dev update_lambda_function_mcp_sbx_uds_api
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
import os
2+
import time
3+
4+
from cumulus_lambda_functions.lib.aws.aws_message_transformers import AwsMessageTransformers
5+
from cumulus_lambda_functions.lib.uds_db.uds_collections import UdsCollections
6+
7+
from cumulus_lambda_functions.stage_in_out.stage_in_out_utils import StageInOutUtils
8+
9+
from cumulus_lambda_functions.uds_api.dapa.collections_dapa_cnm import CollectionsDapaCnm
10+
11+
from cumulus_lambda_functions.cumulus_stac.unity_collection_stac import UnityCollectionStac
12+
from cumulus_lambda_functions.uds_api.dapa.collections_dapa_creation import CollectionDapaCreation
13+
from cumulus_lambda_functions.cumulus_stac.item_transformer import ItemTransformer
14+
from pystac import ItemCollection, Item
15+
from cumulus_lambda_functions.lib.utils.file_utils import FileUtils
16+
from cumulus_lambda_functions.lib.lambda_logger_generator import LambdaLoggerGenerator
17+
from cumulus_lambda_functions.lib.aws.aws_s3 import AwsS3
18+
19+
LOGGER = LambdaLoggerGenerator.get_logger(__name__, LambdaLoggerGenerator.get_level_from_env())
20+
21+
"""
22+
TODO
23+
24+
UNITY_DEFAULT_PROVIDER
25+
CUMULUS_WORKFLOW_NAME
26+
REPORT_TO_EMS
27+
CUMULUS_WORKFLOW_SQS_URL
28+
CUMULUS_LAMBDA_PREFIX
29+
ES_URL
30+
ES_PORT
31+
SNS_TOPIC_ARN
32+
"""
33+
class GranulesCnmIngesterLogic:
    """Automated CNM (Cloud Notification Mechanism) ingestion workflow.

    Triggered by an S3 notification delivered through SNS -> SQS. The flow is:
    load the "successful features" STAC ItemCollection JSON from S3, verify
    every asset referenced by every granule actually exists in S3, create the
    target collection if it does not exist yet, then queue CNM messages in
    chunks for ingestion.

    Required environment variables (see module TODO): UNITY_DEFAULT_PROVIDER,
    ES_URL, ES_PORT (defaults to 443).
    """

    def __init__(self):
        self.__s3 = AwsS3()
        # Raw dict form of the ItemCollection, kept alongside the parsed object
        # because send_cnm_msg() chunks the raw 'features' list directly.
        self.__successful_features_json = None
        self.__successful_features: ItemCollection = None
        self.__collection_id = None
        self.__chunk_size = StageInOutUtils.CATALOG_DEFAULT_CHUNK_SIZE
        if 'UNITY_DEFAULT_PROVIDER' not in os.environ:
            raise ValueError('missing UNITY_DEFAULT_PROVIDER')
        self.__default_provider = os.environ.get('UNITY_DEFAULT_PROVIDER')
        self.__uds_collection = UdsCollections(es_url=os.getenv('ES_URL'), es_port=int(os.getenv('ES_PORT', '443')))

    @property
    def successful_features_json(self):
        return self.__successful_features_json

    @successful_features_json.setter
    def successful_features_json(self, val):
        """
        :param val: raw ItemCollection dict
        :return: None
        """
        self.__successful_features_json = val
        return

    @property
    def collection_id(self):
        return self.__collection_id

    @collection_id.setter
    def collection_id(self, val):
        """
        :param val: collection identifier string
        :return: None
        """
        self.__collection_id = val
        return

    @property
    def successful_features(self):
        return self.__successful_features

    @successful_features.setter
    def successful_features(self, val):
        """
        :param val: pystac ItemCollection
        :return: None
        """
        self.__successful_features = val
        return

    def load_successful_features_s3(self, successful_features_s3_url):
        """Download and parse the successful-features JSON from S3.

        Populates both successful_features_json (raw dict) and
        successful_features (pystac ItemCollection). The temporary local copy
        is removed after parsing.

        :param successful_features_s3_url: s3://bucket/key URL of the JSON file
        :raises ValueError: if the object does not exist in S3
        """
        self.__s3.set_s3_url(successful_features_s3_url)
        if not self.__s3.exists(self.__s3.target_bucket, self.__s3.target_key):
            LOGGER.error(f'missing successful_features: {successful_features_s3_url}')
            raise ValueError(f'missing successful_features: {successful_features_s3_url}')
        local_successful_features = self.__s3.download('/tmp')
        self.__successful_features_json = FileUtils.read_json(local_successful_features)
        FileUtils.remove_if_exists(local_successful_features)
        self.__successful_features = ItemCollection.from_dict(self.__successful_features_json)
        return

    def validate_granules(self):
        """Verify that every asset of every granule exists in S3.

        :raises RuntimeError: if load_successful_features_s3 was not called first
        :raises ValueError: listing every granule with missing assets
        """
        if self.successful_features is None:
            raise RuntimeError('NULL successful_features')
        missing_granules = []
        for each_granule in self.successful_features.items:
            missing_assets = []
            for each_asset_name, each_asset in each_granule.assets.items():
                temp_bucket, temp_key = self.__s3.split_s3_url(each_asset.href)
                if not self.__s3.exists(temp_bucket, temp_key):
                    missing_assets.append({each_asset_name: each_asset.href})
            if len(missing_assets) > 0:
                missing_granules.append({
                    'granule_id': each_granule.id,
                    'missing_assets': missing_assets
                })
        if len(missing_granules) > 0:
            LOGGER.error(f'missing_granules: {missing_granules}')
            raise ValueError(f'missing_granules: {missing_granules}')
        return

    def extract_collection_id(self):
        """Take the collection id from the first granule.

        Leaves collection_id as None when there are no granules; callers
        (start) treat that as "nothing to ingest".

        :raises RuntimeError: if load_successful_features_s3 was not called first
        """
        if self.successful_features is None:
            raise RuntimeError('NULL successful_features')
        if len(self.successful_features.items) < 1:
            LOGGER.error(f'not required to process. No Granules: {self.successful_features.to_dict(False)}')
            return
        # presumably all granules in one notification share a collection — TODO confirm
        self.collection_id = self.successful_features.items[0].collection_id
        return

    def has_collection(self):
        """Return True if collection_id already exists in the UDS collections DB."""
        uds_collection_result = self.__uds_collection.get_collection(self.collection_id)
        return len(uds_collection_result) > 0

    def create_collection(self):
        """Create the collection in DAPA if it does not already exist.

        :raises RuntimeError: if collection_id is not set, or creation returns an error status
        :raises ValueError: if the collection is still missing after creation
        """
        if self.collection_id is None:
            raise RuntimeError('NULL collection_id')
        if self.has_collection():
            LOGGER.debug(f'{self.collection_id} already exists. continuing..')
            return
        # ref: https://github.com/unity-sds/unity-py/blob/0.4.0/unity_sds_client/services/data_service.py
        # NOTE(review): granule-id regexes and file types below are hard-coded
        # test_file.* placeholders — confirm they are intended as defaults.
        dapa_collection = UnityCollectionStac() \
            .with_id(self.collection_id) \
            .with_graule_id_regex("^test_file.*$") \
            .with_granule_id_extraction_regex("(^test_file.*)(\\.nc|\\.nc\\.cas|\\.cmr\\.xml)") \
            .with_title(f'Collection: {self.collection_id}') \
            .with_process('stac') \
            .with_provider(self.__default_provider) \
            .add_file_type("test_file01.nc", "^test_file.*\\.nc$", 'unknown_bucket', 'application/json', 'root') \
            .add_file_type("test_file01.nc", "^test_file.*\\.nc$", 'protected', 'data', 'item') \
            .add_file_type("test_file01.nc.cas", "^test_file.*\\.nc.cas$", 'protected', 'metadata', 'item') \
            .add_file_type("test_file01.nc.cmr.xml", "^test_file.*\\.nc.cmr.xml$", 'protected', 'metadata', 'item') \
            .add_file_type("test_file01.nc.stac.json", "^test_file.*\\.nc.stac.json$", 'protected', 'metadata', 'item')

        stac_collection = dapa_collection.start()
        creation_result = CollectionDapaCreation(stac_collection).create()
        if creation_result['statusCode'] >= 400:
            raise RuntimeError(f'failed to create collection: {self.collection_id}. details: {creation_result["body"]}')
        time.sleep(3)  # cool off period before checking DB
        if not self.has_collection():
            LOGGER.error(f'missing collection. (failed to create): {self.collection_id}')
            raise ValueError(f'missing collection. (failed to create): {self.collection_id}')
        return

    def send_cnm_msg(self):
        """Queue CNM messages for all features, in chunks of __chunk_size.

        A failing chunk is recorded and the remaining chunks still run; the
        accumulated failures are raised at the end.

        :raises ValueError: if chunking/iteration itself fails
        :raises RuntimeError: listing all features that failed to queue
        """
        LOGGER.debug('starting ingest_cnm_dapa_actual')
        errors = []
        try:
            for i, features_chunk in enumerate(StageInOutUtils.chunk_list(self.successful_features_json['features'], self.__chunk_size)):
                try:
                    LOGGER.debug(f'working on chunk_index {i}')
                    dapa_body = {
                        "provider_id": self.__default_provider,
                        "features": features_chunk
                    }
                    collections_dapa_cnm = CollectionsDapaCnm(dapa_body)
                    cnm_result = collections_dapa_cnm.start()
                    if cnm_result['statusCode'] != 200:
                        errors.extend(features_chunk)
                except Exception:
                    LOGGER.exception(f'failed to queue CNM process.')
                    errors.extend(features_chunk)
        except Exception as e:
            LOGGER.exception('failed to ingest to CNM')
            raise ValueError(f'failed to ingest to CNM: {e}')
        if len(errors) > 0:
            raise RuntimeError(f'failures during CNM ingestion: {errors}')
        return

    def start(self, event):
        """Entry point: run the full ingestion flow for one SQS event.

        :param event: raw SQS lambda event wrapping an SNS S3 notification
        """
        LOGGER.debug(f'event: {event}')
        sns_msg = AwsMessageTransformers().sqs_sns(event)
        s3_details = AwsMessageTransformers().get_s3_from_sns(sns_msg)
        s3_url = f's3://{s3_details["bucket"]}/{s3_details["key"]}'
        self.load_successful_features_s3(s3_url)
        self.validate_granules()
        self.extract_collection_id()
        if self.collection_id is None:
            # No granules in the feature set: extract_collection_id already
            # logged it. Previously execution fell through to create_collection
            # and crashed with RuntimeError('NULL collection_id'); an empty
            # notification is not an error, so stop here.
            return
        self.create_collection()
        self.send_cnm_msg()
        return
Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import json
2+
3+
from cumulus_lambda_functions.granules_cnm_ingester.granules_cnm_ingester_logic import GranulesCnmIngesterLogic
24
from cumulus_lambda_functions.lib.lambda_logger_generator import LambdaLoggerGenerator
35

46

@@ -7,7 +9,8 @@ def lambda_handler(event, context):
79
:param event:
810
:param context:
911
:return:
12+
{'Records': [{'messageId': '6ff7c6fd-4053-4ab4-bc12-c042be1ed275', 'receiptHandle': 'AQEBYASiFPjQT5JBI2KKCTF/uQhHfJt/tHhgucslQQdvkNVxcXCNi2E5Ux4U9N0eu7RfvlnvtycjUh0gdL7jIeoyH+VRKSF61uAJuT4p31BsNe0GYu49N9A6+kxjP/RrykR7ZofmQRdHToX1ugRc76SMRic4H/ZZ89YAHA2QeomJFMrYywIxlk8OAzYaBf2dQI7WexjY5u1CW00XNMbTGyTo4foVPxcSn6bdFpfgxW/L7yJMX/0YQvrA9ruiuQ+lrui+6fWYh5zEk3f5v1bYtUQ6DtyyfbtMHZQJTJpUlWAFRzzN+3melilH7FySyOGDXhPb0BOSzmdKq9wBbfLW/YPb7l99ejq4GfRfj8LyI4EtB96vTeUw4LCgUqbZcBrxbGBLUXMacweh+gCjHav9ylqr2SeOiqG3vWPq9pwFYQIDqNE=', 'body': '{\n "Type" : "Notification",\n "MessageId" : "33e1075a-435c-5217-a33d-59fae85e19b2",\n "TopicArn" : "arn:aws:sns:us-west-2:237868187491:uds-sbx-cumulus-granules_cnm_ingester",\n "Subject" : "Amazon S3 Notification",\n "Message" : "{\\"Service\\":\\"Amazon S3\\",\\"Event\\":\\"s3:TestEvent\\",\\"Time\\":\\"2024-04-22T18:13:22.416Z\\",\\"Bucket\\":\\"uds-sbx-cumulus-staging\\",\\"RequestId\\":\\"DQ4T0GRVFPSX45C9\\",\\"HostId\\":\\"gHBFnYNmfnGDZBmqoQwA3RScjtjBk5lr426moGxu8IDpe5UhWAqNTxHqilWBoPN1njzIrzNrf8c=\\"}",\n "Timestamp" : "2024-04-22T18:13:22.434Z",\n "SignatureVersion" : "1",\n "Signature" : "RvSxqpU7J7CCJXbin9cXqTxzjMjgAUFtk/n454mTMcOe5x3Ay1w4AHfzyeYQCFBdLHNBa8n3OdMDoDlJqyVQMb8k+nERaiZWN2oqFVDRqT9pqSr89b+4FwlhPv6TYy2pBa/bgjZ4cOSYsey1uSQ3hjl0idfssvuV5cCRxQScbA+yu8Gcv9K7Oqgy01mC0sDHiuPIifhFXxupG5ygbjqoHIB+1gdMEbBwyixoY5GOpHM/O2uHNF+dJDjax1WMxQ2FzVjiFeCa+tNcjovF059+tx2v1YmDq/kEAFrN6DAtP6R4zKag62P9jkvjU/wHYJ2jjXmZAqoG+nuzAo24HiZPSw==",\n "SigningCertURL" : "https://sns.us-west-2.amazonaws.com/SimpleNotificationService-60eadc530605d63b8e62a523676ef735.pem",\n "UnsubscribeURL" : "https://sns.us-west-2.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-west-2:237868187491:uds-sbx-cumulus-granules_cnm_ingester:76cbefa1-addf-45c2-97e1-ae16986b195b"\n}', 'attributes': {'ApproximateReceiveCount': '1', 'SentTimestamp': '1713809602474', 'SenderId': 'AIDAIYLAVTDLUXBIEIX46', 'ApproximateFirstReceiveTimestamp': '1713809602483'}, 
'messageAttributes': {}, 'md5OfBody': 'c6d06d1b742ad5bd2cfe5f542640aad2', 'eventSource': 'aws:sqs', 'eventSourceARN': 'arn:aws:sqs:us-west-2:237868187491:uds-sbx-cumulus-granules_cnm_ingester', 'awsRegion': 'us-west-2'}]}
1013
"""
1114
LambdaLoggerGenerator.remove_default_handlers()
12-
print(f'event: {event}')
13-
raise NotImplementedError('Require implementation later')
15+
GranulesCnmIngesterLogic().start(event)
16+
return

cumulus_lambda_functions/lib/aws/aws_message_transformers.py

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
from urllib.parse import unquote
23

34
from cumulus_lambda_functions.lib.json_validator import JsonValidator
45

@@ -29,8 +30,15 @@ class AwsMessageTransformers:
2930
"Type": {"type": "string"},
3031
"MessageId": {"type": "string"},
3132
"TopicArn": {"type": "string"},
33+
"Subject": {"type": "string"},
34+
"Timestamp": {"type": "string"},
35+
"SignatureVersion": {"type": "string"},
36+
"Signature": {"type": "string"},
37+
"SigningCertURL": {"type": "string"},
38+
"UnsubscribeURL": {"type": "string"},
3239
"Message": {"type": "string"},
33-
}
40+
},
41+
"required": ["Message"]
3442
}
3543

3644
S3_RECORD_SCHEMA = {
@@ -41,22 +49,25 @@ class AwsMessageTransformers:
4149
'maxItems': 1,
4250
'items': {
4351
'type': 'object',
44-
'properties': {'s3': {
45-
'type': 'object',
46-
'properties': {
47-
'bucket': {
48-
'type': 'object',
49-
'properties': {'name': {'type': 'string', 'minLength': 1}},
50-
'required': ['name']
51-
},
52-
'object': {
53-
'type': 'object',
54-
'properties': {'key': {'type': 'string', 'minLength': 1}},
55-
'required': ['key']
56-
}},
57-
'required': ['bucket', 'object']
58-
}},
59-
'required': ['s3']
52+
'properties': {
53+
'eventName': {'type': 'string'},
54+
's3': {
55+
'type': 'object',
56+
'properties': {
57+
'bucket': {
58+
'type': 'object',
59+
'properties': {'name': {'type': 'string', 'minLength': 1}},
60+
'required': ['name']
61+
},
62+
'object': {
63+
'type': 'object',
64+
'properties': {'key': {'type': 'string', 'minLength': 1}},
65+
'required': ['key']
66+
}},
67+
'required': ['bucket', 'object']
68+
}
69+
},
70+
'required': ['eventName', 's3']
6071
}
6172
}},
6273
'required': ['Records']
@@ -74,3 +85,14 @@ def sqs_sns(self, raw_msg: json):
7485
sns_msg_body = sqs_msg_body['Message']
7586
sns_msg_body = json.loads(sns_msg_body)
7687
return sns_msg_body
88+
89+
def get_s3_from_sns(self, sns_msg_body):
    """Extract the S3 event summary from a parsed SNS message body.

    :param sns_msg_body: dict expected to match S3_RECORD_SCHEMA
        (Records[0] must carry eventName and s3.bucket/object details)
    :return: dict with 'eventName', 'bucket', and URL-decoded 'key'
    :raises ValueError: if the body does not match S3_RECORD_SCHEMA
    """
    result = JsonValidator(self.S3_RECORD_SCHEMA).validate(sns_msg_body)
    if result is not None:
        # Bug fix: the previous message blamed SQS_MSG_SCHEMA, but this
        # method validates against S3_RECORD_SCHEMA.
        raise ValueError(f'sns_msg did not pass S3_RECORD_SCHEMA: {result}')
    s3_record = sns_msg_body['Records'][0]
    s3_summary = {
        'eventName': s3_record['eventName'],
        'bucket': s3_record['s3']['bucket']['name'],
        # S3 event notifications URL-encode keys and use '+' for spaces;
        # decode back to the real object key.
        'key': unquote(s3_record['s3']['object']['key'].replace('+', ' ')),
    }
    return s3_summary

cumulus_lambda_functions/lib/aws/aws_s3.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ def __upload_to_s3(self, bucket, prefix, file_path, delete_files=False, add_size
5858
raise e
5959
return f's3://{bucket}/{s3_key}'
6060

61+
def exists(self, base_path: str, relative_path: str):
    """Check whether an object exists in S3 via a HEAD request.

    :param base_path: bucket name
    :param relative_path: object key
    :return: True if HEAD succeeds; False on any failure (missing key,
        no access, transient error — all treated as "not found")
    """
    try:
        # Bug fix: was a bare `except:` (also swallowed KeyboardInterrupt /
        # SystemExit) and bound an unused `response` local.
        self.__s3_client.head_object(Bucket=base_path, Key=relative_path)
    except Exception:
        return False
    return True
67+
6168
def upload(self, file_path: str, base_path: str, relative_parent_path: str, delete_files: bool,
6269
s3_name: Union[str, None] = None, obj_tags: dict = {}, overwrite: bool = False):
6370
s3_url = self.__upload_to_s3(base_path, relative_parent_path, file_path, delete_files, True, obj_tags, s3_name)

cumulus_lambda_functions/lib/uds_db/uds_collections.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,25 @@ def add_collection(self, collection_id: str, start_time: int, end_time: int, bbo
7070
self.__es.index_one(indexing_dict, collection_id, DBConstants.collections_index)
7171
return self
7272

73+
def get_collection(self, collection_id: str):
    """Fetch collection documents whose id exactly matches collection_id.

    :param collection_id: collection identifier (exact term match)
    :return: list of matching '_source' documents; empty list when none match
    """
    exact_match = {'term': {DBConstants.collection_id: {'value': collection_id}}}
    dsl = {
        'size': 20,
        'query': {
            'bool': {
                'must': [exact_match]
            }
        },
        'sort': [
            {DBConstants.collection_id: {'order': 'asc'}}
        ]
    }
    LOGGER.debug(f'authorized_collection_ids_dsl: {dsl}')
    query_result = self.__es.query(dsl, DBConstants.collections_index)
    return [hit['_source'] for hit in query_result['hits']['hits']]
91+
7392
def get_collections(self, collection_regex: list):
7493
# temp_dsl = {
7594
# 'query': {'match_all': {}},

0 commit comments

Comments
 (0)