Skip to content

Commit 961604d

Browse files
authored
feat: granules query pagination (#131)
* feat: reduce pystac length by keeping only data asset * chore: update version * feat: pagination for cumulus granules search * feat: add pagination for CMR + bugfix in Cumulus pagination * chore: update version * fix: add filter stac items to assets only * fix: add missing ENV in sample yaml
1 parent 1d5a0ca commit 961604d

File tree

7 files changed

+224
-26
lines changed

7 files changed

+224
-26
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [3.2.0] - 2023-01-23
9+
### Added
10+
- [#131](https://github.com/unity-sds/unity-data-services/pull/131) granules query pagination
11+
812
## [3.1.0] - 2023-01-23
913
### Added
1014
- [#126](https://github.com/unity-sds/unity-data-services/pull/126) reduce pystac length by keeping only data asset

cumulus_lambda_functions/cumulus_dapa_client/dapa_client.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,22 @@ def get_collection(self, collection_id: str):
6060
raise RuntimeError(f'unable to find collection in DAPA')
6161
return collection_details[0]
6262

63+
def get_all_granules(self, collection_id='*', limit=-1, date_from='', date_to=''):
64+
results = []
65+
page_size = 100 if limit < 0 or limit > 100 else limit
66+
offset = 0
67+
while True:
68+
if 0 < limit <= len(results):
69+
break
70+
temp_results = self.get_granules(collection_id, page_size, offset, date_from, date_to)
71+
offset += len(temp_results)
72+
results.extend(temp_results)
73+
if len(temp_results) < page_size:
74+
break
75+
if limit < 0 or limit >= len(results):
76+
return results
77+
return results[0: limit]
78+
6379
def get_granules(self, collection_id='*', limit=1000, offset=0, date_from='', date_to=''):
6480
"""
6581
TODO: pagination. getting only 1st 1k item

cumulus_lambda_functions/stage_in_out/search_granules_cmr.py

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import requests
66

7+
from cumulus_lambda_functions.cumulus_stac.stac_utils import StacUtils
78
from cumulus_lambda_functions.stage_in_out.search_granules_abstract import SearchGranulesAbstract
89

910
LOGGER = logging.getLogger(__name__)
@@ -19,14 +20,18 @@ class SearchGranulesCmr(SearchGranulesAbstract):
1920
DATE_TO_KEY = 'DATE_TO'
2021
VERIFY_SSL_KEY = 'VERIFY_SSL'
2122

23+
FILTER_ONLY_ASSETS = 'FILTER_ONLY_ASSETS'
24+
2225
def __init__(self) -> None:
2326
super().__init__()
2427
self.__collection_id = ''
2528
self.__date_from = ''
2629
self.__date_to = ''
2730
self.__limit = 1000
31+
self.__page_size = 2000 # page_size - number of results per page - default is 10, max is 2000
2832
self.__verify_ssl = True
2933
self.__cmr_base_url = ''
34+
self.__filter_results = True
3035

3136
def __set_props_from_env(self):
3237
missing_keys = [k for k in [self.COLLECTION_ID_KEY, self.CMR_BASE_URL_KEY] if k not in os.environ]
@@ -45,14 +50,24 @@ def __set_props_from_env(self):
4550
self.__date_from = os.environ.get(self.DATE_FROM_KEY, '')
4651
self.__date_to = os.environ.get(self.DATE_TO_KEY, '')
4752
self.__verify_ssl = os.environ.get(self.VERIFY_SSL_KEY, 'TRUE').strip().upper() == 'TRUE'
53+
self.__filter_results = os.environ.get(self.FILTER_ONLY_ASSETS, 'TRUE').strip().upper() == 'TRUE'
4854
return self
4955

56+
def __get_correct_result_count(self, results):
57+
if self.__limit < 0 or self.__limit >= len(results):
58+
return results
59+
return results[0: self.__limit]
60+
5061
def search(self, **kwargs) -> str:
5162
"""
5263
curl 'https://cmr.earthdata.nasa.gov/search/granules.stac' \
5364
-H 'accept: application/json; profile=stac-catalogue' \
5465
-H 'content-type: application/x-www-form-urlencoded' \
5566
--data-raw 'collection_concept_id=C1649553296-PODAAC&page_num=1&page_size=20&temporal[]=2011-08-01T00:00:00,2011-09-01T00:00:00'
67+
68+
https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#stac
69+
https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#query-parameters
70+
5671
:param kwargs:
5772
:return:
5873
"""
@@ -61,17 +76,28 @@ def search(self, **kwargs) -> str:
6176
'accept': 'application/json; profile=stac-catalogue',
6277
'Content-Type': 'application/x-www-form-urlencoded',
6378
}
64-
request_body = {
65-
'collection_concept_id': self.__collection_id,
66-
'page_num': '1',
67-
'page_size': str(self.__limit),
68-
'temporal[]': f'{self.__date_from},{self.__date_to}'
69-
}
70-
cmr_granules_url = f'{self.__cmr_base_url}search/granules.stac'
71-
response = requests.post(url=cmr_granules_url, headers=header, verify=self.__verify_ssl,
72-
data=request_body)
73-
if response.status_code > 400:
74-
raise RuntimeError(
75-
f'Cognito ends in error. status_code: {response.status_code}. url: {cmr_granules_url}. details: {response.text}')
76-
response = json.loads(response.content.decode('utf-8'))
77-
return json.dumps(response['features'])
79+
results = []
80+
page_size = self.__page_size if self.__limit < 0 or self.__limit > self.__page_size else self.__limit
81+
page_num = 1
82+
while True:
83+
if 0 < self.__limit <= len(results):
84+
break
85+
request_body = {
86+
'collection_concept_id': self.__collection_id,
87+
'page_num': str(page_num),
88+
'page_size': str(page_size),
89+
# 'temporal[]': f'{self.__date_from},{self.__date_to}'
90+
}
91+
cmr_granules_url = f'{self.__cmr_base_url}search/granules.stac'
92+
response = requests.post(url=cmr_granules_url, headers=header, verify=self.__verify_ssl,
93+
data=request_body)
94+
if response.status_code > 400:
95+
raise RuntimeError(
96+
f'Cognito ends in error. status_code: {response.status_code}. url: {cmr_granules_url}. details: {response.text}')
97+
temp_results = json.loads(response.content.decode('utf-8'))['features']
98+
page_num += 1
99+
results.extend(temp_results)
100+
if len(temp_results) < page_size:
101+
break
102+
results = self.__get_correct_result_count(results)
103+
return json.dumps(StacUtils.reduce_stac_list_to_data_links(results)) if self.__filter_results else json.dumps(results)

cumulus_lambda_functions/stage_in_out/search_granules_unity.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44

55
from cumulus_lambda_functions.cumulus_dapa_client.dapa_client import DapaClient
6+
from cumulus_lambda_functions.cumulus_stac.stac_utils import StacUtils
67
from cumulus_lambda_functions.lib.utils.file_utils import FileUtils
78
from cumulus_lambda_functions.stage_in_out.search_granules_abstract import SearchGranulesAbstract
89

@@ -18,13 +19,16 @@ class SearchGranulesUnity(SearchGranulesAbstract):
1819
DATE_TO_KEY = 'DATE_TO'
1920
VERIFY_SSL_KEY = 'VERIFY_SSL'
2021

22+
FILTER_ONLY_ASSETS = 'FILTER_ONLY_ASSETS'
23+
2124
def __init__(self) -> None:
2225
super().__init__()
2326
self.__collection_id = ''
2427
self.__date_from = ''
2528
self.__date_to = ''
2629
self.__limit = 1000
2730
self.__verify_ssl = True
31+
self.__filter_results = True
2832

2933
def __set_props_from_env(self):
3034
missing_keys = [k for k in [self.COLLECTION_ID_KEY] if k not in os.environ]
@@ -40,11 +44,11 @@ def __set_props_from_env(self):
4044
self.__date_from = os.environ.get(self.DATE_FROM_KEY, '')
4145
self.__date_to = os.environ.get(self.DATE_TO_KEY, '')
4246
self.__verify_ssl = os.environ.get(self.VERIFY_SSL_KEY, 'TRUE').strip().upper() == 'TRUE'
47+
self.__filter_results = os.environ.get(self.FILTER_ONLY_ASSETS, 'TRUE').strip().upper() == 'TRUE'
4348
return self
4449

4550
def search(self, **kwargs) -> str:
4651
self.__set_props_from_env()
4752
dapa_client = DapaClient().with_verify_ssl(self.__verify_ssl)
48-
granules_result = dapa_client.get_granules(self.__collection_id, self.__limit, 0, self.__date_from,
49-
self.__date_to)
50-
return json.dumps(granules_result)
53+
granules_result = dapa_client.get_all_granules(self.__collection_id, self.__limit, self.__date_from, self.__date_to)
54+
return json.dumps(StacUtils.reduce_stac_list_to_data_links(granules_result)) if self.__filter_results else json.dumps(granules_result)

docker/stage-in-stage-out/dc-001-search.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@ services:
1919

2020
DAPA_API: 'https://k3a3qmarxh.execute-api.us-west-2.amazonaws.com/dev'
2121
COLLECTION_ID: 'ATMS_SCIENCE_Group___001'
22-
LIMITS: '100'
22+
LIMITS: '100. -1 to get all granules, but not recommended.'
2323
DATE_FROM: '2016-01-17T00:00:00Z'
2424
DATE_TO: '2016-01-17T12:00:00Z'
2525
VERIFY_SSL: 'FALSE'
26+
FILTER_ONLY_ASSETS: 'TRUE'
2627

27-
GRANULES_SEARCH_DOMAIN: 'UNITY'
28+
GRANULES_SEARCH_DOMAIN: 'UNITY or CMR'
29+
CMR_BASE_URL: 'https://cmr.earthdata.nasa.gov'
2830
LOG_LEVEL: '20'
2931
networks:
3032
- internal

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setup(
1919
name="cumulus_lambda_functions",
20-
version="3.1.0",
20+
version="3.3.0",
2121
packages=find_packages(),
2222
install_requires=install_requires,
2323
tests_require=['mock', 'nose', 'sphinx', 'sphinx_rtd_theme', 'coverage', 'pystac', 'python-dotenv', 'jsonschema'],

tests/integration_tests/test_docker_entry.py

Lines changed: 152 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,35 @@
1111

1212

1313
class TestDockerEntry(TestCase):
14-
def test_01_search(self):
14+
def test_01_search_part_01(self):
15+
"""
16+
:return:
17+
"""
18+
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
19+
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
20+
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
21+
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
22+
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
23+
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
24+
os.environ['COLLECTION_ID'] = 'L0_SNPP_ATMS_SCIENCE___1'
25+
os.environ['LIMITS'] = '4000'
26+
os.environ['DATE_FROM'] = '1990-01-14T08:00:00Z'
27+
os.environ['DATE_TO'] = '2022-01-14T11:59:59Z'
28+
os.environ['VERIFY_SSL'] = 'FALSE'
29+
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
30+
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
31+
if len(argv) > 1:
32+
argv.pop(-1)
33+
argv.append('SEARCH')
34+
search_result = choose_process()
35+
search_result = json.loads(search_result)
36+
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
37+
self.assertEqual(len(search_result), 4000, f'wrong length')
38+
search_result = set([k['id'] for k in search_result])
39+
self.assertEqual(len(search_result),4000, f'wrong length. not unique')
40+
return
41+
42+
def test_01_search_part_02(self):
1543
"""
1644
:return:
1745
"""
@@ -26,16 +54,20 @@ def test_01_search(self):
2654
os.environ['DATE_FROM'] = '2016-01-14T08:00:00Z'
2755
os.environ['DATE_TO'] = '2016-01-14T11:59:59Z'
2856
os.environ['VERIFY_SSL'] = 'FALSE'
57+
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
2958
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
3059
if len(argv) > 1:
3160
argv.pop(-1)
3261
argv.append('SEARCH')
3362
search_result = choose_process()
3463
search_result = json.loads(search_result)
3564
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
65+
self.assertEqual(len(search_result), 20, f'wrong length')
66+
search_result = set([k['id'] for k in search_result])
67+
self.assertEqual(len(search_result),20, f'wrong length. not unique')
3668
return
3769

38-
def test_01_1_search_cmr(self):
70+
def test_01_search_part_03(self):
3971
"""
4072
:return:
4173
"""
@@ -45,19 +77,133 @@ def test_01_1_search_cmr(self):
4577
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
4678
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
4779
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
48-
os.environ['COLLECTION_ID'] = 'C1649553296-PODAAC'
49-
os.environ['LIMITS'] = '100'
50-
os.environ['DATE_FROM'] = '2016-01-14T08:00:00Z'
51-
os.environ['DATE_TO'] = '2016-01-14T11:59:59Z'
80+
os.environ['COLLECTION_ID'] = 'L0_SNPP_ATMS_SCIENCE___1'
81+
os.environ['LIMITS'] = '-1'
82+
os.environ['DATE_FROM'] = '1990-01-14T08:00:00Z'
83+
os.environ['DATE_TO'] = '2022-01-14T11:59:59Z'
84+
os.environ['VERIFY_SSL'] = 'FALSE'
85+
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
86+
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
87+
if len(argv) > 1:
88+
argv.pop(-1)
89+
argv.append('SEARCH')
90+
search_result = choose_process()
91+
search_result = json.loads(search_result)
92+
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
93+
self.assertEqual(len(search_result), 4381, f'wrong length')
94+
search_result = set([k['id'] for k in search_result])
95+
self.assertEqual(len(search_result), 4381, f'wrong length. not unique')
96+
return
97+
98+
def test_01_search_part_04(self):
99+
"""
100+
:return:
101+
"""
102+
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
103+
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
104+
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
105+
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
106+
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
107+
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
108+
os.environ['COLLECTION_ID'] = 'L0_SNPP_ATMS_SCIENCE___1'
109+
os.environ['LIMITS'] = '347'
110+
os.environ['DATE_FROM'] = '1990-01-14T08:00:00Z'
111+
os.environ['DATE_TO'] = '2022-01-14T11:59:59Z'
112+
os.environ['VERIFY_SSL'] = 'FALSE'
113+
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
114+
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
115+
if len(argv) > 1:
116+
argv.pop(-1)
117+
argv.append('SEARCH')
118+
search_result = choose_process()
119+
search_result = json.loads(search_result)
120+
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
121+
self.assertEqual(len(search_result), 347, f'wrong length')
122+
search_result = set([k['id'] for k in search_result])
123+
self.assertEqual(len(search_result), 347, f'wrong length. not unique')
124+
return
125+
126+
def test_01_search_part_05(self):
127+
"""
128+
:return:
129+
"""
130+
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
131+
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
132+
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
133+
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
134+
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
135+
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
136+
os.environ['COLLECTION_ID'] = 'L0_SNPP_ATMS_SCIENCE___1'
137+
os.environ['LIMITS'] = '37'
138+
os.environ['DATE_FROM'] = '1990-01-14T08:00:00Z'
139+
os.environ['DATE_TO'] = '2022-01-14T11:59:59Z'
140+
os.environ['VERIFY_SSL'] = 'FALSE'
141+
os.environ['FILTER_ONLY_ASSETS'] = 'TRUE'
142+
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
143+
if len(argv) > 1:
144+
argv.pop(-1)
145+
argv.append('SEARCH')
146+
search_result = choose_process()
147+
search_result = json.loads(search_result)
148+
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
149+
self.assertEqual(len(search_result), 37, f'wrong length')
150+
self.assertTrue('id' not in search_result[0], 'not filtered')
151+
return
152+
153+
def test_01_1_search_cmr_part_01(self):
154+
"""
155+
:return:
156+
"""
157+
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
158+
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
159+
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
160+
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
161+
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
162+
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
163+
os.environ['COLLECTION_ID'] = 'C1666605425-PODAAC'
164+
os.environ['LIMITS'] = '2120'
165+
os.environ['DATE_FROM'] = '2002-06-01T12:06:00.000Z'
166+
os.environ['DATE_TO'] = '2011-10-04T06:51:45.000Z'
167+
os.environ['VERIFY_SSL'] = 'FALSE'
168+
os.environ['GRANULES_SEARCH_DOMAIN'] = 'CMR'
169+
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
170+
os.environ['CMR_BASE_URL'] = 'https://cmr.earthdata.nasa.gov'
171+
if len(argv) > 1:
172+
argv.pop(-1)
173+
argv.append('SEARCH')
174+
search_result = choose_process()
175+
search_result = json.loads(search_result)
176+
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
177+
self.assertEqual(len(search_result), 2120, f'wrong length')
178+
return
179+
180+
def test_01_1_search_cmr_part_02(self):
181+
"""
182+
:return:
183+
"""
184+
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
185+
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
186+
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
187+
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
188+
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
189+
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
190+
os.environ['COLLECTION_ID'] = 'C1666605425-PODAAC'
191+
os.environ['LIMITS'] = '23'
192+
os.environ['DATE_FROM'] = '2002-06-01T12:06:00.000Z'
193+
os.environ['DATE_TO'] = '2011-10-04T06:51:45.000Z'
52194
os.environ['VERIFY_SSL'] = 'FALSE'
53195
os.environ['GRANULES_SEARCH_DOMAIN'] = 'CMR'
196+
os.environ['FILTER_ONLY_ASSETS'] = 'TRUE'
54197
os.environ['CMR_BASE_URL'] = 'https://cmr.earthdata.nasa.gov'
55198
if len(argv) > 1:
56199
argv.pop(-1)
57200
argv.append('SEARCH')
58201
search_result = choose_process()
59202
search_result = json.loads(search_result)
60203
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
204+
self.assertEqual(len(search_result), 23, f'wrong length')
205+
self.assertTrue('id' not in search_result[0], 'not filtered')
206+
self.assertTrue('assets' in search_result[0], 'not filtered')
61207
return
62208

63209
def test_02_download(self):

0 commit comments

Comments
 (0)