Skip to content

feat: granules query pagination #131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [3.2.0] - 2023-01-23
### Added
- [#131](https://github.com/unity-sds/unity-data-services/pull/131) granules query pagination

## [3.1.0] - 2023-01-23
### Added
- [#126](https://github.com/unity-sds/unity-data-services/pull/126) reduce pystac length by keeping only data asset
Expand Down
16 changes: 16 additions & 0 deletions cumulus_lambda_functions/cumulus_dapa_client/dapa_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,22 @@ def get_collection(self, collection_id: str):
raise RuntimeError(f'unable to find collection in DAPA')
return collection_details[0]

def get_all_granules(self, collection_id='*', limit=-1, date_from='', date_to=''):
results = []
page_size = 100 if limit < 0 or limit > 100 else limit
offset = 0
while True:
if 0 < limit <= len(results):
break
temp_results = self.get_granules(collection_id, page_size, offset, date_from, date_to)
offset += len(temp_results)
results.extend(temp_results)
if len(temp_results) < page_size:
break
if limit < 0 or limit >= len(results):
return results
return results[0: limit]

def get_granules(self, collection_id='*', limit=1000, offset=0, date_from='', date_to=''):
"""
TODO: pagination. getting only 1st 1k item
Expand Down
54 changes: 40 additions & 14 deletions cumulus_lambda_functions/stage_in_out/search_granules_cmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import requests

from cumulus_lambda_functions.cumulus_stac.stac_utils import StacUtils
from cumulus_lambda_functions.stage_in_out.search_granules_abstract import SearchGranulesAbstract

LOGGER = logging.getLogger(__name__)
Expand All @@ -19,14 +20,18 @@ class SearchGranulesCmr(SearchGranulesAbstract):
DATE_TO_KEY = 'DATE_TO'
VERIFY_SSL_KEY = 'VERIFY_SSL'

FILTER_ONLY_ASSETS = 'FILTER_ONLY_ASSETS'

def __init__(self) -> None:
super().__init__()
self.__collection_id = ''
self.__date_from = ''
self.__date_to = ''
self.__limit = 1000
self.__page_size = 2000 # page_size - number of results per page - default is 10, max is 2000
self.__verify_ssl = True
self.__cmr_base_url = ''
self.__filter_results = True

def __set_props_from_env(self):
missing_keys = [k for k in [self.COLLECTION_ID_KEY, self.CMR_BASE_URL_KEY] if k not in os.environ]
Expand All @@ -45,14 +50,24 @@ def __set_props_from_env(self):
self.__date_from = os.environ.get(self.DATE_FROM_KEY, '')
self.__date_to = os.environ.get(self.DATE_TO_KEY, '')
self.__verify_ssl = os.environ.get(self.VERIFY_SSL_KEY, 'TRUE').strip().upper() == 'TRUE'
self.__filter_results = os.environ.get(self.FILTER_ONLY_ASSETS, 'TRUE').strip().upper() == 'TRUE'
return self

def __get_correct_result_count(self, results):
if self.__limit < 0 or self.__limit >= len(results):
return results
return results[0: self.__limit]

def search(self, **kwargs) -> str:
"""
curl 'https://cmr.earthdata.nasa.gov/search/granules.stac' \
-H 'accept: application/json; profile=stac-catalogue' \
-H 'content-type: application/x-www-form-urlencoded' \
--data-raw 'collection_concept_id=C1649553296-PODAAC&page_num=1&page_size=20&temporal[]=2011-08-01T00:00:00,2011-09-01T00:00:00'

https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#stac
https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#query-parameters

:param kwargs:
:return:
"""
Expand All @@ -61,17 +76,28 @@ def search(self, **kwargs) -> str:
'accept': 'application/json; profile=stac-catalogue',
'Content-Type': 'application/x-www-form-urlencoded',
}
request_body = {
'collection_concept_id': self.__collection_id,
'page_num': '1',
'page_size': str(self.__limit),
'temporal[]': f'{self.__date_from},{self.__date_to}'
}
cmr_granules_url = f'{self.__cmr_base_url}search/granules.stac'
response = requests.post(url=cmr_granules_url, headers=header, verify=self.__verify_ssl,
data=request_body)
if response.status_code > 400:
raise RuntimeError(
f'Cognito ends in error. status_code: {response.status_code}. url: {cmr_granules_url}. details: {response.text}')
response = json.loads(response.content.decode('utf-8'))
return json.dumps(response['features'])
results = []
page_size = self.__page_size if self.__limit < 0 or self.__limit > self.__page_size else self.__limit
page_num = 1
while True:
if 0 < self.__limit <= len(results):
break
request_body = {
'collection_concept_id': self.__collection_id,
'page_num': str(page_num),
'page_size': str(page_size),
# 'temporal[]': f'{self.__date_from},{self.__date_to}'
}
cmr_granules_url = f'{self.__cmr_base_url}search/granules.stac'
response = requests.post(url=cmr_granules_url, headers=header, verify=self.__verify_ssl,
data=request_body)
if response.status_code > 400:
raise RuntimeError(
f'Cognito ends in error. status_code: {response.status_code}. url: {cmr_granules_url}. details: {response.text}')
temp_results = json.loads(response.content.decode('utf-8'))['features']
page_num += 1
results.extend(temp_results)
if len(temp_results) < page_size:
break
results = self.__get_correct_result_count(results)
return json.dumps(StacUtils.reduce_stac_list_to_data_links(results)) if self.__filter_results else json.dumps(results)
10 changes: 7 additions & 3 deletions cumulus_lambda_functions/stage_in_out/search_granules_unity.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os

from cumulus_lambda_functions.cumulus_dapa_client.dapa_client import DapaClient
from cumulus_lambda_functions.cumulus_stac.stac_utils import StacUtils
from cumulus_lambda_functions.lib.utils.file_utils import FileUtils
from cumulus_lambda_functions.stage_in_out.search_granules_abstract import SearchGranulesAbstract

Expand All @@ -18,13 +19,16 @@ class SearchGranulesUnity(SearchGranulesAbstract):
DATE_TO_KEY = 'DATE_TO'
VERIFY_SSL_KEY = 'VERIFY_SSL'

FILTER_ONLY_ASSETS = 'FILTER_ONLY_ASSETS'

def __init__(self) -> None:
super().__init__()
self.__collection_id = ''
self.__date_from = ''
self.__date_to = ''
self.__limit = 1000
self.__verify_ssl = True
self.__filter_results = True

def __set_props_from_env(self):
missing_keys = [k for k in [self.COLLECTION_ID_KEY] if k not in os.environ]
Expand All @@ -40,11 +44,11 @@ def __set_props_from_env(self):
self.__date_from = os.environ.get(self.DATE_FROM_KEY, '')
self.__date_to = os.environ.get(self.DATE_TO_KEY, '')
self.__verify_ssl = os.environ.get(self.VERIFY_SSL_KEY, 'TRUE').strip().upper() == 'TRUE'
self.__filter_results = os.environ.get(self.FILTER_ONLY_ASSETS, 'TRUE').strip().upper() == 'TRUE'
return self

def search(self, **kwargs) -> str:
self.__set_props_from_env()
dapa_client = DapaClient().with_verify_ssl(self.__verify_ssl)
granules_result = dapa_client.get_granules(self.__collection_id, self.__limit, 0, self.__date_from,
self.__date_to)
return json.dumps(granules_result)
granules_result = dapa_client.get_all_granules(self.__collection_id, self.__limit, self.__date_from, self.__date_to)
return json.dumps(StacUtils.reduce_stac_list_to_data_links(granules_result)) if self.__filter_results else json.dumps(granules_result)
6 changes: 4 additions & 2 deletions docker/stage-in-stage-out/dc-001-search.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ services:

DAPA_API: 'https://k3a3qmarxh.execute-api.us-west-2.amazonaws.com/dev'
COLLECTION_ID: 'ATMS_SCIENCE_Group___001'
LIMITS: '100'
LIMITS: '100. -1 to get all granules, but not recommended.'
DATE_FROM: '2016-01-17T00:00:00Z'
DATE_TO: '2016-01-17T12:00:00Z'
VERIFY_SSL: 'FALSE'
FILTER_ONLY_ASSETS: 'TRUE'

GRANULES_SEARCH_DOMAIN: 'UNITY'
GRANULES_SEARCH_DOMAIN: 'UNITY or CMR'
CMR_BASE_URL: 'https://cmr.earthdata.nasa.gov'
LOG_LEVEL: '20'
networks:
- internal
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

setup(
name="cumulus_lambda_functions",
version="3.1.0",
version="3.3.0",
packages=find_packages(),
install_requires=install_requires,
tests_require=['mock', 'nose', 'sphinx', 'sphinx_rtd_theme', 'coverage', 'pystac', 'python-dotenv', 'jsonschema'],
Expand Down
158 changes: 152 additions & 6 deletions tests/integration_tests/test_docker_entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,35 @@


class TestDockerEntry(TestCase):
def test_01_search(self):
def test_01_search_part_01(self):
"""
:return:
"""
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
os.environ['COLLECTION_ID'] = 'L0_SNPP_ATMS_SCIENCE___1'
os.environ['LIMITS'] = '4000'
os.environ['DATE_FROM'] = '1990-01-14T08:00:00Z'
os.environ['DATE_TO'] = '2022-01-14T11:59:59Z'
os.environ['VERIFY_SSL'] = 'FALSE'
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
if len(argv) > 1:
argv.pop(-1)
argv.append('SEARCH')
search_result = choose_process()
search_result = json.loads(search_result)
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
self.assertEqual(len(search_result), 4000, f'wrong length')
search_result = set([k['id'] for k in search_result])
self.assertEqual(len(search_result),4000, f'wrong length. not unique')
return

def test_01_search_part_02(self):
"""
:return:
"""
Expand All @@ -26,16 +54,20 @@ def test_01_search(self):
os.environ['DATE_FROM'] = '2016-01-14T08:00:00Z'
os.environ['DATE_TO'] = '2016-01-14T11:59:59Z'
os.environ['VERIFY_SSL'] = 'FALSE'
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
if len(argv) > 1:
argv.pop(-1)
argv.append('SEARCH')
search_result = choose_process()
search_result = json.loads(search_result)
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
self.assertEqual(len(search_result), 20, f'wrong length')
search_result = set([k['id'] for k in search_result])
self.assertEqual(len(search_result),20, f'wrong length. not unique')
return

def test_01_1_search_cmr(self):
def test_01_search_part_03(self):
"""
:return:
"""
Expand All @@ -45,19 +77,133 @@ def test_01_1_search_cmr(self):
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
os.environ['COLLECTION_ID'] = 'C1649553296-PODAAC'
os.environ['LIMITS'] = '100'
os.environ['DATE_FROM'] = '2016-01-14T08:00:00Z'
os.environ['DATE_TO'] = '2016-01-14T11:59:59Z'
os.environ['COLLECTION_ID'] = 'L0_SNPP_ATMS_SCIENCE___1'
os.environ['LIMITS'] = '-1'
os.environ['DATE_FROM'] = '1990-01-14T08:00:00Z'
os.environ['DATE_TO'] = '2022-01-14T11:59:59Z'
os.environ['VERIFY_SSL'] = 'FALSE'
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
if len(argv) > 1:
argv.pop(-1)
argv.append('SEARCH')
search_result = choose_process()
search_result = json.loads(search_result)
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
self.assertEqual(len(search_result), 4381, f'wrong length')
search_result = set([k['id'] for k in search_result])
self.assertEqual(len(search_result), 4381, f'wrong length. not unique')
return

def test_01_search_part_04(self):
"""
:return:
"""
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
os.environ['COLLECTION_ID'] = 'L0_SNPP_ATMS_SCIENCE___1'
os.environ['LIMITS'] = '347'
os.environ['DATE_FROM'] = '1990-01-14T08:00:00Z'
os.environ['DATE_TO'] = '2022-01-14T11:59:59Z'
os.environ['VERIFY_SSL'] = 'FALSE'
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
if len(argv) > 1:
argv.pop(-1)
argv.append('SEARCH')
search_result = choose_process()
search_result = json.loads(search_result)
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
self.assertEqual(len(search_result), 347, f'wrong length')
search_result = set([k['id'] for k in search_result])
self.assertEqual(len(search_result), 347, f'wrong length. not unique')
return

def test_01_search_part_05(self):
"""
:return:
"""
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
os.environ['COLLECTION_ID'] = 'L0_SNPP_ATMS_SCIENCE___1'
os.environ['LIMITS'] = '37'
os.environ['DATE_FROM'] = '1990-01-14T08:00:00Z'
os.environ['DATE_TO'] = '2022-01-14T11:59:59Z'
os.environ['VERIFY_SSL'] = 'FALSE'
os.environ['FILTER_ONLY_ASSETS'] = 'TRUE'
os.environ['GRANULES_SEARCH_DOMAIN'] = 'UNITY'
if len(argv) > 1:
argv.pop(-1)
argv.append('SEARCH')
search_result = choose_process()
search_result = json.loads(search_result)
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
self.assertEqual(len(search_result), 37, f'wrong length')
self.assertTrue('id' not in search_result[0], 'not filtered')
return

def test_01_1_search_cmr_part_01(self):
"""
:return:
"""
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
os.environ['COLLECTION_ID'] = 'C1666605425-PODAAC'
os.environ['LIMITS'] = '2120'
os.environ['DATE_FROM'] = '2002-06-01T12:06:00.000Z'
os.environ['DATE_TO'] = '2011-10-04T06:51:45.000Z'
os.environ['VERIFY_SSL'] = 'FALSE'
os.environ['GRANULES_SEARCH_DOMAIN'] = 'CMR'
os.environ['FILTER_ONLY_ASSETS'] = 'FALSE'
os.environ['CMR_BASE_URL'] = 'https://cmr.earthdata.nasa.gov'
if len(argv) > 1:
argv.pop(-1)
argv.append('SEARCH')
search_result = choose_process()
search_result = json.loads(search_result)
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
self.assertEqual(len(search_result), 2120, f'wrong length')
return

def test_01_1_search_cmr_part_02(self):
"""
:return:
"""
os.environ[Constants.USERNAME] = '/unity/uds/user/wphyo/username'
os.environ[Constants.PASSWORD] = '/unity/uds/user/wphyo/dwssap'
os.environ['PASSWORD_TYPE'] = 'PARAM_STORE'
os.environ['CLIENT_ID'] = '6ir9qveln397i0inh9pmsabq1'
os.environ['COGNITO_URL'] = 'https://cognito-idp.us-west-2.amazonaws.com'
os.environ['DAPA_API'] = 'https://58nbcawrvb.execute-api.us-west-2.amazonaws.com/test'
os.environ['COLLECTION_ID'] = 'C1666605425-PODAAC'
os.environ['LIMITS'] = '23'
os.environ['DATE_FROM'] = '2002-06-01T12:06:00.000Z'
os.environ['DATE_TO'] = '2011-10-04T06:51:45.000Z'
os.environ['VERIFY_SSL'] = 'FALSE'
os.environ['GRANULES_SEARCH_DOMAIN'] = 'CMR'
os.environ['FILTER_ONLY_ASSETS'] = 'TRUE'
os.environ['CMR_BASE_URL'] = 'https://cmr.earthdata.nasa.gov'
if len(argv) > 1:
argv.pop(-1)
argv.append('SEARCH')
search_result = choose_process()
search_result = json.loads(search_result)
self.assertTrue(isinstance(search_result, list), f'search_result is not list: {search_result}')
self.assertEqual(len(search_result), 23, f'wrong length')
self.assertTrue('id' not in search_result[0], 'not filtered')
self.assertTrue('assets' in search_result[0], 'not filtered')
return

def test_02_download(self):
Expand Down