
Commit 18ebff6

Refactor validate dataset
1 parent 528dfe2 commit 18ebff6

File tree: 2 files changed (34 additions & 7 deletions)

components/clp-py-utils/clp_py_utils/clp_metadata_db_utils.py

Lines changed: 30 additions & 5 deletions
@@ -164,21 +164,46 @@ def fetch_existing_datasets(
 
 
 def validate_dataset(
+    database_config: Database,
+    dataset: str | None,
+) -> bool:
+    """
+    Checks if the provided dataset currently exists in the metadata database.
+
+    Primarily used by scripts that want to validate the dataset during argument parsing stage.
+    :param database_config:
+    :param dataset:
+    """
+    if not dataset:
+        return False
+    sql_adapter: SQL_Adapter = SQL_Adapter(database_config)
+    clp_db_connection_params: dict[str, any] = database_config.get_clp_connection_params_and_type(
+        True
+    )
+    table_prefix: str = clp_db_connection_params["table_prefix"]
+    with closing(sql_adapter.create_connection(True)) as db_conn, closing(
+        db_conn.cursor(dictionary=True)
+    ) as db_cursor:
+        return validate_and_cache_dataset(db_cursor, table_prefix, dataset)
+
+
+def validate_and_cache_dataset(
     db_cursor,
     table_prefix: str,
-    dataset: str,
+    dataset: str | None,
     existing_datasets: Set[str] | None = None,
 ) -> bool:
     """
-    Checks if a dataset currently exists in the metadata or in the local dataset cache.
+    Checks if the provided dataset currently exists in the metadata database and cache it locally.
 
+    If the dataset already exists in the local cache, database query is skipped.
     :param db_cursor:
     :param table_prefix:
     :param dataset: The dataset to validate.
-    :param existing_datasets: Returns a refreshed cache of dataset names fetched from the metadata
-                              if the current cache doesn not contain the provided dataset and a
-                              lookup is required.
+    :param existing_datasets: Returns a refreshed cache of dataset names if a lookup is required.
     """
+    if not dataset:
+        return False
     if existing_datasets is not None and dataset in existing_datasets:
         return True
     existing_datasets = fetch_existing_datasets(db_cursor, table_prefix)
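
The new validate_dataset wrapper is aimed at scripts that want to reject an empty or unknown dataset while parsing arguments. As a rough, hypothetical sketch only (the config-loading calls, argument names, and CLPConfig import path below are assumptions, not part of this commit), a CLI script might use it like this:

    import argparse
    import sys

    from clp_py_utils.clp_config import CLPConfig  # assumed location of the config models
    from clp_py_utils.clp_metadata_db_utils import validate_dataset
    from clp_py_utils.core import read_yaml_config_file


    def main() -> int:
        parser = argparse.ArgumentParser(description="Example script that operates on a dataset.")
        parser.add_argument("--config", required=True, help="Path to the CLP config file.")
        parser.add_argument("--dataset", help="Dataset to operate on.")
        args = parser.parse_args()

        # Hypothetical config loading; adjust to however the calling script builds its
        # Database config object.
        clp_config = CLPConfig.parse_obj(read_yaml_config_file(args.config))

        # Reject empty or unknown datasets before doing any real work.
        if not validate_dataset(clp_config.database, args.dataset):
            parser.error(f"Dataset `{args.dataset}` doesn't exist.")
        return 0


    if __name__ == "__main__":
        sys.exit(main())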

components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py

Lines changed: 4 additions & 2 deletions
@@ -41,7 +41,7 @@
     TAGS_TABLE_SUFFIX,
 )
 from clp_py_utils.clp_logging import get_logger, get_logging_formatter, set_logging_level
-from clp_py_utils.clp_metadata_db_utils import validate_dataset
+from clp_py_utils.clp_metadata_db_utils import validate_and_cache_dataset
 from clp_py_utils.core import read_yaml_config_file
 from clp_py_utils.decorators import exception_default_value
 from clp_py_utils.sql_adapter import SQL_Adapter
@@ -644,7 +644,9 @@ def handle_pending_query_jobs(
             table_prefix = clp_metadata_db_conn_params["table_prefix"]
             if StorageEngine.CLP_S == clp_storage_engine:
                 dataset = QueryJobConfig.parse_obj(job_config).dataset
-                if not validate_dataset(db_cursor, table_prefix, dataset, existing_datasets):
+                if not validate_and_cache_dataset(
+                    db_cursor, table_prefix, dataset, existing_datasets
+                ):
                     logger.error(f"Dataset `{dataset}` doesn't exist.")
                     if not set_job_or_task_status(
                         db_conn,
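
On the scheduler side, validate_and_cache_dataset is meant to be called repeatedly with a shared existing_datasets set so that repeated jobs against the same dataset can skip the metadata-database query. A minimal, illustrative sketch of that pattern (the connection setup mirrors the new validate_dataset body; check_job_datasets and job_datasets are stand-ins, not code from this commit):

    from contextlib import closing
    from typing import Set

    from clp_py_utils.clp_metadata_db_utils import validate_and_cache_dataset
    from clp_py_utils.sql_adapter import SQL_Adapter


    def check_job_datasets(database_config, job_datasets) -> None:
        # database_config is a clp_py_utils Database config object, as in validate_dataset().
        sql_adapter = SQL_Adapter(database_config)
        clp_db_connection_params = database_config.get_clp_connection_params_and_type(True)
        table_prefix = clp_db_connection_params["table_prefix"]
        existing_datasets: Set[str] = set()
        with closing(sql_adapter.create_connection(True)) as db_conn, closing(
            db_conn.cursor(dictionary=True)
        ) as db_cursor:
            for dataset in job_datasets:
                # A cache hit returns True without querying the database; per the docstring,
                # a miss triggers a lookup that refreshes the cached dataset names.
                if not validate_and_cache_dataset(
                    db_cursor, table_prefix, dataset, existing_datasets
                ):
                    print(f"Dataset `{dataset}` doesn't exist.")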
