
Commit bcb7f54

Bill-hbrhbr, kirkrodrigues, Marco, and davemarco authored
feat(clp-json): Use dataset-specific tables and archive directories for compression, decompression, and search. (#868)
Co-authored-by: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com>
Co-authored-by: Marco <david.marcovitch@yscope.com>
Co-authored-by: davemarco <83603688+davemarco@users.noreply.github.com>
1 parent f3ffa53 commit bcb7f54

File tree

11 files changed (+207, -29 lines)

components/clp-package-utils/clp_package_utils/scripts/native/archive_manager.py

Lines changed: 25 additions & 2 deletions

@@ -10,8 +10,10 @@
 from clp_py_utils.clp_config import (
     ARCHIVE_TAGS_TABLE_SUFFIX,
     ARCHIVES_TABLE_SUFFIX,
+    CLP_DEFAULT_DATASET_NAME,
     Database,
     FILES_TABLE_SUFFIX,
+    StorageEngine,
 )
 from clp_py_utils.sql_adapter import SQL_Adapter

@@ -182,6 +184,7 @@ def main(argv: typing.List[str]) -> int:
         logger.exception("Failed to load config.")
         return -1

+    storage_engine: StorageEngine = clp_config.package.storage_engine
     database_config: Database = clp_config.database
     archives_dir: Path = clp_config.archive_output.get_directory()
     if not archives_dir.exists():

@@ -192,6 +195,8 @@ def main(argv: typing.List[str]) -> int:
         return _find_archives(
             archives_dir,
             database_config,
+            storage_engine,
+            CLP_DEFAULT_DATASET_NAME,
             parsed_args.begin_ts,
             parsed_args.end_ts,
         )

@@ -202,6 +207,8 @@ def main(argv: typing.List[str]) -> int:
         return _delete_archives(
             archives_dir,
             database_config,
+            storage_engine,
+            CLP_DEFAULT_DATASET_NAME,
             delete_handler,
             parsed_args.dry_run,
         )

@@ -212,6 +219,8 @@ def main(argv: typing.List[str]) -> int:
         return _delete_archives(
             archives_dir,
             database_config,
+            storage_engine,
+            CLP_DEFAULT_DATASET_NAME,
             delete_handler,
             parsed_args.dry_run,
         )

@@ -226,6 +235,8 @@ def main(argv: typing.List[str]) -> int:
 def _find_archives(
     archives_dir: Path,
     database_config: Database,
+    storage_engine: StorageEngine,
+    dataset: str,
     begin_ts: int,
     end_ts: int = typing.Optional[int],
 ) -> int:

@@ -234,6 +245,8 @@ def _find_archives(
     `begin_ts <= archive.begin_timestamp` and `archive.end_timestamp <= end_ts`.
     :param archives_dir:
     :param database_config:
+    :param storage_engine:
+    :param dataset:
     :param begin_ts:
     :param end_ts:
     :return: 0 on success, 1 on failure.

@@ -246,6 +259,9 @@ def _find_archives(
         database_config.get_clp_connection_params_and_type(True)
     )
     table_prefix: str = clp_db_connection_params["table_prefix"]
+    if StorageEngine.CLP_S == storage_engine:
+        table_prefix = f"{table_prefix}{dataset}_"
+
     with closing(sql_adapter.create_connection(True)) as db_conn, closing(
         db_conn.cursor(dictionary=True)
     ) as db_cursor:

@@ -271,7 +287,7 @@ def _find_archives(
         logger.info(f"Found {len(archive_ids)} archives within the specified time range.")
         for archive_id in archive_ids:
             logger.info(archive_id)
-            archive_path: Path = archives_dir / archive_id
+            archive_path: Path = archives_dir / dataset / archive_id
             if not archive_path.is_dir():
                 logger.warning(f"Archive {archive_id} in database not found on disk.")

@@ -286,6 +302,8 @@ def _find_archives(
 def _delete_archives(
     archives_dir: Path,
     database_config: Database,
+    storage_engine: StorageEngine,
+    dataset: str,
     delete_handler: DeleteHandler,
     dry_run: bool = False,
 ) -> int:

@@ -294,6 +312,8 @@ def _delete_archives(

     :param archives_dir:
     :param database_config:
+    :param storage_engine:
+    :param dataset:
     :param delete_handler: Object to handle differences between by-filter and by-ids delete types.
     :param dry_run: If True, no changes will be made to the database or disk.
     :return: 0 on success, -1 otherwise.

@@ -307,6 +327,9 @@ def _delete_archives(
         database_config.get_clp_connection_params_and_type(True)
     )
     table_prefix = clp_db_connection_params["table_prefix"]
+    if StorageEngine.CLP_S == storage_engine:
+        table_prefix = f"{table_prefix}{dataset}_"
+
     with closing(sql_adapter.create_connection(True)) as db_conn, closing(
         db_conn.cursor(dictionary=True)
     ) as db_cursor:

@@ -365,7 +388,7 @@ def _delete_archives(
         logger.info(f"Finished deleting archives from the database.")

         for archive_id in archive_ids:
-            archive_path: Path = archives_dir / archive_id
+            archive_path: Path = archives_dir / dataset / archive_id
             if not archive_path.is_dir():
                 logger.warning(f"Archive {archive_id} is not a directory. Skipping deletion.")
                 continue
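
For readers following the change: with the clp-s (clp-json) storage engine, both _find_archives and _delete_archives now resolve dataset-specific metadata tables and a per-dataset archive directory before touching the database or disk. The sketch below restates that derivation in isolation; it is illustrative only. The standalone helper, its name, and the example prefix, paths, dataset, and archive ID are not part of this commit.

from pathlib import Path
from typing import Tuple

from clp_py_utils.clp_config import StorageEngine


def dataset_scoped_locations(
    table_prefix: str,
    archives_dir: Path,
    storage_engine: StorageEngine,
    dataset: str,
    archive_id: str,
) -> Tuple[str, Path]:
    """Hypothetical helper (not part of the commit) restating the scoping above."""
    if StorageEngine.CLP_S == storage_engine:
        # clp-s metadata tables are namespaced per dataset, e.g. "<prefix><dataset>_archives".
        table_prefix = f"{table_prefix}{dataset}_"
    # Archives are looked up under a per-dataset subdirectory of the archives directory.
    archive_path = archives_dir / dataset / archive_id
    return table_prefix, archive_path


# Example with illustrative values:
#   dataset_scoped_locations("clp_", Path("/var/archives"), StorageEngine.CLP_S, "default", "abc-123")
#   -> ("clp_default_", Path("/var/archives/default/abc-123"))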

components/clp-package-utils/clp_package_utils/scripts/native/decompress.py

Lines changed: 4 additions & 1 deletion

@@ -10,6 +10,7 @@

 import yaml
 from clp_py_utils.clp_config import (
+    CLP_DEFAULT_DATASET_NAME,
     CLPConfig,
     Database,
     FILES_TABLE_SUFFIX,

@@ -139,7 +140,9 @@ def handle_extract_stream_cmd(
     elif EXTRACT_JSON_CMD == command:
         job_type = QueryJobType.EXTRACT_JSON
         job_config = ExtractJsonJobConfig(
-            archive_id=parsed_args.archive_id, target_chunk_size=parsed_args.target_chunk_size
+            dataset=CLP_DEFAULT_DATASET_NAME,
+            archive_id=parsed_args.archive_id,
+            target_chunk_size=parsed_args.target_chunk_size,
         )
     else:
         logger.error(f"Unsupported stream extraction command: {command}")

components/clp-package-utils/clp_package_utils/scripts/start_clp.py

Lines changed: 4 additions & 0 deletions

@@ -16,6 +16,7 @@
     ALL_TARGET_NAME,
     ARCHIVES_TABLE_SUFFIX,
     AwsAuthType,
+    CLP_DEFAULT_DATASET_NAME,
     CLPConfig,
     COMPRESSION_JOBS_TABLE_NAME,
     COMPRESSION_SCHEDULER_COMPONENT_NAME,

@@ -31,6 +32,7 @@
     REDIS_COMPONENT_NAME,
     REDUCER_COMPONENT_NAME,
     RESULTS_CACHE_COMPONENT_NAME,
+    StorageEngine,
     StorageType,
     WEBUI_COMPONENT_NAME,
 )

@@ -864,6 +866,8 @@ def start_webui(instance_id: str, clp_config: CLPConfig, mounts: CLPDockerMounts
     # Read and update settings.json
     clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
     table_prefix = clp_db_connection_params["table_prefix"]
+    if StorageEngine.CLP_S == clp_config.package.storage_engine:
+        table_prefix = f"{table_prefix}{CLP_DEFAULT_DATASET_NAME}_"
     meteor_settings_updates = {
         "private": {
             "SqlDbHost": clp_config.database.host,

components/clp-py-utils/clp_py_utils/clp_metadata_db_utils.py

Lines changed: 51 additions & 6 deletions

@@ -1,12 +1,15 @@
 from __future__ import annotations

+from pathlib import Path
+from typing import Set
+
 from clp_py_utils.clp_config import (
     ARCHIVE_TAGS_TABLE_SUFFIX,
     ARCHIVES_TABLE_SUFFIX,
-    CLP_DEFAULT_DATASET_NAME,
     COLUMN_METADATA_TABLE_SUFFIX,
     DATASETS_TABLE_SUFFIX,
     FILES_TABLE_SUFFIX,
+    StorageType,
     TAGS_TABLE_SUFFIX,
 )

@@ -95,7 +98,7 @@ def _create_column_metadata_table(db_cursor, table_prefix: str) -> None:

 def create_datasets_table(db_cursor, table_prefix: str) -> None:
     """
-    Creates the dataset information table.
+    Creates the datasets information table.

     :param db_cursor: The database cursor to execute the table creation.
     :param table_prefix: A string to prepend to the table name.

@@ -115,6 +118,51 @@ def create_datasets_table(db_cursor, table_prefix: str) -> None:
     )


+def add_dataset(
+    db_conn,
+    db_cursor,
+    table_prefix: str,
+    dataset_name: str,
+    archive_storage_type: StorageType,
+    dataset_archive_storage_directory: Path,
+) -> None:
+    """
+    Inserts a new dataset into the `datasets` table and creates the corresponding standard set of
+    tables for CLP's metadata.
+
+    :param db_conn:
+    :param db_cursor: The database cursor to execute the table row insertion.
+    :param table_prefix: A string to prepend to the table name.
+    :param dataset_name:
+    :param archive_storage_type:
+    :param dataset_archive_storage_directory:
+    """
+    query = f"""INSERT INTO `{table_prefix}{DATASETS_TABLE_SUFFIX}`
+        (name, archive_storage_type, archive_storage_directory)
+        VALUES (%s, %s, %s)
+        """
+    db_cursor.execute(
+        query, (dataset_name, archive_storage_type, str(dataset_archive_storage_directory))
+    )
+    create_metadata_db_tables(db_cursor, table_prefix, dataset_name)
+    db_conn.commit()
+
+
+def fetch_existing_datasets(
+    db_cursor,
+    table_prefix: str,
+) -> Set[str]:
+    """
+    Gets the names of all existing datasets.
+
+    :param db_cursor:
+    :param table_prefix:
+    """
+    db_cursor.execute(f"SELECT name FROM `{table_prefix}{DATASETS_TABLE_SUFFIX}`")
+    rows = db_cursor.fetchall()
+    return {row["name"] for row in rows}
+
+
 def create_metadata_db_tables(db_cursor, table_prefix: str, dataset: str | None = None) -> None:
     """
     Creates the standard set of tables for CLP's metadata.

@@ -125,6 +173,7 @@ def create_metadata_db_tables(db_cursor, table_prefix: str, dataset: str | None
     """
     if dataset is not None:
         table_prefix = f"{table_prefix}{dataset}_"
+        _create_column_metadata_table(db_cursor, table_prefix)

     archives_table_name = f"{table_prefix}{ARCHIVES_TABLE_SUFFIX}"
     tags_table_name = f"{table_prefix}{TAGS_TABLE_SUFFIX}"

@@ -136,7 +185,3 @@ def create_metadata_db_tables(db_cursor, table_prefix: str, dataset: str | None
         db_cursor, archive_tags_table_name, archives_table_name, tags_table_name
     )
     _create_files_table(db_cursor, table_prefix)
-
-    # TODO: Create this table only for the `CLP_S` storage-engine after the dataset feature is
-    # fully implemented.
-    _create_column_metadata_table(db_cursor, f"{table_prefix}{CLP_DEFAULT_DATASET_NAME}_")
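
The two new helpers are meant to be used together: fetch_existing_datasets reports what is already registered, and add_dataset registers a dataset and creates its per-dataset metadata tables in one step. A minimal usage sketch follows, assuming a dictionary=True cursor (fetch_existing_datasets reads row["name"], so a dict-style cursor is expected); the wrapper function, its name, the filesystem storage type, and the archives root are illustrative assumptions, not values taken from the commit.

from pathlib import Path

from clp_py_utils.clp_config import StorageType
from clp_py_utils.clp_metadata_db_utils import add_dataset, fetch_existing_datasets


def ensure_dataset_registered(
    db_conn, db_cursor, table_prefix: str, dataset_name: str, archives_root: Path
) -> None:
    """Hypothetical wrapper: register `dataset_name` only if it is not already present."""
    if dataset_name not in fetch_existing_datasets(db_cursor, table_prefix):
        # Inserts the dataset row and creates its per-dataset metadata tables
        # (e.g. "<table_prefix><dataset_name>_archives").
        add_dataset(
            db_conn,
            db_cursor,
            table_prefix,
            dataset_name,
            StorageType.FS,  # assumed filesystem archive storage for this example
            archives_root / dataset_name,
        )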

components/clp-py-utils/clp_py_utils/initialize-clp-metadata-db.py

Lines changed: 2 additions & 4 deletions

@@ -53,12 +53,10 @@ def main(argv):
         with closing(sql_adapter.create_connection(True)) as metadata_db, closing(
             metadata_db.cursor(dictionary=True)
         ) as metadata_db_cursor:
-            # TODO: After the dataset feature is fully implemented, for clp-json:
-            # 1. Populate the datasets table with the name and path for the "default" dataset.
-            # 2. Change the metadata tables to be specific to the "default" dataset.
             if StorageEngine.CLP_S == storage_engine:
                 create_datasets_table(metadata_db_cursor, table_prefix)
-            create_metadata_db_tables(metadata_db_cursor, table_prefix)
+            else:
+                create_metadata_db_tables(metadata_db_cursor, table_prefix)
             metadata_db.commit()
     except:
         logger.exception("Failed to create clp metadata tables.")

components/job-orchestration/job_orchestration/executor/compress/compression_task.py

Lines changed: 9 additions & 3 deletions

@@ -12,7 +12,6 @@
 from clp_py_utils.clp_config import (
     ARCHIVE_TAGS_TABLE_SUFFIX,
     ARCHIVES_TABLE_SUFFIX,
-    CLP_DEFAULT_DATASET_NAME,
     COMPRESSION_JOBS_TABLE_NAME,
     COMPRESSION_TASKS_TABLE_NAME,
     Database,

@@ -280,6 +279,8 @@ def run_clp(
         s3_config = worker_config.archive_output.storage.s3_config
         enable_s3_write = True

+    table_prefix = clp_metadata_db_connection_config["table_prefix"]
+    input_dataset: str
     if StorageEngine.CLP == clp_storage_engine:
         compression_cmd, compression_env = _make_clp_command_and_env(
             clp_home=clp_home,

@@ -288,6 +289,12 @@ def run_clp(
             db_config_file_path=db_config_file_path,
         )
     elif StorageEngine.CLP_S == clp_storage_engine:
+        input_dataset = clp_config.input.dataset
+        table_prefix = f"{table_prefix}{input_dataset}_"
+        archive_output_dir = archive_output_dir / input_dataset
+        if StorageType.S3 == storage_type:
+            s3_config.key_prefix = f"{s3_config.key_prefix}{input_dataset}/"
+
         compression_cmd, compression_env = _make_clp_s_command_and_env(
             clp_home=clp_home,
             archive_output_dir=archive_output_dir,

@@ -367,7 +374,6 @@ def run_clp(
     with closing(sql_adapter.create_connection(True)) as db_conn, closing(
         db_conn.cursor(dictionary=True)
     ) as db_cursor:
-        table_prefix = clp_metadata_db_connection_config["table_prefix"]
        if StorageEngine.CLP_S == clp_storage_engine:
             update_archive_metadata(db_cursor, table_prefix, last_archive_stats)
             update_job_metadata_and_tags(

@@ -384,7 +390,7 @@ def run_clp(
                 str(clp_home / "bin" / "indexer"),
                 "--db-config-file",
                 str(db_config_file_path),
-                CLP_DEFAULT_DATASET_NAME,
+                input_dataset,
                 archive_path,
             ]
             try:
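
For the clp-s engine, the compression task now scopes three things to the input dataset before invoking the compressor: the metadata table prefix, the archive output directory, and, for S3 storage, the key prefix; the final hunk then passes input_dataset to the indexer in place of CLP_DEFAULT_DATASET_NAME. The sketch below only restates where those destinations end up; every concrete value in it (paths, prefix, dataset name) is an assumption for illustration, not taken from the change.

from pathlib import Path

# Illustrative values only.
archive_output_dir = Path("/var/data/archives")
s3_key_prefix = "clp-archives/"
table_prefix = "clp_"
input_dataset = "default"

# Dataset-specific metadata tables, e.g. "clp_default_archives".
dataset_table_prefix = f"{table_prefix}{input_dataset}_"

# Filesystem storage: archives are written under a per-dataset subdirectory.
dataset_archive_output_dir = archive_output_dir / input_dataset  # /var/data/archives/default

# S3 storage: objects are written under a per-dataset key prefix.
dataset_key_prefix = f"{s3_key_prefix}{input_dataset}/"  # "clp-archives/default/"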

components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py

Lines changed: 4 additions & 1 deletion

@@ -94,8 +94,10 @@ def _make_clp_s_command_and_env_vars(
         "x",
     ]

+    dataset = extract_json_config.dataset
     if StorageType.S3 == storage_type:
         s3_config = worker_config.archive_output.storage.s3_config
+        s3_config.key_prefix = f"{s3_config.key_prefix}{dataset}/"
         try:
             s3_url = generate_s3_virtual_hosted_style_url(
                 s3_config.region_code, s3_config.bucket, f"{s3_config.key_prefix}{archive_id}"

@@ -114,9 +116,10 @@ def _make_clp_s_command_and_env_vars(
         env_vars = dict(os.environ)
         env_vars.update(get_credential_env_vars(s3_config.aws_authentication))
     else:
+        archives_dir = worker_config.archive_output.get_directory() / dataset
         # fmt: off
         command.extend((
-            str(worker_config.archive_output.get_directory()),
+            str(archives_dir),
             str(stream_output_dir),
             "--archive-id",
             archive_id,

components/job-orchestration/job_orchestration/executor/query/fs_search_task.py

Lines changed: 3 additions & 1 deletion

@@ -54,14 +54,15 @@ def _make_core_clp_s_command_and_env_vars(
     archive_id: str,
     search_config: SearchJobConfig,
 ) -> Tuple[Optional[List[str]], Optional[Dict[str, str]]]:
-    archives_dir = worker_config.archive_output.get_directory()
     command = [
         str(clp_home / "bin" / "clp-s"),
         "s",
     ]

+    dataset = search_config.dataset
     if StorageType.S3 == worker_config.archive_output.storage.type:
         s3_config = worker_config.archive_output.storage.s3_config
+        s3_config.key_prefix = f"{s3_config.key_prefix}{dataset}/"
         try:
             s3_url = generate_s3_virtual_hosted_style_url(
                 s3_config.region_code, s3_config.bucket, f"{s3_config.key_prefix}{archive_id}"

@@ -79,6 +80,7 @@ def _make_core_clp_s_command_and_env_vars(
         env_vars = dict(os.environ)
         env_vars.update(get_credential_env_vars(s3_config.aws_authentication))
     else:
+        archives_dir = worker_config.archive_output.get_directory() / dataset
         # fmt: off
         command.extend((
             str(archives_dir),
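
The search task mirrors the stream-extraction task above: the dataset comes from the job's search config, and clp-s is pointed either at a per-dataset archives directory (filesystem storage) or at an archive key built from a per-dataset S3 key prefix. The sketch below shows that derivation with purely illustrative values; the prefix, paths, dataset, and archive ID are assumptions, and the example URL shape is simply the standard virtual-hosted-style form that generate_s3_virtual_hosted_style_url is named after.

from pathlib import Path

# Illustrative values only.
dataset = "default"
archive_id = "abc-123"

# Filesystem storage: clp-s searches the per-dataset archives directory.
archives_dir = Path("/var/data/archives") / dataset  # /var/data/archives/default

# S3 storage: the key prefix gains a dataset segment before the archive key is built,
# so the object addressed by the virtual-hosted-style URL would look like, e.g.,
#   https://my-bucket.s3.us-east-2.amazonaws.com/clp-archives/default/abc-123
key_prefix = "clp-archives/"                # illustrative base prefix
key_prefix = f"{key_prefix}{dataset}/"      # "clp-archives/default/"
archive_key = f"{key_prefix}{archive_id}"   # "clp-archives/default/abc-123"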
