fix(package): Remove archive_storage_type column from the datasets table since it should be a cluster-level setting. (#1029)

haiqi96 · kirkrodrigues · web-flow · commit 8ccccce7998f · 2025-06-25T14:52:15.000-04:00
Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com>
diff --git a/components/clp-py-utils/clp_py_utils/clp_metadata_db_utils.py b/components/clp-py-utils/clp_py_utils/clp_metadata_db_utils.py
@@ -9,7 +9,6 @@
     COLUMN_METADATA_TABLE_SUFFIX,
     DATASETS_TABLE_SUFFIX,
     FILES_TABLE_SUFFIX,
-    StorageType,
     TAGS_TABLE_SUFFIX,
 )
 
@@ -110,7 +109,6 @@ def create_datasets_table(db_cursor, table_prefix: str) -> None:
         f"""
         CREATE TABLE IF NOT EXISTS `{table_prefix}{DATASETS_TABLE_SUFFIX}` (
             `name` VARCHAR(255) NOT NULL,
-            `archive_storage_type` VARCHAR(64) NOT NULL,
             `archive_storage_directory` VARCHAR(4096) NOT NULL,
             PRIMARY KEY (`name`)
         )
@@ -123,7 +121,6 @@ def add_dataset(
     db_cursor,
     table_prefix: str,
     dataset_name: str,
-    archive_storage_type: StorageType,
     dataset_archive_storage_directory: Path,
 ) -> None:
     """
@@ -134,16 +131,13 @@ def add_dataset(
     :param db_cursor: The database cursor to execute the table row insertion.
     :param table_prefix: A string to prepend to the table name.
     :param dataset_name:
-    :param archive_storage_type:
     :param dataset_archive_storage_directory:
     """
     query = f"""INSERT INTO `{table_prefix}{DATASETS_TABLE_SUFFIX}`
-                (name, archive_storage_type, archive_storage_directory)
-                VALUES (%s, %s, %s)
+                (name, archive_storage_directory)
+                VALUES (%s, %s)
                 """
-    db_cursor.execute(
-        query, (dataset_name, archive_storage_type, str(dataset_archive_storage_directory))
-    )
+    db_cursor.execute(query, (dataset_name, str(dataset_archive_storage_directory)))
     create_metadata_db_tables(db_cursor, table_prefix, dataset_name)
     db_conn.commit()
 
diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py
@@ -200,7 +200,6 @@ def search_and_schedule_new_tasks(
                     db_cursor,
                     table_prefix,
                     dataset_name,
-                    clp_archive_output.storage.type,
                     archive_storage_directory,
                 )
                 existing_datasets.add(dataset_name)
diff --git a/docs/src/dev-guide/design-metadata-db.md b/docs/src/dev-guide/design-metadata-db.md
@@ -14,11 +14,10 @@ accommodate all expected values.
 (table-1)=
 :::{card}
 
-| Column name               | Type          | Description                                                              |
-|---------------------------|---------------|--------------------------------------------------------------------------|
-| name                      | VARCHAR(255)  | The *unique* name of the dataset.                                        |
-| archive_storage_type      | VARCHAR(64)   | The storage type (e.g., `s3`) where archives are stored.                 |
-| archive_storage_directory | VARCHAR(4096) | The directory (on the `archive_storage_type`) where archives are stored. |
+| Column name               | Type          | Description                                                                         |
+|---------------------------|---------------|-------------------------------------------------------------------------------------|
+| name                      | VARCHAR(255)  | The *unique* name of the dataset.                                                   |
+| archive_storage_directory | VARCHAR(4096) | The directory (relative to the storage type, e.g., `s3`) where archives are stored. |
 
 +++
 **Table 1**: The high-level schema of CLP's datasets table.

Original file line number	Diff line number	Diff line change
`@@ -200,7 +200,6 @@ def search_and_schedule_new_tasks(`
`200`	`200`	`db_cursor,`
`201`	`201`	`table_prefix,`
`202`	`202`	`dataset_name,`
`203`		`- clp_archive_output.storage.type,`
`204`	`203`	`archive_storage_directory,`
`205`	`204`	`)`
`206`	`205`	`existing_datasets.add(dataset_name)`