
Commit 2543af7

Make maximum table size configurable in Cassandra
1 parent 5477a8d

File tree

2 files changed (+30, -11 lines)

python/lsst/dax/apdb/cassandra/apdbCassandra.py

Lines changed: 29 additions & 11 deletions
@@ -813,6 +813,7 @@ def store(
         objects: pandas.DataFrame,
         sources: pandas.DataFrame | None = None,
         forced_sources: pandas.DataFrame | None = None,
+        maximum_table_length: int = 0,
     ) -> None:
         # docstring is inherited from a base class
         objects = self._fix_input_timestamps(objects)
@@ -828,17 +829,18 @@ def store(
 
         # fill region partition column for DiaObjects
         objects = self._add_apdb_part(objects)
-        self._storeDiaObjects(objects, visit_time, replica_chunk)
+        self._storeDiaObjects(objects, visit_time, replica_chunk, maximum_table_length)
 
         if sources is not None:
             # copy apdb_part column from DiaObjects to DiaSources
             sources = self._add_apdb_part(sources)
-            self._storeDiaSources(ApdbTables.DiaSource, sources, replica_chunk)
+            self._storeDiaSources(ApdbTables.DiaSource, sources, replica_chunk, maximum_table_length)
             self._storeDiaSourcesPartitions(sources, visit_time, replica_chunk)
 
         if forced_sources is not None:
             forced_sources = self._add_apdb_part(forced_sources)
-            self._storeDiaSources(ApdbTables.DiaForcedSource, forced_sources, replica_chunk)
+            self._storeDiaSources(ApdbTables.DiaForcedSource, forced_sources, replica_chunk,
+                                  maximum_table_length)
 
     def storeSSObjects(self, objects: pandas.DataFrame) -> None:
         # docstring is inherited from a base class
@@ -1183,7 +1185,11 @@ def _deleteMovingObjects(self, objs: pandas.DataFrame) -> None:
             timer.add_values(row_count=len(batch))
 
     def _storeDiaObjects(
-        self, objs: pandas.DataFrame, visit_time: astropy.time.Time, replica_chunk: ReplicaChunk | None
+        self,
+        objs: pandas.DataFrame,
+        visit_time: astropy.time.Time,
+        replica_chunk: ReplicaChunk | None,
+        maximum_table_length: int,
     ) -> None:
         """Store catalog of DiaObjects from current visit.
@@ -1195,6 +1201,8 @@ def _storeDiaObjects(
             Time of the current visit.
         replica_chunk : `ReplicaChunk` or `None`
             Replica chunk identifier if replication is configured.
+        maximum_table_length : `int`
+            Maximum table length to write in a single operation.
         """
         if len(objs) == 0:
            _LOG.debug("No objects to write to database.")
@@ -1229,6 +1237,7 @@ def _storeDiaSources(
         table_name: ApdbTables,
         sources: pandas.DataFrame,
         replica_chunk: ReplicaChunk | None,
+        maximum_table_length: int,
     ) -> None:
         """Store catalog of DIASources or DIAForcedSources from current visit.
@@ -1238,38 +1247,41 @@ def _storeDiaSources(
             Table where to store the data.
         sources : `pandas.DataFrame`
             Catalog containing DiaSource records
-        visit_time : `astropy.time.Time`
-            Time of the current visit.
         replica_chunk : `ReplicaChunk` or `None`
             Replica chunk identifier if replication is configured.
+        maximum_table_length : `int`
+            Maximum table length to write in a single operation.
         """
         # Time partitioning has to be based on midpointMjdTai, not visit_time
         # as visit_time is not really a visit time.
         tp_sources = sources.copy(deep=False)
         tp_sources["apdb_time_part"] = tp_sources["midpointMjdTai"].apply(self._time_partition)
         extra_columns: dict[str, Any] = {}
         if not self.config.partitioning.time_partition_tables:
-            self._storeObjectsPandas(tp_sources, table_name)
+            self._storeObjectsPandas(tp_sources, table_name, maximum_table_length=maximum_table_length)
         else:
             # Group by time partition
             partitions = set(tp_sources["apdb_time_part"])
             if len(partitions) == 1:
                 # Single partition - just save the whole thing.
                 time_part = partitions.pop()
-                self._storeObjectsPandas(sources, table_name, time_part=time_part)
+                self._storeObjectsPandas(sources, table_name, time_part=time_part,
+                                         maximum_table_length=maximum_table_length)
             else:
                 # group by time partition.
                 for time_part, sub_frame in tp_sources.groupby(by="apdb_time_part"):
                     sub_frame.drop(columns="apdb_time_part", inplace=True)
-                    self._storeObjectsPandas(sub_frame, table_name, time_part=time_part)
+                    self._storeObjectsPandas(sub_frame, table_name, time_part=time_part,
+                                             maximum_table_length=maximum_table_length)
 
         if replica_chunk is not None:
             extra_columns = dict(apdb_replica_chunk=replica_chunk.id)
             if table_name is ApdbTables.DiaSource:
                 extra_table = ExtraTables.DiaSourceChunks
             else:
                 extra_table = ExtraTables.DiaForcedSourceChunks
-            self._storeObjectsPandas(sources, extra_table, extra_columns=extra_columns)
+            self._storeObjectsPandas(sources, extra_table, extra_columns=extra_columns,
+                                     maximum_table_length=maximum_table_length)
 
     def _storeDiaSourcesPartitions(
         self, sources: pandas.DataFrame, visit_time: astropy.time.Time, replica_chunk: ReplicaChunk | None
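As an aside on the grouping branch in the hunk above, here is a minimal, standalone sketch (with a hypothetical time_partition function standing in for the class's _time_partition) of how a source catalog is split into per-partition frames, each of which then gets its own write:

import pandas

# Hypothetical stand-in for self._time_partition: 30-day partitions by MJD.
def time_partition(mjd: float) -> int:
    return int(mjd // 30)

sources = pandas.DataFrame({"midpointMjdTai": [60000.1, 60000.2, 60031.5]})
sources["apdb_time_part"] = sources["midpointMjdTai"].apply(time_partition)

# One write per time partition, mirroring the groupby branch above.
for time_part, sub_frame in sources.groupby(by="apdb_time_part"):
    sub_frame = sub_frame.drop(columns="apdb_time_part")
    print(time_part, len(sub_frame))  # e.g. 2000 -> 2 rows, 2001 -> 1 row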
@@ -1299,6 +1311,7 @@ def _storeObjectsPandas(
         table_name: ApdbTables | ExtraTables,
         extra_columns: Mapping | None = None,
         time_part: int | None = None,
+        maximum_table_length: int = 0,
     ) -> None:
         """Store generic objects.
@@ -1316,6 +1329,8 @@ def _storeObjectsPandas(
             columns exist there.
         time_part : `int`, optional
             If not `None` then insert into a per-partition table.
+        maximum_table_length : `int`, optional
+            Maximum table length to write in a single operation.
 
         Notes
         -----
@@ -1329,6 +1344,9 @@ def _storeObjectsPandas(
             extra_columns = {}
         extra_fields = list(extra_columns.keys())
 
+        if maximum_table_length == 0:
+            maximum_table_length = 50_000_000
+
         # Fields that will come from dataframe.
         df_fields = [column for column in records.columns if column not in extra_fields]
 
@@ -1359,7 +1377,7 @@ def _storeObjectsPandas(
         # Cassandra has 64k limit on batch size, normally that should be
         # enough but some tests generate too many forced sources.
         queries = []
-        for rec_chunk in chunk_iterable(records.itertuples(index=False), 50_000_000):
+        for rec_chunk in chunk_iterable(records.itertuples(index=False), maximum_table_length):
            batch = cassandra.query.BatchStatement()
            for rec in rec_chunk:
                values = []
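Taken together, the _storeObjectsPandas hunks amount to the following pattern. This is a self-contained sketch, not the project's code: write_batch stands in for building and executing a cassandra.query.BatchStatement, and chunk_rows mimics the chunk_iterable helper used in the diff.

from itertools import islice
from typing import Any, Iterable, Iterator


def chunk_rows(rows: Iterable[Any], chunk_size: int) -> Iterator[list[Any]]:
    # Yield successive lists of at most chunk_size rows.
    it = iter(rows)
    while chunk := list(islice(it, chunk_size)):
        yield chunk


def write_batch(batch: list[Any]) -> None:
    # Stand-in for one Cassandra batch write.
    print(f"writing batch of {len(batch)} rows")


def store_rows(rows: Iterable[Any], maximum_table_length: int = 0) -> None:
    if maximum_table_length == 0:
        # Zero means "use the default", matching the previously hard-coded value.
        maximum_table_length = 50_000_000
    for batch in chunk_rows(rows, maximum_table_length):
        write_batch(batch)


store_rows(range(10), maximum_table_length=4)  # batches of 4, 4 and 2 rows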

python/lsst/dax/apdb/sql/apdbSql.py

Lines changed: 1 addition & 0 deletions
@@ -640,6 +640,7 @@ def store(
         objects: pandas.DataFrame,
         sources: pandas.DataFrame | None = None,
         forced_sources: pandas.DataFrame | None = None,
+        maximum_table_length: int = 0,
     ) -> None:
         # docstring is inherited from a base class
         objects = self._fix_input_timestamps(objects)
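On the caller side, the new keyword is the only change to the store() signature in both backends. A hypothetical usage fragment, not a runnable script: apdb is assumed to be an already-configured ApdbCassandra or ApdbSql instance, with construction and the catalog DataFrames omitted.

# `apdb`, `objects`, `sources` and `forced_sources` are assumed to exist;
# `visit_time` is an astropy.time.Time for the current visit.
apdb.store(
    visit_time,
    objects,
    sources=sources,
    forced_sources=forced_sources,
    maximum_table_length=100_000,  # rows per write; 0 keeps the backend default
)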
