@@ -813,6 +813,7 @@ def store(
         objects: pandas.DataFrame,
         sources: pandas.DataFrame | None = None,
         forced_sources: pandas.DataFrame | None = None,
+        maximum_table_length: int = 0,
     ) -> None:
         # docstring is inherited from a base class
         objects = self._fix_input_timestamps(objects)
@@ -828,17 +829,18 @@ def store(

         # fill region partition column for DiaObjects
         objects = self._add_apdb_part(objects)
-        self._storeDiaObjects(objects, visit_time, replica_chunk)
+        self._storeDiaObjects(objects, visit_time, replica_chunk, maximum_table_length)

         if sources is not None:
             # copy apdb_part column from DiaObjects to DiaSources
             sources = self._add_apdb_part(sources)
-            self._storeDiaSources(ApdbTables.DiaSource, sources, replica_chunk)
+            self._storeDiaSources(ApdbTables.DiaSource, sources, replica_chunk, maximum_table_length)
             self._storeDiaSourcesPartitions(sources, visit_time, replica_chunk)

         if forced_sources is not None:
             forced_sources = self._add_apdb_part(forced_sources)
-            self._storeDiaSources(ApdbTables.DiaForcedSource, forced_sources, replica_chunk)
+            self._storeDiaSources(ApdbTables.DiaForcedSource, forced_sources, replica_chunk,
+                                  maximum_table_length)

     def storeSSObjects(self, objects: pandas.DataFrame) -> None:
         # docstring is inherited from a base class
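
Note: with this change a caller of `store()` can cap how many rows go into a
single write operation. A minimal usage sketch follows; the `apdb` instance,
the input catalogs, and the 10_000 cap are illustrative assumptions, not part
of this commit:

    import astropy.time

    visit_time = astropy.time.Time("2025-01-01T00:00:00", scale="tai")
    apdb.store(
        visit_time,
        objects=dia_objects,          # pandas.DataFrame of DiaObject records
        sources=dia_sources,          # pandas.DataFrame of DiaSource records, or None
        forced_sources=None,
        maximum_table_length=10_000,  # cap rows per write; 0 keeps the built-in default
    )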
@@ -1183,7 +1185,11 @@ def _deleteMovingObjects(self, objs: pandas.DataFrame) -> None:
             timer.add_values(row_count=len(batch))

     def _storeDiaObjects(
-        self, objs: pandas.DataFrame, visit_time: astropy.time.Time, replica_chunk: ReplicaChunk | None
+        self,
+        objs: pandas.DataFrame,
+        visit_time: astropy.time.Time,
+        replica_chunk: ReplicaChunk | None,
+        maximum_table_length: int,
     ) -> None:
         """Store catalog of DiaObjects from current visit.

@@ -1195,6 +1201,8 @@ def _storeDiaObjects(
             Time of the current visit.
         replica_chunk : `ReplicaChunk` or `None`
             Replica chunk identifier if replication is configured.
+        maximum_table_length : `int`
+            Maximum table length to write in a single operation.
         """
         if len(objs) == 0:
             _LOG.debug("No objects to write to database.")
@@ -1229,6 +1237,7 @@ def _storeDiaSources(
         table_name: ApdbTables,
         sources: pandas.DataFrame,
         replica_chunk: ReplicaChunk | None,
+        maximum_table_length: int,
     ) -> None:
         """Store catalog of DIASources or DIAForcedSources from current visit.

@@ -1238,38 +1247,41 @@ def _storeDiaSources(
             Table where to store the data.
         sources : `pandas.DataFrame`
             Catalog containing DiaSource records
-        visit_time : `astropy.time.Time`
-            Time of the current visit.
         replica_chunk : `ReplicaChunk` or `None`
             Replica chunk identifier if replication is configured.
+        maximum_table_length : `int`
+            Maximum table length to write in a single operation.
         """
         # Time partitioning has to be based on midpointMjdTai, not visit_time
         # as visit_time is not really a visit time.
         tp_sources = sources.copy(deep=False)
         tp_sources["apdb_time_part"] = tp_sources["midpointMjdTai"].apply(self._time_partition)
         extra_columns: dict[str, Any] = {}
         if not self.config.partitioning.time_partition_tables:
-            self._storeObjectsPandas(tp_sources, table_name)
+            self._storeObjectsPandas(tp_sources, table_name, maximum_table_length=maximum_table_length)
         else:
             # Group by time partition
             partitions = set(tp_sources["apdb_time_part"])
             if len(partitions) == 1:
                 # Single partition - just save the whole thing.
                 time_part = partitions.pop()
-                self._storeObjectsPandas(sources, table_name, time_part=time_part)
+                self._storeObjectsPandas(sources, table_name, time_part=time_part,
+                                         maximum_table_length=maximum_table_length)
             else:
                 # group by time partition.
                 for time_part, sub_frame in tp_sources.groupby(by="apdb_time_part"):
                     sub_frame.drop(columns="apdb_time_part", inplace=True)
-                    self._storeObjectsPandas(sub_frame, table_name, time_part=time_part)
+                    self._storeObjectsPandas(sub_frame, table_name, time_part=time_part,
+                                             maximum_table_length=maximum_table_length)

         if replica_chunk is not None:
             extra_columns = dict(apdb_replica_chunk=replica_chunk.id)
             if table_name is ApdbTables.DiaSource:
                 extra_table = ExtraTables.DiaSourceChunks
             else:
                 extra_table = ExtraTables.DiaForcedSourceChunks
-            self._storeObjectsPandas(sources, extra_table, extra_columns=extra_columns)
+            self._storeObjectsPandas(sources, extra_table, extra_columns=extra_columns,
+                                     maximum_table_length=maximum_table_length)

     def _storeDiaSourcesPartitions(
         self, sources: pandas.DataFrame, visit_time: astropy.time.Time, replica_chunk: ReplicaChunk | None
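
Note: the hunk above computes an `apdb_time_part` value from each source's
`midpointMjdTai` and issues one write per time partition. The following
standalone sketch shows the same pandas pattern; the partition function and
the 30-day partition width are assumptions for illustration, not this file's
actual `_time_partition` implementation:

    import pandas

    def time_partition(mjd: float, days_per_partition: int = 30) -> int:
        # Map an MJD timestamp to an integer partition index.
        return int(mjd // days_per_partition)

    sources = pandas.DataFrame({"midpointMjdTai": [60000.1, 60000.9, 60042.5]})
    tp_sources = sources.copy(deep=False)
    tp_sources["apdb_time_part"] = tp_sources["midpointMjdTai"].apply(time_partition)

    for time_part, sub_frame in tp_sources.groupby(by="apdb_time_part"):
        sub_frame = sub_frame.drop(columns="apdb_time_part")
        # One write per per-partition table: partitions 2000 and 2001 here.
        print(time_part, len(sub_frame))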
@@ -1299,6 +1311,7 @@ def _storeObjectsPandas(
         table_name: ApdbTables | ExtraTables,
         extra_columns: Mapping | None = None,
         time_part: int | None = None,
+        maximum_table_length: int = 0,
     ) -> None:
         """Store generic objects.

@@ -1316,6 +1329,8 @@ def _storeObjectsPandas(
             columns exist there.
         time_part : `int`, optional
             If not `None` then insert into a per-partition table.
+        maximum_table_length : `int`, optional
+            Maximum table length to write in a single operation.

         Notes
         -----
@@ -1329,6 +1344,9 @@ def _storeObjectsPandas(
             extra_columns = {}
         extra_fields = list(extra_columns.keys())

+        if maximum_table_length == 0:
+            maximum_table_length = 50_000_000
+
         # Fields that will come from dataframe.
         df_fields = [column for column in records.columns if column not in extra_fields]

@@ -1359,7 +1377,7 @@ def _storeObjectsPandas(
         # Cassandra has 64k limit on batch size, normally that should be
         # enough but some tests generate too many forced sources.
         queries = []
-        for rec_chunk in chunk_iterable(records.itertuples(index=False), 50_000_000):
+        for rec_chunk in chunk_iterable(records.itertuples(index=False), maximum_table_length):
             batch = cassandra.query.BatchStatement()
             for rec in rec_chunk:
                 values = []
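
Note: `_storeObjectsPandas()` treats `maximum_table_length == 0` as "use the
previously hardcoded value of 50_000_000", and each chunk yielded by
`chunk_iterable()` becomes one BatchStatement. A self-contained sketch of the
chunking behaviour relied on here (an assumed stand-in for illustration, not
the actual `lsst.utils` implementation):

    import itertools
    from collections.abc import Iterable, Iterator

    def chunk_iterable(data: Iterable, chunk_size: int) -> Iterator[tuple]:
        # Yield successive tuples of at most chunk_size items.
        it = iter(data)
        while chunk := tuple(itertools.islice(it, chunk_size)):
            yield chunk

    records = range(10)
    assert [len(c) for c in chunk_iterable(records, 4)] == [4, 4, 2]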