Changes from all commits

25 commits
d963a48
6471 Set up test for simultaneous processes splitting partition tree
patchwork01 Jan 27, 2026
cca51a2
6471: Update BulkImportJobDriver to retry partition splits
Rob9786 Jan 27, 2026
e4e54ee
6471 Extract PartitionPreSplitter
patchwork01 Jan 27, 2026
48bd211
6471: Added test to check nothings happened
Rob9786 Jan 27, 2026
f2fde56
6471 Move assertion data to assertion
patchwork01 Jan 27, 2026
3e014ed
6471 Tidy tests
patchwork01 Jan 27, 2026
83e0863
6471 Test when not enough data is present
patchwork01 Jan 27, 2026
b405016
6471 Add test stub to limit number of retries
patchwork01 Jan 27, 2026
0f97207
6471 Test for retrying partition splitting
patchwork01 Jan 27, 2026
db366a0
6471: Removed test checking if partitions split by another process
Rob9786 Jan 27, 2026
7a62b9b
6471: Added new bulk import table property
Rob9786 Jan 27, 2026
984550c
6471 Adjust property description
patchwork01 Jan 27, 2026
9bf5d82
6471: Added loop count
Rob9786 Jan 29, 2026
44e9fdb
6471: Added retry limit test
Rob9786 Jan 29, 2026
7817ca1
Merge branch 'develop' into 6471-extend-partition-tree-conflict
Rob9786 Jan 30, 2026
4ea2d76
6471: Update to call to FixedStateStoreProvider
Rob9786 Jan 30, 2026
38a084c
6471: Updated comments after review
Rob9786 Feb 2, 2026
461d380
6471: Updates to partitionsRepresentation
Rob9786 Feb 2, 2026
7994621
6471: Add disabled flag to test
Rob9786 Feb 3, 2026
dd431c3
Merge remote-tracking branch 'origin/develop' into 6471-extend-partit…
patchwork01 Feb 4, 2026
fac976e
6471 Fix test extending tree from other process
patchwork01 Feb 4, 2026
f3bc758
6471 Tidy partitions representation code
patchwork01 Feb 4, 2026
e58986e
6471 Test only retrying once
patchwork01 Feb 4, 2026
c388b02
6471 Refactor faking other process extending partition tree
patchwork01 Feb 4, 2026
080a604
6471 Refactor tests further
patchwork01 Feb 4, 2026
@@ -28,6 +28,7 @@ The following instance properties relate to default values used by table properties
| sleeper.default.table.statestore.transactionlog.delete.behind.snapshot.min.age.minutes | The minimum age in minutes of a snapshot in order to allow deletion of transactions leading up to it. When deleting old transactions, there's a chance that processes may still read transactions starting from an older snapshot. We need to avoid deletion of any transactions associated with a snapshot that may still be used as the starting point for reading the log. | 2 | false |
| sleeper.default.table.statestore.transactionlog.delete.number.behind.latest.snapshot | The minimum number of transactions that a transaction must be behind the latest snapshot before being deleted. This is the number of transactions that will be kept and protected from deletion, whenever old transactions are deleted. This includes the transaction that the latest snapshot was created against. Any transactions after the snapshot will never be deleted as they are still in active use.<br>This should be configured in relation to the property which determines whether a process will load the latest snapshot or instead seek through the transaction log, since we need to preserve transactions that may still be read:<br>sleeper.default.statestore.snapshot.load.min.transactions.ahead<br>The snapshot that will be considered the latest snapshot is configured by a property to set the minimum age for it to count for this:<br>sleeper.default.statestore.transactionlog.delete.behind.snapshot.min.age<br> | 200 | false |
| sleeper.default.table.bulk.import.min.leaf.partitions | Specifies the minimum number of leaf partitions that are needed to run a bulk import job. If this minimum has not been reached, bulk import jobs will refuse to start. | 256 | false |
| sleeper.default.table.bulk.import.partition.splitting.attempts | Specifies the number of times a bulk import job will attempt to create leaf partitions to meet the minimum number of leaf partitions. The split will be retried if another process splits the same partitions at the same time. | 3 | false |
| sleeper.default.table.ingest.batcher.job.min.size | Specifies the minimum total file size required for an ingest job to be batched and sent. An ingest job will be created if the batcher runs while this much data is waiting, and the minimum number of files is also met. | 1G | false |
| sleeper.default.table.ingest.batcher.job.max.size | Specifies the maximum total file size for a job in the ingest batcher. If more data is waiting than this, it will be split into multiple jobs. If a single file exceeds this, it will still be ingested in its own job. It's also possible some data may be left for a future run of the batcher if some recent files overflow the size of a job but aren't enough to create a job on their own. | 5G | false |
| sleeper.default.table.ingest.batcher.job.min.files | Specifies the minimum number of files for a job in the ingest batcher. An ingest job will be created if the batcher runs while this many files are waiting, and the minimum size of files is also met. | 1 | false |
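The new instance-level default above pairs with the per-table property of the same name (without the `default.` segment), which takes precedence when set on a table. As a rough illustration of how the two might be combined, the value 5 below is purely an example and not a recommendation:

```properties
# instance.properties - instance-wide default applied to all tables
sleeper.default.table.bulk.import.partition.splitting.attempts=3

# table.properties - per-table setting, overrides the instance-wide default
sleeper.table.bulk.import.partition.splitting.attempts=5
```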
1 change: 1 addition & 0 deletions docs/usage/properties/table/bulk_import.md
@@ -14,4 +14,5 @@ The following table properties relate to bulk import, i.e. ingesting data using
| sleeper.table.bulk.import.emr.executor.max.capacity | (Non-persistent EMR mode only) The maximum number of capacity units to provision as EC2 instances for executors in the EMR cluster.<br>This is measured in instance fleet capacity units. These are declared alongside the requested instance types, as each type will count for a certain number of units. By default the units are the number of instances.<br>This value overrides the default value in the instance properties. It can be overridden by a value in the bulk import job specification. | 10 |
| sleeper.table.bulk.import.emr.release.label | (Non-persistent EMR mode only) The EMR release label to be used when creating an EMR cluster for bulk importing data using Spark running on EMR.<br>This value overrides the default value in the instance properties. It can be overridden by a value in the bulk import job specification. | emr-7.12.0 |
| sleeper.table.bulk.import.min.leaf.partitions | Specifies the minimum number of leaf partitions that are needed to run a bulk import job. If this minimum has not been reached, bulk import jobs will refuse to start | 256 |
| sleeper.table.bulk.import.partition.splitting.attempts | Specifies the number of times a bulk import job will attempt to create leaf partitions to meet the minimum number of leaf partitions. The split will be retried if another process splits the same partitions at the same time. | 3 |
| sleeper.table.bulk.import.job.files.commit.async | If true, bulk import will add files via requests sent to the state store committer lambda asynchronously. If false, bulk import will commit new files at the end of the job synchronously.<br>This is only applied if async commits are enabled for the table. The default value is set in an instance property. | true |
6 changes: 6 additions & 0 deletions example/full/instance.properties
@@ -1915,6 +1915,12 @@ sleeper.logging.root.level=INFO
# (default value shown below, uncomment to set a value)
# sleeper.default.table.bulk.import.min.leaf.partitions=256

# Specifies the number of times a bulk import job will attempt to create leaf partitions to meet the
# minimum number of leaf partitions. The split will be retried if another process splits the same
# partitions at the same time.
# (default value shown below, uncomment to set a value)
# sleeper.default.table.bulk.import.partition.splitting.attempts=3

# Specifies the minimum total file size required for an ingest job to be batched and sent. An ingest
# job will be created if the batcher runs while this much data is waiting, and the minimum number of
# files is also met.
6 changes: 6 additions & 0 deletions example/full/table.properties
@@ -503,6 +503,12 @@ sleeper.table.statestore.classname=DynamoDBTransactionLogStateStore
# (default value shown below, uncomment to set a value)
# sleeper.table.bulk.import.min.leaf.partitions=256

# Specifies the number of times a bulk import job will attempt to create leaf partitions to meet the
# minimum number of leaf partitions. The split will be retried if another process splits the same
# partitions at the same time.
# (default value shown below, uncomment to set a value)
# sleeper.table.bulk.import.partition.splitting.attempts=3

# If true, bulk import will add files via requests sent to the state store committer lambda
# asynchronously. If false, bulk import will commit new files at the end of the job synchronously.
# This is only applied if async commits are enabled for the table. The default value is set in an
@@ -31,7 +31,6 @@
import sleeper.configuration.properties.S3InstanceProperties;
import sleeper.configuration.properties.S3TableProperties;
import sleeper.core.partition.Partition;
import sleeper.core.partition.PartitionTree;
import sleeper.core.properties.instance.InstanceProperties;
import sleeper.core.properties.table.TableProperties;
import sleeper.core.properties.table.TablePropertiesProvider;
@@ -53,7 +52,6 @@
import sleeper.core.util.LoggedDuration;
import sleeper.ingest.tracker.job.IngestJobTrackerFactory;
import sleeper.sketches.Sketches;
import sleeper.splitter.core.extend.ExtendPartitionTreeBasedOnSketches;
import sleeper.statestore.StateStoreFactory;
import sleeper.statestore.commit.SqsFifoStateStoreCommitRequestSender;

@@ -70,7 +68,6 @@
import java.util.function.Supplier;

import static sleeper.core.properties.table.TableProperty.BULK_IMPORT_FILES_COMMIT_ASYNC;
import static sleeper.core.properties.table.TableProperty.BULK_IMPORT_MIN_LEAF_PARTITION_COUNT;

/**
* Executes a Spark job that reads input Parquet files and writes to a Sleeper table. This takes a
@@ -83,14 +80,13 @@ public class BulkImportJobDriver<C extends BulkImportContext<C>> {
private static final Logger LOGGER = LoggerFactory.getLogger(BulkImportJobDriver.class);

private final ContextCreator<C> contextCreator;
private final DataSketcher<C> dataSketcher;
private final PartitionPreSplitter<C> preSplitter;
private final BulkImporter<C> bulkImporter;
private final TablePropertiesProvider tablePropertiesProvider;
private final StateStoreProvider stateStoreProvider;
private final IngestJobTracker tracker;
private final StateStoreCommitRequestSender asyncSender;
private final Supplier<Instant> getTime;
private final Supplier<String> partitionIdSupplier;

public BulkImportJobDriver(
ContextCreator<C> contextCreator,
@@ -103,14 +99,13 @@ public BulkImportJobDriver(
Supplier<Instant> getTime,
Supplier<String> partitionIdSupplier) {
this.contextCreator = contextCreator;
this.dataSketcher = dataSketcher;
this.preSplitter = new PartitionPreSplitter<>(dataSketcher, stateStoreProvider, partitionIdSupplier);
this.bulkImporter = bulkImporter;
this.tablePropertiesProvider = tablePropertiesProvider;
this.stateStoreProvider = stateStoreProvider;
this.tracker = tracker;
this.asyncSender = asyncSender;
this.getTime = getTime;
this.partitionIdSupplier = partitionIdSupplier;
}

public void run(BulkImportJob job, String jobRunId, String taskId) throws IOException {
@@ -129,7 +124,7 @@ public void run(BulkImportJob job, String jobRunId, String taskId) throws IOException {
// Note that we stop the Spark context after we've applied the changes in Sleeper.
try (C context = contextCreator.createContext(tableProperties, allPartitions, job)) {

C contextAfterSplit = preSplitPartitionsIfNecessary(tableProperties, allPartitions, context);
C contextAfterSplit = preSplitter.preSplitPartitionsIfNecessary(tableProperties, allPartitions, context);

Instant startTime = getTime.get();
tracker.jobStarted(IngestJobStartedEvent.builder()
@@ -153,24 +148,6 @@ public void run(BulkImportJob job, String jobRunId, String taskId) throws IOException {
}
}

private C preSplitPartitionsIfNecessary(TableProperties tableProperties, List<Partition> allPartitions, C context) {
PartitionTree tree = new PartitionTree(allPartitions);
List<Partition> leafPartitions = tree.getLeafPartitions();
int minLeafPartitions = tableProperties.getInt(BULK_IMPORT_MIN_LEAF_PARTITION_COUNT);
if (leafPartitions.size() < minLeafPartitions) {
LOGGER.info("Extending partition tree from {} leaf partitions to {}", leafPartitions.size(), minLeafPartitions);
Map<String, Sketches> partitionIdToSketches = dataSketcher.generatePartitionIdToSketches(context);
StateStore stateStore = stateStoreProvider.getStateStore(tableProperties);
ExtendPartitionTreeBasedOnSketches.forBulkImport(tableProperties, partitionIdSupplier)
.createTransaction(tree, partitionIdToSketches)
.synchronousCommit(stateStore);
return context.withPartitions(stateStore.getAllPartitions());
} else {
LOGGER.info("Partition tree meets minimum of {} leaf partitions", minLeafPartitions);
return context;
}
}

private void commitSuccessfulJob(TableProperties tableProperties, IngestJobRunIds runIds, Instant startTime, List<FileReference> fileReferences) {

Instant finishTime = getTime.get();
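The extracted `PartitionPreSplitter` class itself is not shown in this diff view, only its construction and use in `BulkImportJobDriver`. The following is a minimal sketch of how the retry behaviour described by `sleeper.table.bulk.import.partition.splitting.attempts` could be wired around the logic removed from the driver above. The class name, constructor arguments and method name are taken from the driver changes; the property constant name `BULK_IMPORT_PARTITION_SPLITTING_ATTEMPTS` and the treatment of a conflicting split as a caught commit failure are assumptions, not taken from the PR.

```java
// Sketch only - not the PR's actual PartitionPreSplitter. Imports match those already
// used in BulkImportJobDriver above and are omitted here for brevity.
public class PartitionPreSplitter<C extends BulkImportContext<C>> {
    private static final Logger LOGGER = LoggerFactory.getLogger(PartitionPreSplitter.class);

    private final DataSketcher<C> dataSketcher;
    private final StateStoreProvider stateStoreProvider;
    private final Supplier<String> partitionIdSupplier;

    public PartitionPreSplitter(DataSketcher<C> dataSketcher, StateStoreProvider stateStoreProvider, Supplier<String> partitionIdSupplier) {
        this.dataSketcher = dataSketcher;
        this.stateStoreProvider = stateStoreProvider;
        this.partitionIdSupplier = partitionIdSupplier;
    }

    public C preSplitPartitionsIfNecessary(TableProperties tableProperties, List<Partition> allPartitions, C context) {
        int minLeafPartitions = tableProperties.getInt(BULK_IMPORT_MIN_LEAF_PARTITION_COUNT);
        // Assumed constant name for sleeper.table.bulk.import.partition.splitting.attempts
        int maxAttempts = tableProperties.getInt(BULK_IMPORT_PARTITION_SPLITTING_ATTEMPTS);
        StateStore stateStore = stateStoreProvider.getStateStore(tableProperties);
        List<Partition> partitions = allPartitions;
        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
            PartitionTree tree = new PartitionTree(partitions);
            int leafCount = tree.getLeafPartitions().size();
            if (leafCount >= minLeafPartitions) {
                LOGGER.info("Partition tree meets minimum of {} leaf partitions", minLeafPartitions);
                return context.withPartitions(partitions);
            }
            LOGGER.info("Extending partition tree from {} leaf partitions to {} (attempt {} of {})",
                    leafCount, minLeafPartitions, attempt, maxAttempts);
            Map<String, Sketches> partitionIdToSketches = dataSketcher.generatePartitionIdToSketches(context);
            try {
                ExtendPartitionTreeBasedOnSketches.forBulkImport(tableProperties, partitionIdSupplier)
                        .createTransaction(tree, partitionIdToSketches)
                        .synchronousCommit(stateStore);
                return context.withPartitions(stateStore.getAllPartitions());
            } catch (RuntimeException e) {
                // Assumption: a split applied concurrently by another process surfaces as a
                // failed commit. Reload the partitions and retry against the updated tree.
                LOGGER.warn("Partition pre-split conflicted with another process, retrying", e);
                partitions = stateStore.getAllPartitions();
            }
        }
        // Attempts exhausted: continue the import against whatever partitions now exist.
        return context.withPartitions(partitions);
    }
}
```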