apache · AmatyaAvadhanula · Oct 26, 2024 · Oct 29, 2024 · Oct 30, 2024 · kfaraz
diff --git a/...service/src/main/java/org/apache/druid/indexing/common/actions/SegmentAllocateAction.java b/...service/src/main/java/org/apache/druid/indexing/common/actions/SegmentAllocateAction.java
@@ -32,6 +32,7 @@
 import org.apache.druid.indexing.overlord.LockRequestForNewSegment;
 import org.apache.druid.indexing.overlord.LockResult;
 import org.apache.druid.indexing.overlord.Segments;
+import org.apache.druid.indexing.overlord.TaskLockbox;
 import org.apache.druid.java.util.common.IAE;
 import org.apache.druid.java.util.common.ISE;
 import org.apache.druid.java.util.common.StringUtils;
@@ -219,6 +220,25 @@ public SegmentIdWithShardSpec perform(
       );
     }
     int attempt = 0;
+    final TaskLockbox lockbox = toolbox.getTaskLockbox();
+    if (lockbox.canAllocateSegmentWithReducedMetadataIO(getLockGranularity(), getTaskLockType())) {
+      LockResult result = lockbox.allocateSegmentWithReducedMetadataIO(
+          task,
+          taskLockType,
+          timestamp,
+          queryGranularity,
+          preferredSegmentGranularity,
+          sequenceName,
+          previousSegmentId,
+          skipSegmentLineageCheck,
+          partialShardSpec
+      );
+      if (result.isOk()) {
+        return result.getNewSegmentId();
+      } else {
+        return null;
+      }
+    }
     while (true) {
       attempt++;
 

diff --git a/...ervice/src/main/java/org/apache/druid/indexing/common/actions/SegmentAllocationQueue.java b/...ervice/src/main/java/org/apache/druid/indexing/common/actions/SegmentAllocationQueue.java
@@ -25,7 +25,6 @@
 import org.apache.druid.indexing.common.task.IndexTaskUtils;
 import org.apache.druid.indexing.common.task.Task;
 import org.apache.druid.indexing.overlord.IndexerMetadataStorageCoordinator;
-import org.apache.druid.indexing.overlord.Segments;
 import org.apache.druid.indexing.overlord.TaskLockbox;
 import org.apache.druid.indexing.overlord.config.TaskLockConfig;
 import org.apache.druid.java.util.common.ISE;
@@ -41,6 +40,7 @@
 import org.apache.druid.query.DruidMetrics;
 import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec;
 import org.apache.druid.timeline.DataSegment;
+import org.apache.druid.timeline.Partitions;
 import org.joda.time.Interval;
 
 import java.util.ArrayList;
@@ -70,7 +70,7 @@ public class SegmentAllocationQueue
   private static final Logger log = new Logger(SegmentAllocationQueue.class);
 
   private static final int MAX_QUEUE_SIZE = 2000;
-  private static final int MAX_BATCH_SIZE = 500;
+  private static final int MAX_BATCH_SIZE = 500;
 
   private final long maxWaitTimeMillis;
 
@@ -87,6 +87,8 @@ public class SegmentAllocationQueue
   private final ConcurrentHashMap<AllocateRequestKey, AllocateRequestBatch> keyToBatch = new ConcurrentHashMap<>();
   private final BlockingDeque<AllocateRequestBatch> processingQueue = new LinkedBlockingDeque<>(MAX_QUEUE_SIZE);
 
+  private final boolean skipSegmentPayloadFetchForAllocation;
+
   @Inject
   public SegmentAllocationQueue(
       TaskLockbox taskLockbox,
@@ -100,6 +102,7 @@ public SegmentAllocationQueue(
     this.taskLockbox = taskLockbox;
     this.metadataStorage = metadataStorage;
     this.maxWaitTimeMillis = taskLockConfig.getBatchAllocationWaitTime();
+    this.skipSegmentPayloadFetchForAllocation = taskLockConfig.isSegmentAllocationReduceMetadataIO();
 
     this.executor = taskLockConfig.isBatchSegmentAllocation()
                     ? executorFactory.create(1, "SegmentAllocQueue-%s") : null;
@@ -380,13 +383,11 @@ private boolean processBatch(AllocateRequestBatch requestBatch)
 
   private Set<DataSegment> retrieveUsedSegments(AllocateRequestKey key)
   {
-    return new HashSet<>(
-        metadataStorage.retrieveUsedSegmentsForInterval(
-            key.dataSource,
-            key.preferredAllocationInterval,
-            Segments.ONLY_VISIBLE
-        )
-    );
+    return metadataStorage.getSegmentTimelineForAllocation( // timeline for the key's datasource and preferred interval
+        key.dataSource,
+        key.preferredAllocationInterval,
+        (key.lockGranularity == LockGranularity.TIME_CHUNK) && skipSegmentPayloadFetchForAllocation // TIME_CHUNK only
+    ).findNonOvershadowedObjectsInInterval(Intervals.ETERNITY, Partitions.ONLY_COMPLETE); // non-overshadowed, complete partitions
   }
 
   private int allocateSegmentsForBatch(AllocateRequestBatch requestBatch, Set<DataSegment> usedSegments)
@@ -493,7 +494,8 @@ private int allocateSegmentsForInterval(
         requestKey.dataSource,
         tryInterval,
         requestKey.skipSegmentLineageCheck,
-        requestKey.lockGranularity
+        requestKey.lockGranularity,
+        skipSegmentPayloadFetchForAllocation
     );
 
     int successfulRequests = 0;

diff --git a/.../main/java/org/apache/druid/indexing/common/actions/SegmentTransactionalInsertAction.java b/.../main/java/org/apache/druid/indexing/common/actions/SegmentTransactionalInsertAction.java
@@ -32,6 +32,7 @@
 import org.apache.druid.indexing.overlord.CriticalAction;
 import org.apache.druid.indexing.overlord.DataSourceMetadata;
 import org.apache.druid.indexing.overlord.SegmentPublishResult;
+import org.apache.druid.indexing.overlord.TaskLockbox;
 import org.apache.druid.java.util.common.ISE;
 import org.apache.druid.segment.SegmentSchemaMapping;
 import org.apache.druid.segment.SegmentUtils;
@@ -213,18 +214,25 @@ public SegmentPublishResult perform(Task task, TaskActionToolbox toolbox)
       }
     }
 
+    final TaskLockbox lockbox = toolbox.getTaskLockbox();
     try {
-      retVal = toolbox.getTaskLockbox().doInCriticalSection(
+      retVal = lockbox.doInCriticalSection(
           task,
           allSegments.stream().map(DataSegment::getInterval).collect(Collectors.toSet()),
           CriticalAction.<SegmentPublishResult>builder()
               .onValidLocks(
-                  () -> toolbox.getIndexerMetadataStorageCoordinator().commitSegmentsAndMetadata(
-                      segments,
-                      startMetadata,
-                      endMetadata,
-                      segmentSchemaMapping
-                  )
+                  () -> {
+                    SegmentPublishResult result =
+                        toolbox.getIndexerMetadataStorageCoordinator()
+                               .commitSegmentsAndMetadata(
+                                   segments,
+                                   startMetadata,
+                                   endMetadata,
+                                   segmentSchemaMapping
+                               );
+                    lockbox.cacheSegmentPublishResults(task, result.getSegments());
+                    return result;
+                  }
               )
               .onInvalidLocks(
                   () -> SegmentPublishResult.fail(