Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster batch segment allocation by reducing metadata IO #17420

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.druid.indexing.overlord.LockRequestForNewSegment;
import org.apache.druid.indexing.overlord.LockResult;
import org.apache.druid.indexing.overlord.Segments;
import org.apache.druid.indexing.overlord.TaskLockbox;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.StringUtils;
Expand Down Expand Up @@ -219,6 +220,25 @@ public SegmentIdWithShardSpec perform(
);
}
int attempt = 0;
final TaskLockbox lockbox = toolbox.getTaskLockbox();
if (lockbox.canAllocateSegmentWithReducedMetadataIO(getLockGranularity(), getTaskLockType())) {
LockResult result = lockbox.allocateSegmentWithReducedMetadataIO(
task,
taskLockType,
timestamp,
queryGranularity,
preferredSegmentGranularity,
sequenceName,
previousSegmentId,
skipSegmentLineageCheck,
partialShardSpec
);
if (result.isOk()) {
return result.getNewSegmentId();
} else {
return null;
}
}
while (true) {
attempt++;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import org.apache.druid.indexing.common.task.IndexTaskUtils;
import org.apache.druid.indexing.common.task.Task;
import org.apache.druid.indexing.overlord.IndexerMetadataStorageCoordinator;
import org.apache.druid.indexing.overlord.Segments;
import org.apache.druid.indexing.overlord.TaskLockbox;
import org.apache.druid.indexing.overlord.config.TaskLockConfig;
import org.apache.druid.java.util.common.ISE;
Expand All @@ -41,6 +40,7 @@
import org.apache.druid.query.DruidMetrics;
import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.Partitions;
import org.joda.time.Interval;

import java.util.ArrayList;
Expand Down Expand Up @@ -70,7 +70,7 @@ public class SegmentAllocationQueue
private static final Logger log = new Logger(SegmentAllocationQueue.class);

private static final int MAX_QUEUE_SIZE = 2000;
private static final int MAX_BATCH_SIZE = 500;
private static final int MAX_BATCH_SIZE = 5;

private final long maxWaitTimeMillis;

Expand All @@ -87,6 +87,8 @@ public class SegmentAllocationQueue
private final ConcurrentHashMap<AllocateRequestKey, AllocateRequestBatch> keyToBatch = new ConcurrentHashMap<>();
private final BlockingDeque<AllocateRequestBatch> processingQueue = new LinkedBlockingDeque<>(MAX_QUEUE_SIZE);

private final boolean skipSegmentPayloadFetchForAllocation;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rename as suggested.


@Inject
public SegmentAllocationQueue(
TaskLockbox taskLockbox,
Expand All @@ -100,6 +102,7 @@ public SegmentAllocationQueue(
this.taskLockbox = taskLockbox;
this.metadataStorage = metadataStorage;
this.maxWaitTimeMillis = taskLockConfig.getBatchAllocationWaitTime();
this.skipSegmentPayloadFetchForAllocation = taskLockConfig.isSegmentAllocationReduceMetadataIO();

this.executor = taskLockConfig.isBatchSegmentAllocation()
? executorFactory.create(1, "SegmentAllocQueue-%s") : null;
Expand Down Expand Up @@ -380,13 +383,11 @@ private boolean processBatch(AllocateRequestBatch requestBatch)

private Set<DataSegment> retrieveUsedSegments(AllocateRequestKey key)
{
return new HashSet<>(
metadataStorage.retrieveUsedSegmentsForInterval(
key.dataSource,
key.preferredAllocationInterval,
Segments.ONLY_VISIBLE
)
);
return metadataStorage.getSegmentTimelineForAllocation(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this inefficient when skipSegmentPayloadFetchForAllocation is true? We get the segments from retrieveUsedSegmentsForAllocation, then create a timeline via SegmentTimeline.forSegments, and then get the segments back again via findNonOvershadowedObjectsInInterval. Why do we need to create a timeline at all?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean when it is false?
If it is true, the retrieval call simply returns all the used segment IDs, so we have to create a SegmentTimeline because we are interested only in the visible segment set.

key.dataSource,
key.preferredAllocationInterval,
(key.lockGranularity == LockGranularity.TIME_CHUNK) && skipSegmentPayloadFetchForAllocation
).findNonOvershadowedObjectsInInterval(Intervals.ETERNITY, Partitions.ONLY_COMPLETE);
}

private int allocateSegmentsForBatch(AllocateRequestBatch requestBatch, Set<DataSegment> usedSegments)
Expand Down Expand Up @@ -493,7 +494,8 @@ private int allocateSegmentsForInterval(
requestKey.dataSource,
tryInterval,
requestKey.skipSegmentLineageCheck,
requestKey.lockGranularity
requestKey.lockGranularity,
skipSegmentPayloadFetchForAllocation
);

int successfulRequests = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.druid.indexing.overlord.CriticalAction;
import org.apache.druid.indexing.overlord.DataSourceMetadata;
import org.apache.druid.indexing.overlord.SegmentPublishResult;
import org.apache.druid.indexing.overlord.TaskLockbox;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.segment.SegmentSchemaMapping;
import org.apache.druid.segment.SegmentUtils;
Expand Down Expand Up @@ -213,18 +214,25 @@ public SegmentPublishResult perform(Task task, TaskActionToolbox toolbox)
}
}

final TaskLockbox lockbox = toolbox.getTaskLockbox();
try {
retVal = toolbox.getTaskLockbox().doInCriticalSection(
retVal = lockbox.doInCriticalSection(
task,
allSegments.stream().map(DataSegment::getInterval).collect(Collectors.toSet()),
CriticalAction.<SegmentPublishResult>builder()
.onValidLocks(
() -> toolbox.getIndexerMetadataStorageCoordinator().commitSegmentsAndMetadata(
segments,
startMetadata,
endMetadata,
segmentSchemaMapping
)
() -> {
SegmentPublishResult result =
toolbox.getIndexerMetadataStorageCoordinator()
.commitSegmentsAndMetadata(
segments,
startMetadata,
endMetadata,
segmentSchemaMapping
);
lockbox.cacheSegmentPublishResults(task, result.getSegments());
return result;
}
)
.onInvalidLocks(
() -> SegmentPublishResult.fail(
Expand Down
Loading
Loading