From 8e493f313eba21ce9946f2604c214bb66840d7f3 Mon Sep 17 00:00:00 2001 From: Liyun Xiu Date: Fri, 28 Jun 2024 12:02:49 -0700 Subject: [PATCH] Add batching processor base type AbstractBatchingProcessor (#14554) Signed-off-by: Liyun Xiu --- CHANGELOG.md | 1 + .../ingest/AbstractBatchingProcessor.java | 136 +++++++++++++++ .../AbstractBatchingProcessorTests.java | 160 ++++++++++++++++++ 3 files changed, 297 insertions(+) create mode 100644 server/src/main/java/org/opensearch/ingest/AbstractBatchingProcessor.java create mode 100644 server/src/test/java/org/opensearch/ingest/AbstractBatchingProcessorTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index c6b2d815750f9..8835032785430 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [Remote Store] Rate limiter for remote store low priority uploads ([#14374](https://github.com/opensearch-project/OpenSearch/pull/14374/)) - Apply the date histogram rewrite optimization to range aggregation ([#13865](https://github.com/opensearch-project/OpenSearch/pull/13865)) - [Writable Warm] Add composite directory implementation and integrate it with FileCache ([12782](https://github.com/opensearch-project/OpenSearch/pull/12782)) +- Add batching supported processor base type AbstractBatchingProcessor ([#14554](https://github.com/opensearch-project/OpenSearch/pull/14554)) - Fix race condition while parsing derived fields from search definition ([14445](https://github.com/opensearch-project/OpenSearch/pull/14445)) - Add allowlist setting for ingest-common and search-pipeline-common processors ([#14439](https://github.com/opensearch-project/OpenSearch/issues/14439)) diff --git a/server/src/main/java/org/opensearch/ingest/AbstractBatchingProcessor.java b/server/src/main/java/org/opensearch/ingest/AbstractBatchingProcessor.java new file mode 100644 index 0000000000000..55413b9bbdad1 --- /dev/null +++ 
b/server/src/main/java/org/opensearch/ingest/AbstractBatchingProcessor.java
@@ -0,0 +1,166 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.ingest;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Consumer;
+
+import static org.opensearch.ingest.ConfigurationUtils.newConfigurationException;
+
+/**
+ * Abstract base class for batch processors. {@link #batchExecute(List, Consumer)} cuts the incoming documents into
+ * sub-batches of at most {@code batchSize} documents and hands each one to {@link #subBatchExecute(List, Consumer)}.
+ *
+ * @opensearch.internal
+ */
+public abstract class AbstractBatchingProcessor extends AbstractProcessor {
+
+    public static final String BATCH_SIZE_FIELD = "batch_size";
+    private static final int DEFAULT_BATCH_SIZE = 1;
+    protected final int batchSize;
+
+    /**
+     * Creates a batching processor.
+     *
+     * @param tag tag of the processor
+     * @param description description of the processor
+     * @param batchSize maximum number of documents per sub-batch, must be at least 1
+     * @throws IllegalArgumentException if {@code batchSize} is smaller than 1
+     */
+    protected AbstractBatchingProcessor(String tag, String description, int batchSize) {
+        super(tag, description);
+        // Subclasses can call this constructor without going through Factory#create, so validate here as well:
+        // a batch size of zero would make cutBatches loop forever.
+        if (batchSize < 1) {
+            throw new IllegalArgumentException("batch size must be a positive integer, got [" + batchSize + "]");
+        }
+        this.batchSize = batchSize;
+    }
+
+    /**
+     * Internal logic to process batched documents, must be implemented by concrete batch processors.
+     *
+     * @param ingestDocumentWrappers {@link List} of {@link IngestDocumentWrapper} to be processed.
+     * @param handler {@link Consumer} to be called with the results of the processing.
+     */
+    protected abstract void subBatchExecute(
+        List<IngestDocumentWrapper> ingestDocumentWrappers,
+        Consumer<List<IngestDocumentWrapper>> handler
+    );
+
+    /**
+     * Cuts the documents into sub-batches of at most {@code batchSize} documents, passes each sub-batch to
+     * {@link #subBatchExecute(List, Consumer)} and calls {@code handler} once with the collected results.
+     * An empty input is answered with an empty list; input that fits into one batch is forwarded unchanged.
+     *
+     * @param ingestDocumentWrappers documents to be processed
+     * @param handler called with the results once as many results as input documents have been reported
+     */
+    @Override
+    public void batchExecute(List<IngestDocumentWrapper> ingestDocumentWrappers, Consumer<List<IngestDocumentWrapper>> handler) {
+        if (ingestDocumentWrappers.isEmpty()) {
+            handler.accept(Collections.emptyList());
+            return;
+        }
+
+        // if batch size is larger than document size, send one batch
+        if (this.batchSize >= ingestDocumentWrappers.size()) {
+            subBatchExecute(ingestDocumentWrappers, handler);
+            return;
+        }
+
+        // split documents into multiple batches and send each batch to batch processors
+        List<List<IngestDocumentWrapper>> batches = cutBatches(ingestDocumentWrappers);
+        // number of documents whose results are still outstanding
+        AtomicInteger counter = new AtomicInteger(ingestDocumentWrappers.size());
+        // synchronized list: sub-batch callbacks are presumably allowed to run on different threads
+        List<IngestDocumentWrapper> allResults = Collections.synchronizedList(new ArrayList<>());
+        for (List<IngestDocumentWrapper> batch : batches) {
+            this.subBatchExecute(batch, batchResults -> {
+                allResults.addAll(batchResults);
+                // the callback that accounts for the last outstanding document completes the whole request
+                if (counter.addAndGet(-batchResults.size()) == 0) {
+                    handler.accept(allResults);
+                }
+                assert counter.get() >= 0 : "counter is negative";
+            });
+        }
+    }
+
+    /**
+     * Cuts the list into consecutive sub-lists of {@code batchSize} elements; the last one may be shorter.
+     * The sub-lists are {@link List#subList(int, int)} views of the input list.
+     */
+    private List<List<IngestDocumentWrapper>> cutBatches(List<IngestDocumentWrapper> ingestDocumentWrappers) {
+        List<List<IngestDocumentWrapper>> batches = new ArrayList<>();
+        for (int i = 0; i < ingestDocumentWrappers.size(); i += this.batchSize) {
+            batches.add(ingestDocumentWrappers.subList(i, Math.min(i + this.batchSize, ingestDocumentWrappers.size())));
+        }
+        return batches;
+    }
+
+    /**
+     * Factory class for creating {@link AbstractBatchingProcessor} instances.
+     *
+     * @opensearch.internal
+     */
+    public abstract static class Factory implements Processor.Factory {
+        final String processorType;
+
+        protected Factory(String processorType) {
+            this.processorType = processorType;
+        }
+
+        /**
+         * Creates a new processor instance. Reads the {@code batch_size} setting from the configuration, rejects
+         * values smaller than 1 and delegates the actual construction to
+         * {@link #newProcessor(String, String, int, Map)}.
+         *
+         * @param processorFactories The processor factories.
+         * @param tag The processor tag.
+         * @param description The processor description.
+         * @param config The processor configuration.
+         * @return The new AbstractBatchingProcessor instance.
+         * @throws Exception If the processor could not be created.
+         */
+        @Override
+        public AbstractBatchingProcessor create(
+            Map<String, Processor.Factory> processorFactories,
+            String tag,
+            String description,
+            Map<String, Object> config
+        ) throws Exception {
+            int batchSize = ConfigurationUtils.readIntProperty(this.processorType, tag, config, BATCH_SIZE_FIELD, DEFAULT_BATCH_SIZE);
+            if (batchSize < 1) {
+                throw newConfigurationException(this.processorType, tag, BATCH_SIZE_FIELD, "batch size must be a positive integer");
+            }
+            return newProcessor(tag, description, batchSize, config);
+        }
+
+        /**
+         * Returns a new processor instance.
+         *
+         * @param tag tag of the processor
+         * @param description description of the processor
+         * @param batchSize batch size of the processor
+         * @param config configuration of the processor
+         * @return a new batch processor instance
+         */
+        protected abstract AbstractBatchingProcessor newProcessor(
+            String tag,
+            String description,
+            int batchSize,
+            Map<String, Object> config
+        );
+    }
+}
diff --git a/server/src/test/java/org/opensearch/ingest/AbstractBatchingProcessorTests.java b/server/src/test/java/org/opensearch/ingest/AbstractBatchingProcessorTests.java
new file mode 100644
index 0000000000000..54fc30cb5befa
--- /dev/null
+++ b/server/src/test/java/org/opensearch/ingest/AbstractBatchingProcessorTests.java
@@ -0,0 +1,160 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.ingest;
+
+import org.opensearch.OpenSearchParseException;
+import org.opensearch.test.OpenSearchTestCase;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Consumer;
+
+/** Tests for batch splitting in {@link AbstractBatchingProcessor} and for batch size handling in its factory. */
+public class AbstractBatchingProcessorTests extends OpenSearchTestCase {
+    /** Empty input is answered with an empty result list and no sub-batch is dispatched. */
+    public void testBatchExecute_emptyInput() {
+        DummyProcessor processor = new DummyProcessor(3);
+        Consumer<List<IngestDocumentWrapper>> handler = (results) -> assertTrue(results.isEmpty());
+        processor.batchExecute(Collections.emptyList(), handler);
+        assertTrue(processor.getSubBatches().isEmpty());
+    }
+
+    /** Input that is no larger than the batch size is forwarded as a single sub-batch. */
+    public void testBatchExecute_singleBatchSize() {
+        DummyProcessor processor = new DummyProcessor(3);
+        List<IngestDocumentWrapper> wrapperList = createWrappers(1, 3);
+        List<IngestDocumentWrapper> resultList = new ArrayList<>();
+        processor.batchExecute(wrapperList, resultList::addAll);
+        assertEquals(wrapperList, resultList);
+        assertEquals(1, processor.getSubBatches().size());
+        assertEquals(wrapperList, processor.getSubBatches().get(0));
+    }
+
+    /** Five documents with a batch size of two are cut into sub-batches of two, two and one documents. */
+    public void testBatchExecute_multipleBatches() {
+        DummyProcessor processor = new DummyProcessor(2);
+        List<IngestDocumentWrapper> wrapperList = createWrappers(1, 5);
+        List<IngestDocumentWrapper> resultList = new ArrayList<>();
+        processor.batchExecute(wrapperList, resultList::addAll);
+        assertEquals(wrapperList, resultList);
+        assertEquals(3, processor.getSubBatches().size());
+        assertEquals(wrapperList.subList(0, 2), processor.getSubBatches().get(0));
+        assertEquals(wrapperList.subList(2, 4), processor.getSubBatches().get(1));
+        assertEquals(wrapperList.subList(4, 5), processor.getSubBatches().get(2));
+    }
+
+    /** For random sizes the number of sub-batches equals docCount / batchSize, rounded up. */
+    public void testBatchExecute_randomBatches() {
+        int batchSize = randomIntBetween(2, 32);
+        int docCount = randomIntBetween(2, 32);
+        DummyProcessor processor = new DummyProcessor(batchSize);
+        List<IngestDocumentWrapper> wrapperList = createWrappers(0, docCount);
+        List<IngestDocumentWrapper> resultList = new ArrayList<>();
+        processor.batchExecute(wrapperList, resultList::addAll);
+        assertEquals(wrapperList, resultList);
+        assertEquals(docCount / batchSize + (docCount % batchSize == 0 ? 0 : 1), processor.getSubBatches().size());
+    }
+
+    /** With a batch size of one every document is dispatched in a sub-batch of its own. */
+    public void testBatchExecute_defaultBatchSize() {
+        DummyProcessor processor = new DummyProcessor(1);
+        List<IngestDocumentWrapper> wrapperList = createWrappers(1, 3);
+        List<IngestDocumentWrapper> resultList = new ArrayList<>();
+        processor.batchExecute(wrapperList, resultList::addAll);
+        assertEquals(wrapperList, resultList);
+        assertEquals(3, processor.getSubBatches().size());
+        assertEquals(wrapperList.subList(0, 1), processor.getSubBatches().get(0));
+        assertEquals(wrapperList.subList(1, 2), processor.getSubBatches().get(1));
+        assertEquals(wrapperList.subList(2, 3), processor.getSubBatches().get(2));
+    }
+
+    /** A {@code batch_size} of zero is rejected by the factory with a parse exception. */
+    public void testFactory_invalidBatchSize() {
+        Map<String, Object> config = new HashMap<>();
+        config.put("batch_size", 0);
+        DummyProcessor.DummyProcessorFactory factory = new DummyProcessor.DummyProcessorFactory("DummyProcessor");
+        OpenSearchParseException exception = assertThrows(OpenSearchParseException.class, () -> factory.create(config));
+        assertEquals("[batch_size] batch size must be a positive integer", exception.getMessage());
+    }
+
+    /** Without {@code batch_size} in the configuration the created processor has a batch size of one. */
+    public void testFactory_defaultBatchSize() throws Exception {
+        Map<String, Object> config = new HashMap<>();
+        DummyProcessor.DummyProcessorFactory factory = new DummyProcessor.DummyProcessorFactory("DummyProcessor");
+        DummyProcessor processor = (DummyProcessor) factory.create(config);
+        assertEquals(1, processor.batchSize);
+    }
+
+    /** A configured {@code batch_size} reaches the processor built by {@code newProcessor}. */
+    public void testFactory_callNewProcessor() throws Exception {
+        Map<String, Object> config = new HashMap<>();
+        config.put("batch_size", 3);
+        DummyProcessor.DummyProcessorFactory factory = new DummyProcessor.DummyProcessorFactory("DummyProcessor");
+        DummyProcessor processor = (DummyProcessor) factory.create(config);
+        assertEquals(3, processor.batchSize);
+    }
+
+    /** Returns a fixed-size list of {@code count} wrappers created for the ints {@code start} to {@code start + count - 1}. */
+    private static List<IngestDocumentWrapper> createWrappers(int start, int count) {
+        IngestDocumentWrapper[] wrappers = new IngestDocumentWrapper[count];
+        for (int i = 0; i < count; ++i) {
+            wrappers[i] = IngestDocumentPreparer.createIngestDocumentWrapper(start + i);
+        }
+        return Arrays.asList(wrappers);
+    }
+
+    /** Pass-through processor that records every sub-batch it receives and returns it unchanged. */
+    static class DummyProcessor extends AbstractBatchingProcessor {
+        private List<List<IngestDocumentWrapper>> subBatches = new ArrayList<>();
+
+        public List<List<IngestDocumentWrapper>> getSubBatches() {
+            return subBatches;
+        }
+
+        protected DummyProcessor(int batchSize) {
+            super("tag", "description", batchSize);
+        }
+
+        @Override
+        public void subBatchExecute(List<IngestDocumentWrapper> ingestDocumentWrappers, Consumer<List<IngestDocumentWrapper>> handler) {
+            subBatches.add(ingestDocumentWrappers);
+            handler.accept(ingestDocumentWrappers);
+        }
+
+        @Override
+        public IngestDocument execute(IngestDocument ingestDocument) throws Exception {
+            return ingestDocument;
+        }
+
+        @Override
+        public String getType() {
+            return null;
+        }
+
+        public static class DummyProcessorFactory extends Factory {
+            protected DummyProcessorFactory(String processorType) {
+                super(processorType);
+            }
+
+            public AbstractBatchingProcessor create(Map<String, Object> config) throws Exception {
+                final Map<String, Processor.Factory> processorFactories = new HashMap<>();
+                return super.create(processorFactories, "tag", "description", config);
+            }
+
+            @Override
+            protected AbstractBatchingProcessor newProcessor(String tag, String description, int batchSize, Map<String, Object> config) {
+                return new DummyProcessor(batchSize);
+            }
+        }
+    }
+}