From e69752c3824e73e7cd2302b112e02c5091a8f096 Mon Sep 17 00:00:00 2001
From: Yuye Zhu
Date: Wed, 17 Apr 2024 18:04:46 +0800
Subject: [PATCH] Test: bwc test for text chunking processor (#661)

* bwc test for text chunking processor
Signed-off-by: yuye-aws

* spotless apply
Signed-off-by: yuye-aws

* update changelog
Signed-off-by: yuye-aws

* spotless apply
Signed-off-by: yuye-aws

* add test document for restart upgrade
Signed-off-by: yuye-aws

* rename pipeline configuration file
Signed-off-by: yuye-aws

* fix pipeline create bug
Signed-off-by: yuye-aws

* fix pipeline create bug
Signed-off-by: yuye-aws

* filter tests for lower versions
Signed-off-by: yuye-aws

* index create in chunking bwc test
Signed-off-by: yuye-aws

* index create in chunking bwc test
Signed-off-by: yuye-aws

* index create in chunking bwc test
Signed-off-by: yuye-aws

* index validate in chunking bwc test
Signed-off-by: yuye-aws

* filter bwc test for lower version
Signed-off-by: yuye-aws

* bug fix in document ingestion in text chunking test
Signed-off-by: yuye-aws

* ensure index creation in text chunking bwc test
Signed-off-by: yuye-aws

* add comment
Signed-off-by: yuye-aws

* update index setting
Signed-off-by: yuye-aws

* update change log
Signed-off-by: yuye-aws

* update gradle comment format
Signed-off-by: yuye-aws

* update gradle file format
Signed-off-by: yuye-aws

* rename bwc test filename
Signed-off-by: yuye-aws

* update gradle file format
Signed-off-by: yuye-aws

* update gradle file to filter tests
Signed-off-by: yuye-aws

* merge method createPipelineProcessorWithoutModelId
Signed-off-by: yuye-aws

* text chunking processor it: create pipeline method rename
Signed-off-by: yuye-aws

* fix it failure
Signed-off-by: yuye-aws

* include index mapping for text chunking index setting
Signed-off-by: yuye-aws

* update nitpicking
Signed-off-by: yuye-aws

---------

Signed-off-by: yuye-aws
---
 CHANGELOG.md                                  |  1 +
 qa/restart-upgrade/build.gradle               | 18 +++-
 .../AbstractRestartUpgradeRestTestCase.java   |  9 ++
 .../bwc/TextChunkingProcessorIT.java          | 75 +++++++++++++++
 .../processor/ChunkingIndexSettings.json      | 17 ++++
 ...ForTextChunkingProcessorConfiguration.json | 18 ++++
 qa/rolling-upgrade/build.gradle               | 30 +++++-
 .../bwc/AbstractRollingUpgradeTestCase.java   |  9 ++
 .../bwc/NeuralSparseSearchIT.java             |  2 +-
 .../bwc/TextChunkingProcessorIT.java          | 93 +++++++++++++++++++
 .../processor/ChunkingIndexSettings.json      | 17 ++++
 ...rSparseEncodingProcessorConfiguration.json | 20 ++--
 ...ForTextChunkingProcessorConfiguration.json | 18 ++++
 .../processor/TextChunkingProcessorIT.java    | 20 +---
 .../opensearch/neuralsearch/TestUtils.java    |  1 +
 15 files changed, 317 insertions(+), 31 deletions(-)
 create mode 100644 qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
 create mode 100644 qa/restart-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
 create mode 100644 qa/restart-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json
 create mode 100644 qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
 create mode 100644 qa/rolling-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
 create mode 100644 qa/rolling-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5021a36d1..ed09e0836 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.13...2.x)
 ### Features
 ### Enhancements
+- BWC tests for text chunking processor ([#661](https://github.com/opensearch-project/neural-search/pull/661))
 - Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670))
 ### Bug Fixes
 - Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663))

diff --git a/qa/restart-upgrade/build.gradle b/qa/restart-upgrade/build.gradle
index 1a6d0a104..8fca43f3a 100644
--- a/qa/restart-upgrade/build.gradle
+++ b/qa/restart-upgrade/build.gradle
@@ -65,7 +65,7 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
     systemProperty 'tests.skip_delete_model_index', 'true'
     systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

-    //Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
+    // Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
     // because these features were released in 2.11 version.
     if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
         filter {
@@ -83,6 +83,13 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
         }
     }

+    // Excluding the text chunking processor test because this feature was introduced in 2.13
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'
@@ -107,7 +114,7 @@ task testAgainstNewCluster(type: StandaloneRestIntegTestTask) {
     systemProperty 'tests.is_old_cluster', 'false'
     systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

-    //Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
+    // Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
     // because these features were released in 2.11 version.
     if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
         filter {
@@ -125,6 +132,13 @@ task testAgainstNewCluster(type: StandaloneRestIntegTestTask) {
         }
     }

+    // Excluding the text chunking processor test because this feature was introduced in 2.13
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'

diff --git a/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java
index c2d2657f4..395573c6a 100644
--- a/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java
+++ b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java
@@ -4,9 +4,11 @@
  */
 package org.opensearch.neuralsearch.bwc;

+import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Locale;
+import java.util.Objects;
 import java.util.Optional;
 import org.junit.Before;
 import org.opensearch.common.settings.Settings;
@@ -99,4 +101,11 @@ protected void createPipelineForSparseEncodingProcessor(final String modelId, fi
         );
         createPipelineProcessor(requestBody, pipelineName, modelId);
     }
+
+    protected void createPipelineForTextChunkingProcessor(String pipelineName) throws Exception {
+        String requestBody = Files.readString(
+            Path.of(classLoader.getResource("processor/PipelineForTextChunkingProcessorConfiguration.json").toURI())
+        );
+        createPipelineProcessor(requestBody, pipelineName, "");
+    }
 }

diff --git a/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
new file mode 100644
index 000000000..20eb0d05c
--- /dev/null
+++ b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.bwc;
+
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import org.opensearch.index.query.MatchAllQueryBuilder;
+import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;
+
+public class TextChunkingProcessorIT extends AbstractRestartUpgradeRestTestCase {
+
+    private static final String PIPELINE_NAME = "pipeline-text-chunking";
+    private static final String INPUT_FIELD = "body";
+    private static final String OUTPUT_FIELD = "body_chunk";
+    private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json";
+    private static final String TEST_INGEST_TEXT =
+        "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
+    List<String> expectedPassages = List.of(
+        "This is an example document to be chunked. The document ",
+        "contains a single paragraph, two sentences and 24 tokens by ",
+        "standard tokenizer in OpenSearch."
+    );
+
+    // Test restart-upgrade text chunking processor
+    // Create Text Chunking Processor, Ingestion Pipeline and add document
+    // Validate process, pipeline and document count in restart-upgrade scenario
+    public void testTextChunkingProcessor_E2EFlow() throws Exception {
+        waitForClusterHealthGreen(NODES_BWC_CLUSTER);
+        String indexName = getIndexNameForTest();
+        if (isRunningAgainstOldCluster()) {
+            createPipelineForTextChunkingProcessor(PIPELINE_NAME);
+            createChunkingIndex(indexName);
+            addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
+            validateTestIndex(indexName, OUTPUT_FIELD, 1, expectedPassages);
+        } else {
+            try {
+                addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
+                validateTestIndex(indexName, OUTPUT_FIELD, 2, expectedPassages);
+            } finally {
+                wipeOfTestResources(indexName, PIPELINE_NAME, null, null);
+            }
+        }
+    }
+
+    private void createChunkingIndex(String indexName) throws Exception {
+        URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH);
+        Objects.requireNonNull(documentURLPath);
+        String indexSetting = Files.readString(Path.of(documentURLPath.toURI()));
+        createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME);
+    }
+
+    private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) {
+        int docCount = getDocCount(indexName);
+        assertEquals(documentCount, docCount);
+        MatchAllQueryBuilder query = new MatchAllQueryBuilder();
+        Map<String, Object> searchResults = search(indexName, query, 10);
+        assertNotNull(searchResults);
+        Map<String, Object> document = getFirstInnerHit(searchResults);
+        assertNotNull(document);
+        Object documentSource = document.get("_source");
+        assert (documentSource instanceof Map);
+        @SuppressWarnings("unchecked")
+        Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
+        assert (documentSourceMap).containsKey(fieldName);
+        Object ingestOutputs = documentSourceMap.get(fieldName);
+        assertEquals(expected, ingestOutputs);
+    }
+}

diff --git a/qa/restart-upgrade/src/test/resources/processor/ChunkingIndexSettings.json b/qa/restart-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
new file mode 100644
index 000000000..956ffc585
--- /dev/null
+++ b/qa/restart-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
@@ -0,0 +1,17 @@
+{
+  "settings":{
+    "default_pipeline": "%s",
+    "number_of_shards": 3,
+    "number_of_replicas": 1
+  },
+  "mappings": {
+    "properties": {
+      "body": {
+        "type": "text"
+      },
+      "body_chunk": {
+        "type": "text"
+      }
+    }
+  }
+}

diff --git a/qa/restart-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json b/qa/restart-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json
new file mode 100644
index 000000000..6c727b3b4
--- /dev/null
+++ b/qa/restart-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json
@@ -0,0 +1,18 @@
+{
+  "description": "An example fixed token length chunker pipeline with standard tokenizer",
+  "processors" : [
+    {
+      "text_chunking": {
+        "field_map": {
+          "body": "body_chunk"
+        },
+        "algorithm": {
+          "fixed_token_length": {
+            "token_limit": 10,
+            "tokenizer": "standard"
+          }
+        }
+      }
+    }
+  ]
+}

diff --git a/qa/rolling-upgrade/build.gradle b/qa/rolling-upgrade/build.gradle
index 591e83d58..eedea2d2d 100644
--- a/qa/rolling-upgrade/build.gradle
+++ b/qa/rolling-upgrade/build.gradle
@@ -83,6 +83,13 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
         }
     }

+    // Excluding the text chunking processor test because this feature was introduced in 2.13
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'
@@ -126,6 +133,13 @@ task testAgainstOneThirdUpgradedCluster(type: StandaloneRestIntegTestTask) {
         }
     }

+    // Excluding the text chunking processor test because this feature was introduced in 2.13
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'
@@ -150,7 +164,7 @@ task testAgainstTwoThirdsUpgradedCluster(type: StandaloneRestIntegTestTask) {
     systemProperty 'tests.skip_delete_model_index', 'true'
     systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

-    //Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
+    // Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
     // because these features were released in 2.11 version.
     if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
         filter {
@@ -168,6 +182,13 @@ task testAgainstTwoThirdsUpgradedCluster(type: StandaloneRestIntegTestTask) {
         }
     }

+    // Excluding the text chunking processor test because this feature was introduced in 2.13
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'
@@ -210,6 +231,13 @@ task testRollingUpgrade(type: StandaloneRestIntegTestTask) {
         }
     }

+    // Excluding the text chunking processor test because this feature was introduced in 2.13
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'

diff --git a/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRollingUpgradeTestCase.java b/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRollingUpgradeTestCase.java
index 16ed2d229..ed1613e2f 100644
--- a/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRollingUpgradeTestCase.java
+++ b/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRollingUpgradeTestCase.java
@@ -4,9 +4,11 @@
  */
 package org.opensearch.neuralsearch.bwc;

+import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Locale;
+import java.util.Objects;
 import java.util.Optional;
 import org.junit.Before;
 import org.opensearch.common.settings.Settings;
@@ -130,4 +132,11 @@ protected void createPipelineForSparseEncodingProcessor(String modelId, String p
         );
         createPipelineProcessor(requestBody, pipelineName, modelId);
     }
+
+    protected void createPipelineForTextChunkingProcessor(String pipelineName) throws Exception {
+        String requestBody = Files.readString(
+            Path.of(classLoader.getResource("processor/PipelineForTextChunkingProcessorConfiguration.json").toURI())
+        );
+        createPipelineProcessor(requestBody, pipelineName, "");
+    }
 }

diff --git a/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/NeuralSparseSearchIT.java b/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/NeuralSparseSearchIT.java
index 70513686b..d0f13c766 100644
--- a/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/NeuralSparseSearchIT.java
+++ b/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/NeuralSparseSearchIT.java
@@ -36,7 +36,7 @@ public class NeuralSparseSearchIT extends AbstractRollingUpgradeTestCase {

     // Test rolling-upgrade test sparse embedding processor
     // Create Sparse Encoding Processor, Ingestion Pipeline and add document
-    // Validate process , pipeline and document count in restart-upgrade scenario
+    // Validate process, pipeline and document count in rolling-upgrade scenario
     public void testSparseEncodingProcessor_E2EFlow() throws Exception {
         waitForClusterHealthGreen(NODES_BWC_CLUSTER);
         switch (getClusterType()) {

diff --git a/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java b/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
new file mode 100644
index 000000000..ed869c876
--- /dev/null
+++ b/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.bwc;
+
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import org.opensearch.index.query.MatchAllQueryBuilder;
+import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;
+
+public class TextChunkingProcessorIT extends AbstractRollingUpgradeTestCase {
+
+    private static final String PIPELINE_NAME = "pipeline-text-chunking";
+    private static final String INPUT_FIELD = "body";
+    private static final String OUTPUT_FIELD = "body_chunk";
+    private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json";
+    private static final int NUM_DOCS_PER_ROUND = 1;
+    private static final String TEST_INGEST_TEXT =
+        "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
+
+    List<String> expectedPassages = List.of(
+        "This is an example document to be chunked. The document ",
+        "contains a single paragraph, two sentences and 24 tokens by ",
+        "standard tokenizer in OpenSearch."
+    );
+
+    // Test rolling-upgrade text chunking processor
+    // Create Text Chunking Processor, Ingestion Pipeline and add document
+    // Validate process, pipeline and document count in rolling-upgrade scenario
+    public void testTextChunkingProcessor_E2EFlow() throws Exception {
+        waitForClusterHealthGreen(NODES_BWC_CLUSTER);
+        String indexName = getIndexNameForTest();
+        switch (getClusterType()) {
+            case OLD:
+                createPipelineForTextChunkingProcessor(PIPELINE_NAME);
+                createChunkingIndex(indexName);
+                addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
+                break;
+            case MIXED:
+                int totalDocsCountMixed;
+                if (isFirstMixedRound()) {
+                    totalDocsCountMixed = NUM_DOCS_PER_ROUND;
+                    validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages);
+                    addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
+                } else {
+                    totalDocsCountMixed = 2 * NUM_DOCS_PER_ROUND;
+                    validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages);
+                }
+                break;
+            case UPGRADED:
+                try {
+                    int totalDocsCountUpgraded = 3 * NUM_DOCS_PER_ROUND;
+                    addDocument(indexName, "2", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
+                    validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountUpgraded, expectedPassages);
+                } finally {
+                    wipeOfTestResources(indexName, PIPELINE_NAME, null, null);
+                }
+                break;
+            default:
+                throw new IllegalStateException("Unexpected value: " + getClusterType());
+        }
+    }
+
+    private void createChunkingIndex(String indexName) throws Exception {
+        URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH);
+        Objects.requireNonNull(documentURLPath);
+        String indexSetting = Files.readString(Path.of(documentURLPath.toURI()));
+        createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME);
+    }
+
+    private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) {
+        int docCount = getDocCount(indexName);
+        assertEquals(documentCount, docCount);
+        MatchAllQueryBuilder query = new MatchAllQueryBuilder();
+        Map<String, Object> searchResults = search(indexName, query, 10);
+        assertNotNull(searchResults);
+        Map<String, Object> document = getFirstInnerHit(searchResults);
+        assertNotNull(document);
+        Object documentSource = document.get("_source");
+        assert (documentSource instanceof Map);
+        @SuppressWarnings("unchecked")
+        Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
+        assert (documentSourceMap).containsKey(fieldName);
+        Object ingestOutputs = documentSourceMap.get(fieldName);
+        assertEquals(expected, ingestOutputs);
+    }
+}

diff --git a/qa/rolling-upgrade/src/test/resources/processor/ChunkingIndexSettings.json b/qa/rolling-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
new file mode 100644
index 000000000..956ffc585
--- /dev/null
+++ b/qa/rolling-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
@@ -0,0 +1,17 @@
+{
+  "settings":{
+    "default_pipeline": "%s",
+    "number_of_shards": 3,
+    "number_of_replicas": 1
+  },
+  "mappings": {
+    "properties": {
+      "body": {
+        "type": "text"
+      },
+      "body_chunk": {
+        "type": "text"
+      }
+    }
+  }
+}

diff --git a/qa/rolling-upgrade/src/test/resources/processor/PipelineForSparseEncodingProcessorConfiguration.json b/qa/rolling-upgrade/src/test/resources/processor/PipelineForSparseEncodingProcessorConfiguration.json
index d9a358c24..fe885a0a2 100644
--- a/qa/rolling-upgrade/src/test/resources/processor/PipelineForSparseEncodingProcessorConfiguration.json
+++ b/qa/rolling-upgrade/src/test/resources/processor/PipelineForSparseEncodingProcessorConfiguration.json
@@ -1,13 +1,13 @@
 {
-    "description": "An sparse encoding ingest pipeline",
-    "processors": [
-      {
-        "sparse_encoding": {
-          "model_id": "%s",
-          "field_map": {
-            "passage_text": "passage_embedding"
-          }
+  "description": "A sparse encoding ingest pipeline",
+  "processors": [
+    {
+      "sparse_encoding": {
+        "model_id": "%s",
+        "field_map": {
+          "passage_text": "passage_embedding"
         }
       }
-    ]
-  }
+    }
+  ]
+}

diff --git a/qa/rolling-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json b/qa/rolling-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json
new file mode 100644
index 000000000..6c727b3b4
--- /dev/null
+++ b/qa/rolling-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json
@@ -0,0 +1,18 @@
+{
+  "description": "An example fixed token length chunker pipeline with standard tokenizer",
+  "processors" : [
+    {
+      "text_chunking": {
+        "field_map": {
+          "body": "body_chunk"
+        },
+        "algorithm": {
+          "fixed_token_length": {
+            "token_limit": 10,
+            "tokenizer": "standard"
+          }
+        }
+      }
+    }
+  ]
+}

diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java
index dd517aa17..d85865bb5 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java
@@ -24,7 +24,6 @@
 import org.opensearch.common.xcontent.XContentType;
 import org.opensearch.index.query.MatchAllQueryBuilder;
 import org.opensearch.neuralsearch.BaseNeuralSearchIT;
-import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT;

 public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
     private static final String INDEX_NAME = "text_chunking_test_index";
@@ -197,20 +196,7 @@ private void createPipelineProcessor(String pipelineName) throws Exception {
         URL pipelineURLPath = classLoader.getResource(PIPELINE_CONFIGS_BY_NAME.get(pipelineName));
         Objects.requireNonNull(pipelineURLPath);
         String requestBody = Files.readString(Path.of(pipelineURLPath.toURI()));
-        Response pipelineCreateResponse = makeRequest(
-            client(),
-            "PUT",
-            "/_ingest/pipeline/" + pipelineName,
-            null,
-            toHttpEntity(String.format(LOCALE, requestBody)),
-            ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, DEFAULT_USER_AGENT))
-        );
-        Map<String, Object> node = XContentHelper.convertToMap(
-            XContentType.JSON.xContent(),
-            EntityUtils.toString(pipelineCreateResponse.getEntity()),
-            false
-        );
-        assertEquals("true", node.get("acknowledged").toString());
+        createPipelineProcessor(requestBody, pipelineName, "");
     }

     private void createTextChunkingIndex(String indexName, String pipelineName) throws Exception {
@@ -222,13 +208,13 @@ private void createTextChunkingIndex(String indexName, String pipelineName) thro
     private void ingestDocument(String documentPath) throws Exception {
         URL documentURLPath = classLoader.getResource(documentPath);
         Objects.requireNonNull(documentURLPath);
-        String ingestDocument = Files.readString(Path.of(documentURLPath.toURI()));
+        String document = Files.readString(Path.of(documentURLPath.toURI()));
         Response response = makeRequest(
             client(),
             "POST",
             INDEX_NAME + "/_doc?refresh",
             null,
-            toHttpEntity(ingestDocument),
+            toHttpEntity(document),
             ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, "Kibana"))
         );
         Map<String, Object> map = XContentHelper.convertToMap(

diff --git a/src/testFixtures/java/org/opensearch/neuralsearch/TestUtils.java b/src/testFixtures/java/org/opensearch/neuralsearch/TestUtils.java
index a6f4a3e0f..14efca5f7 100644
--- a/src/testFixtures/java/org/opensearch/neuralsearch/TestUtils.java
+++ b/src/testFixtures/java/org/opensearch/neuralsearch/TestUtils.java
@@ -62,6 +62,7 @@ public class TestUtils {
     public static final String DEFAULT_COMBINATION_METHOD = "arithmetic_mean";
     public static final String PARAM_NAME_WEIGHTS = "weights";
     public static final String SPARSE_ENCODING_PROCESSOR = "sparse_encoding";
+    public static final String TEXT_CHUNKING_PROCESSOR = "text_chunking";
     public static final int MAX_TIME_OUT_INTERVAL = 3000;
     public static final int MAX_RETRY = 5;
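
Reviewer note: for anyone who wants to see what these BWC tests assert without standing up an upgrade cluster, the request body below can be POSTed to the _ingest/pipeline/_simulate endpoint of any single 2.13+ node. This is a minimal sketch for manual verification only; the index name "text-chunking-bwc-test" is hypothetical (the tests derive theirs from getIndexNameForTest()), while the pipeline definition and document body are copied from PipelineForTextChunkingProcessorConfiguration.json and TEST_INGEST_TEXT in this patch. With token_limit 10 and the standard tokenizer, the 24-token body should come back in body_chunk as the three passages listed in expectedPassages (10 + 10 + 4 tokens).

POST /_ingest/pipeline/_simulate
{
  "pipeline": {
    "description": "An example fixed token length chunker pipeline with standard tokenizer",
    "processors": [
      {
        "text_chunking": {
          "field_map": {
            "body": "body_chunk"
          },
          "algorithm": {
            "fixed_token_length": {
              "token_limit": 10,
              "tokenizer": "standard"
            }
          }
        }
      }
    ]
  },
  "docs": [
    {
      "_index": "text-chunking-bwc-test",
      "_id": "0",
      "_source": {
        "body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
      }
    }
  ]
}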