[Backport 2.x] Added Score Normalization and Combination feature, manual backport (#263)

* Added Score Normalization and Combination feature (#241)

Signed-off-by: Martin Gaievski <gaievski@amazon.com>
opensearch-project · Aug 25, 2023 · 185050a · 185050a
1 parent adf3925
commit 185050a
Show file tree

Hide file tree

Showing 58 changed files with 8,556 additions and 25 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.7...2.x)
 ### Features
+* Added Score Normalization and Combination feature ([#241](https://github.com/opensearch-project/neural-search/pull/241/))
 ### Enhancements
 ### Bug Fixes
 ### Infrastructure

diff --git a/build.gradle b/build.gradle
@@ -253,6 +253,10 @@ testClusters.integTest {
     // Increase heap size from default of 512mb to 1gb. When heap size is 512mb, our integ tests sporadically fail due
     // to ml-commons memory circuit breaker exception
     jvmArgs("-Xms1g", "-Xmx1g")
+
+    // enable features for testing
+    // hybrid search
+    systemProperty('plugins.neural_search.hybrid_search_enabled', 'true')
 }
 
 // Remote Integration Tests

diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
@@ -5,41 +5,64 @@
 
 package org.opensearch.neuralsearch.plugin;
 
+import static org.opensearch.neuralsearch.settings.NeuralSearchSettings.NEURAL_SEARCH_HYBRID_SEARCH_ENABLED;
+
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.function.Supplier;
 
+import lombok.extern.log4j.Log4j2;
+
 import org.opensearch.client.Client;
 import org.opensearch.cluster.metadata.IndexNameExpressionResolver;
 import org.opensearch.cluster.service.ClusterService;
+import org.opensearch.common.settings.Setting;
+import org.opensearch.common.util.FeatureFlags;
 import org.opensearch.core.common.io.stream.NamedWriteableRegistry;
 import org.opensearch.core.xcontent.NamedXContentRegistry;
 import org.opensearch.env.Environment;
 import org.opensearch.env.NodeEnvironment;
 import org.opensearch.ingest.Processor;
 import org.opensearch.ml.client.MachineLearningNodeClient;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
+import org.opensearch.neuralsearch.processor.NormalizationProcessor;
+import org.opensearch.neuralsearch.processor.NormalizationProcessorWorkflow;
 import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor;
+import org.opensearch.neuralsearch.processor.combination.ScoreCombinationFactory;
+import org.opensearch.neuralsearch.processor.combination.ScoreCombiner;
+import org.opensearch.neuralsearch.processor.factory.NormalizationProcessorFactory;
 import org.opensearch.neuralsearch.processor.factory.TextEmbeddingProcessorFactory;
+import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizationFactory;
+import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizer;
+import org.opensearch.neuralsearch.query.HybridQueryBuilder;
 import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
+import org.opensearch.neuralsearch.search.query.HybridQueryPhaseSearcher;
 import org.opensearch.plugins.ActionPlugin;
 import org.opensearch.plugins.ExtensiblePlugin;
 import org.opensearch.plugins.IngestPlugin;
 import org.opensearch.plugins.Plugin;
+import org.opensearch.plugins.SearchPipelinePlugin;
 import org.opensearch.plugins.SearchPlugin;
 import org.opensearch.repositories.RepositoriesService;
 import org.opensearch.script.ScriptService;
+import org.opensearch.search.pipeline.SearchPhaseResultsProcessor;
+import org.opensearch.search.query.QueryPhaseSearcher;
 import org.opensearch.threadpool.ThreadPool;
 import org.opensearch.watcher.ResourceWatcherService;
 
 /**
  * Neural Search plugin class
  */
-public class NeuralSearch extends Plugin implements ActionPlugin, SearchPlugin, IngestPlugin, ExtensiblePlugin {
-
+@Log4j2
+public class NeuralSearch extends Plugin implements ActionPlugin, SearchPlugin, IngestPlugin, ExtensiblePlugin, SearchPipelinePlugin {
     private MLCommonsClientAccessor clientAccessor;
+    // created in createComponents; handed to the normalization processor factory
+    private NormalizationProcessorWorkflow normalizationProcessorWorkflow;
+    // factories passed to NormalizationProcessorFactory when search phase results processors are registered
+    private final ScoreNormalizationFactory scoreNormalizationFactory = new ScoreNormalizationFactory();
+    private final ScoreCombinationFactory scoreCombinationFactory = new ScoreCombinationFactory();
 
     @Override
     public Collection<Object> createComponents(
@@ -56,12 +79,15 @@ public Collection<Object> createComponents(
         final Supplier<RepositoriesService> repositoriesServiceSupplier
     ) {
         NeuralQueryBuilder.initialize(clientAccessor);
+        normalizationProcessorWorkflow = new NormalizationProcessorWorkflow(new ScoreNormalizer(), new ScoreCombiner());
         return List.of(clientAccessor);
     }
 
+    /**
+     * Registers the query types contributed by this plugin: the neural query and the hybrid query.
+     *
+     * @return list of {@link QuerySpec} entries, one per query builder
+     */
+    @Override
     public List<QuerySpec<?>> getQueries() {
-        return Collections.singletonList(
-            new QuerySpec<>(NeuralQueryBuilder.NAME, NeuralQueryBuilder::new, NeuralQueryBuilder::fromXContent)
+        return Arrays.asList(
+            new QuerySpec<>(NeuralQueryBuilder.NAME, NeuralQueryBuilder::new, NeuralQueryBuilder::fromXContent),
+            new QuerySpec<>(HybridQueryBuilder.NAME, HybridQueryBuilder::new, HybridQueryBuilder::fromXContent)
         );
     }
 
@@ -70,4 +96,33 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
         clientAccessor = new MLCommonsClientAccessor(new MachineLearningNodeClient(parameters.client));
         return Collections.singletonMap(TextEmbeddingProcessor.TYPE, new TextEmbeddingProcessorFactory(clientAccessor, parameters.env));
     }
+
+    /**
+     * Provides the hybrid query phase searcher, but only when the hybrid search feature flag is enabled.
+     *
+     * @return {@link Optional} holding a new {@link HybridQueryPhaseSearcher} when the flag is enabled,
+     *         an empty {@link Optional} otherwise
+     */
+    @Override
+    public Optional<QueryPhaseSearcher> getQueryPhaseSearcher() {
+        final String featureFlagKey = NEURAL_SEARCH_HYBRID_SEARCH_ENABLED.getKey();
+        if (!FeatureFlags.isEnabled(featureFlagKey)) {
+            // the feature is meant to be disabled by default because of the risk of colliding with,
+            // and breaking, concurrent search in core
+            log.info("Not registering hybrid query phase searcher because feature flag [{}] is disabled", featureFlagKey);
+            return Optional.empty();
+        }
+        log.info("Registering hybrid query phase searcher with feature flag [{}]", featureFlagKey);
+        return Optional.of(new HybridQueryPhaseSearcher());
+    }
+
+    /**
+     * Registers the search phase results processors contributed by this plugin.
+     *
+     * @param parameters search pipeline parameters; not read by this implementation
+     * @return map with a single entry, {@link NormalizationProcessor#TYPE} mapped to its processor factory
+     */
+    @Override
+    public Map<String, org.opensearch.search.pipeline.Processor.Factory<SearchPhaseResultsProcessor>> getSearchPhaseResultsProcessors(
+        Parameters parameters
+    ) {
+        final NormalizationProcessorFactory normalizationProcessorFactory = new NormalizationProcessorFactory(
+            normalizationProcessorWorkflow,
+            scoreNormalizationFactory,
+            scoreCombinationFactory
+        );
+        return Map.of(NormalizationProcessor.TYPE, normalizationProcessorFactory);
+    }
+
+    /**
+     * Exposes the settings declared by this plugin.
+     *
+     * @return list containing only the hybrid search feature flag setting
+     */
+    @Override
+    public List<Setting<?>> getSettings() {
+        final List<Setting<?>> pluginSettings = List.of(NEURAL_SEARCH_HYBRID_SEARCH_ENABLED);
+        return pluginSettings;
+    }
 }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/NormalizationProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/NormalizationProcessor.java
@@ -0,0 +1,122 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.neuralsearch.processor;
+
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+import lombok.AllArgsConstructor;
+import lombok.extern.log4j.Log4j2;
+
+import org.opensearch.action.search.QueryPhaseResultConsumer;
+import org.opensearch.action.search.SearchPhaseContext;
+import org.opensearch.action.search.SearchPhaseName;
+import org.opensearch.action.search.SearchPhaseResults;
+import org.opensearch.neuralsearch.processor.combination.ScoreCombinationTechnique;
+import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizationTechnique;
+import org.opensearch.neuralsearch.search.CompoundTopDocs;
+import org.opensearch.search.SearchPhaseResult;
+import org.opensearch.search.internal.SearchContext;
+import org.opensearch.search.pipeline.SearchPhaseResultsProcessor;
+import org.opensearch.search.query.QuerySearchResult;
+
+/**
+ * Search phase results processor that applies score normalization and score combination to query phase results.
+ * Query results are updated with the normalized and combined scores for the next phase (typically FETCH).
+ */
+@Log4j2
+@AllArgsConstructor
+public class NormalizationProcessor implements SearchPhaseResultsProcessor {
+    public static final String TYPE = "normalization-processor";
+
+    private final String tag;
+    private final String description;
+    private final ScoreNormalizationTechnique normalizationTechnique;
+    private final ScoreCombinationTechnique combinationTechnique;
+    private final NormalizationProcessorWorkflow normalizationWorkflow;
+
+    /**
+     * Runs the normalization workflow over the query phase results, using the normalization and combination
+     * techniques this processor was constructed with. Results that are not compatible with this processor
+     * (see {@code shouldSkipProcessor}) are left untouched.
+     * @param searchPhaseResult {@link SearchPhaseResults} DTO holding the query search results; mutated by this method
+     * @param searchPhaseContext {@link SearchPhaseContext} of the request (not a {@link SearchContext}); not read by this method
+     */
+    @Override
+    public <Result extends SearchPhaseResult> void process(
+        final SearchPhaseResults<Result> searchPhaseResult,
+        final SearchPhaseContext searchPhaseContext
+    ) {
+        if (shouldSkipProcessor(searchPhaseResult)) {
+            log.debug("Query results are not compatible with normalization processor");
+            return;
+        }
+        normalizationWorkflow.execute(getQueryPhaseSearchResults(searchPhaseResult), normalizationTechnique, combinationTechnique);
+    }
+
+    /**
+     * @return {@link SearchPhaseName#QUERY}
+     */
+    @Override
+    public SearchPhaseName getBeforePhase() {
+        return SearchPhaseName.QUERY;
+    }
+
+    /**
+     * @return {@link SearchPhaseName#FETCH}
+     */
+    @Override
+    public SearchPhaseName getAfterPhase() {
+        return SearchPhaseName.FETCH;
+    }
+
+    /**
+     * @return processor type, always {@link #TYPE}
+     */
+    @Override
+    public String getType() {
+        return TYPE;
+    }
+
+    /**
+     * @return tag this processor was created with
+     */
+    @Override
+    public String getTag() {
+        return tag;
+    }
+
+    /**
+     * @return description this processor was created with
+     */
+    @Override
+    public String getDescription() {
+        return description;
+    }
+
+    /**
+     * @return always {@code false}
+     */
+    @Override
+    public boolean isIgnoreFailure() {
+        return false;
+    }
+
+    /**
+     * Decides whether processing must be skipped: either the results holder is not a {@link QueryPhaseResultConsumer}
+     * or its first non-null shard result does not look like a hybrid query result.
+     */
+    private <Result extends SearchPhaseResult> boolean shouldSkipProcessor(SearchPhaseResults<Result> searchPhaseResult) {
+        // instanceof evaluates to false for null, so a null argument is skipped here as well
+        if (!(searchPhaseResult instanceof QueryPhaseResultConsumer)) {
+            return true;
+        }
+
+        Optional<SearchPhaseResult> firstNonNullResult = ((QueryPhaseResultConsumer) searchPhaseResult).getAtomicArray()
+            .asList()
+            .stream()
+            .filter(Objects::nonNull)
+            .findFirst();
+        return isNotHybridQuery(firstNonNullResult);
+    }
+
+    /**
+     * A result counts as a hybrid query result only if it is present, has a query result with top docs,
+     * and those top docs are an instance of {@link CompoundTopDocs}.
+     */
+    private boolean isNotHybridQuery(final Optional<SearchPhaseResult> optionalSearchPhaseResult) {
+        // every map step turns a null value into an empty Optional, mirroring a chain of null checks
+        return optionalSearchPhaseResult.map(phaseResult -> phaseResult.queryResult())
+            .map(queryResult -> queryResult.topDocs())
+            .map(topDocsAndMaxScore -> topDocsAndMaxScore.topDocs)
+            .filter(topDocs -> topDocs instanceof CompoundTopDocs)
+            .isEmpty();
+    }
+
+    /**
+     * Collects the query result of every entry in the results holder; a null entry is kept as a null element.
+     */
+    private <Result extends SearchPhaseResult> List<QuerySearchResult> getQueryPhaseSearchResults(
+        final SearchPhaseResults<Result> results
+    ) {
+        return results.getAtomicArray()
+            .asList()
+            .stream()
+            .map(shardResult -> Objects.isNull(shardResult) ? null : shardResult.queryResult())
+            .collect(Collectors.toList());
+    }
+}
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/NormalizationProcessorWorkflow.java b/src/main/java/org/opensearch/neuralsearch/processor/NormalizationProcessorWorkflow.java
@@ -0,0 +1,88 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.neuralsearch.processor;
+
+import java.util.List;
+import java.util.Objects;
+import java.util.stream.Collectors;
+
+import lombok.AllArgsConstructor;
+import lombok.extern.log4j.Log4j2;
+
+import org.opensearch.common.lucene.search.TopDocsAndMaxScore;
+import org.opensearch.neuralsearch.processor.combination.ScoreCombinationTechnique;
+import org.opensearch.neuralsearch.processor.combination.ScoreCombiner;
+import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizationTechnique;
+import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizer;
+import org.opensearch.neuralsearch.search.CompoundTopDocs;
+import org.opensearch.search.query.QuerySearchResult;
+
+/**
+ * Class abstracts steps required for score normalization and combination, this includes pre-processing of incoming data
+ * and post-processing of final results
+ */
+@AllArgsConstructor
+@Log4j2
+public class NormalizationProcessorWorkflow {
+
+    private final ScoreNormalizer scoreNormalizer;
+    private final ScoreCombiner scoreCombiner;
+
+    /**
+     * Start execution of this workflow. Query results that do not carry {@link CompoundTopDocs} (including null
+     * entries) are ignored and left unchanged.
+     * @param querySearchResults input data with QuerySearchResult from multiple shards; eligible entries are updated in place
+     * @param normalizationTechnique technique for score normalization
+     * @param combinationTechnique technique for score combination
+     */
+    public void execute(
+        final List<QuerySearchResult> querySearchResults,
+        final ScoreNormalizationTechnique normalizationTechnique,
+        final ScoreCombinationTechnique combinationTechnique
+    ) {
+        // pre-process data
+        log.debug("Pre-process query results");
+        List<CompoundTopDocs> queryTopDocs = getQueryTopDocs(querySearchResults);
+
+        // normalize
+        log.debug("Do score normalization");
+        scoreNormalizer.normalizeScores(queryTopDocs, normalizationTechnique);
+
+        // combine
+        log.debug("Do score combination");
+        scoreCombiner.combineScores(queryTopDocs, combinationTechnique);
+
+        // post-process data
+        log.debug("Post-process query results after score normalization and combination");
+        updateOriginalQueryResults(querySearchResults, queryTopDocs);
+    }
+
+    /**
+     * Getting list of CompoundTopDocs from list of QuerySearchResult. Only results accepted by
+     * {@link #hasCompoundTopDocs(QuerySearchResult)} contribute an element, so the returned list may be shorter
+     * than the input list.
+     * @param querySearchResults collection of QuerySearchResult for all shards
+     * @return collection of CompoundTopDocs, one object for each eligible shard result, in input order
+     */
+    private List<CompoundTopDocs> getQueryTopDocs(final List<QuerySearchResult> querySearchResults) {
+        return querySearchResults.stream()
+            .filter(NormalizationProcessorWorkflow::hasCompoundTopDocs)
+            .map(searchResult -> (CompoundTopDocs) searchResult.topDocs().topDocs)
+            .collect(Collectors.toList());
+    }
+
+    /**
+     * Writes the processed top docs back into the query results they were extracted from.
+     * @param querySearchResults original query results for all shards
+     * @param queryTopDocs processed top docs as produced by {@link #getQueryTopDocs(List)}
+     */
+    private void updateOriginalQueryResults(final List<QuerySearchResult> querySearchResults, final List<CompoundTopDocs> queryTopDocs) {
+        // queryTopDocs holds entries only for eligible results, so it needs its own cursor; sharing one index
+        // with querySearchResults would pair the wrong shards as soon as a single result had been filtered out.
+        // NOTE(review): assumes normalizer and combiner keep the size and order of queryTopDocs - confirm
+        int nextTopDocsIndex = 0;
+        for (QuerySearchResult querySearchResult : querySearchResults) {
+            if (!hasCompoundTopDocs(querySearchResult)) {
+                continue;
+            }
+            CompoundTopDocs updatedTopDocs = queryTopDocs.get(nextTopDocsIndex++);
+            if (Objects.isNull(updatedTopDocs)) {
+                continue;
+            }
+            // first score doc is taken as the max score; fall back to 0 when there are no hits or no score docs
+            // NOTE(review): presumably the combiner leaves scoreDocs sorted by descending score - confirm
+            boolean hasScoredHits = updatedTopDocs.totalHits.value > 0
+                && Objects.nonNull(updatedTopDocs.scoreDocs)
+                && updatedTopDocs.scoreDocs.length > 0;
+            float maxScore = hasScoredHits ? updatedTopDocs.scoreDocs[0].score : 0.0f;
+            TopDocsAndMaxScore updatedTopDocsAndMaxScore = new TopDocsAndMaxScore(updatedTopDocs, maxScore);
+            querySearchResult.topDocs(updatedTopDocsAndMaxScore, null);
+        }
+    }
+
+    /**
+     * Single eligibility rule shared by extraction and write-back so that both walk the same subset of results.
+     * @param querySearchResult query result of one shard, may be null
+     * @return true if the result is non-null and its top docs are an instance of CompoundTopDocs
+     */
+    private static boolean hasCompoundTopDocs(final QuerySearchResult querySearchResult) {
+        return Objects.nonNull(querySearchResult)
+            && Objects.nonNull(querySearchResult.topDocs())
+            && querySearchResult.topDocs().topDocs instanceof CompoundTopDocs;
+    }
+}