Address code review comments

Signed-off-by: zane-neo <zaniu@amazon.com>
zane-neo · Sep 28, 2023 · 3cc3a17 · 3cc3a17
1 parent 86b0d9c
commit 3cc3a17
Show file tree

Hide file tree

Showing 12 changed files with 72 additions and 83 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,9 +14,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.10...2.x)
 ### Features
-Support sparse semantic retrieval by introducing `sparse_encoding` ingest processor and query builder ([#333](https://github.com/opensearch-project/neural-search/pull/333))
+Support sparse semantic retrieval by introducing `neural_sparse` ingest processor and query builder ([#333](https://github.com/opensearch-project/neural-search/pull/333))
 ### Enhancements
-Add `max_token_score` parameter to improve the execution efficiency for `sparse_encoding` query clause ([#348](https://github.com/opensearch-project/neural-search/pull/348))
+Add `max_token_score` parameter to improve the execution efficiency for `neural_sparse` query clause ([#348](https://github.com/opensearch-project/neural-search/pull/348))
 ### Bug Fixes
 ### Infrastructure
 ### Documentation

diff --git a/...arch/query/BoundedLinearFeatureQuery.java → ...che/lucene/BoundedLinearFeatureQuery.java b/...arch/query/BoundedLinearFeatureQuery.java → ...che/lucene/BoundedLinearFeatureQuery.java
@@ -1,39 +1,22 @@
 /*
+ * Copyright OpenSearch Contributors
  * SPDX-License-Identifier: Apache-2.0
- *
- * The OpenSearch Contributors require contributions made to
- * this file be licensed under the Apache-2.0 license or a
- * compatible open source license.
  */
 
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * SPDX-License-Identifier: Apache-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Modifications Copyright OpenSearch Contributors. See
- * GitHub history for details.
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
  */
 
 /*
  * This class is built based on lucene FeatureQuery. We use LinearFuntion to
  * build the query and add an upperbound to it.
  */
 
-package org.opensearch.neuralsearch.query;
+package org.apache.lucene;
 
 import java.io.IOException;
 import java.util.Objects;
@@ -66,6 +49,10 @@
  * To mitigate this issue, we rewrite the FeatureQuery to BoundedLinearFeatureQuery. The caller can
  * set the token score upperbound of this query. And according to our use case, we use LinearFunction
  * as the score function.
+ *
+ * This class combines both <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java">FeatureQuery</a>
+ * and <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/document/FeatureField.java">FeatureField</a> together
+ * and will be deprecated after OpenSearch upgraded lucene to version 9.8.
  */
 
 public final class BoundedLinearFeatureQuery extends Query {
@@ -219,6 +206,7 @@ public String toString(String field) {
     // the field and decodeFeatureValue are modified from FeatureField.decodeFeatureValue
     static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;
 
+    // Rewriting this function to make scoreUpperBound work.
     private float decodeFeatureValue(float freq) {
         if (freq > MAX_FREQ) {
             return scoreUpperBound;

diff --git a/.../neuralsearch/processor/NLPProcessor.java → ...lsearch/processor/InferenceProcessor.java b/.../neuralsearch/processor/NLPProcessor.java → ...lsearch/processor/InferenceProcessor.java
@@ -32,7 +32,7 @@
  * and set the target fields according to the field name map.
  */
 @Log4j2
-public abstract class NLPProcessor extends AbstractProcessor {
+public abstract class InferenceProcessor extends AbstractProcessor {
 
     public static final String MODEL_ID_FIELD = "model_id";
     public static final String FIELD_MAP_FIELD = "field_map";
@@ -51,7 +51,7 @@ public abstract class NLPProcessor extends AbstractProcessor {
 
     private final Environment environment;
 
-    public NLPProcessor(
+    public InferenceProcessor(
         String tag,
         String description,
         String type,
@@ -249,7 +249,7 @@ protected void setVectorFieldsToDocument(IngestDocument ingestDocument, Map<Stri
     @SuppressWarnings({ "unchecked" })
     @VisibleForTesting
     Map<String, Object> buildNLPResult(Map<String, Object> processorMap, List<?> results, Map<String, Object> sourceAndMetadataMap) {
-        NLPProcessor.IndexWrapper indexWrapper = new NLPProcessor.IndexWrapper(0);
+        InferenceProcessor.IndexWrapper indexWrapper = new InferenceProcessor.IndexWrapper(0);
         Map<String, Object> result = new LinkedHashMap<>();
         for (Map.Entry<String, Object> knnMapEntry : processorMap.entrySet()) {
             String knnKey = knnMapEntry.getKey();
@@ -270,7 +270,7 @@ private void putNLPResultToSourceMapForMapType(
         String processorKey,
         Object sourceValue,
         List<?> results,
-        NLPProcessor.IndexWrapper indexWrapper,
+        InferenceProcessor.IndexWrapper indexWrapper,
         Map<String, Object> sourceAndMetadataMap
     ) {
         if (processorKey == null || sourceAndMetadataMap == null || sourceValue == null) return;
@@ -294,7 +294,7 @@ private void putNLPResultToSourceMapForMapType(
     private List<Map<String, Object>> buildNLPResultForListType(
         List<String> sourceValue,
         List<?> results,
-        NLPProcessor.IndexWrapper indexWrapper
+        InferenceProcessor.IndexWrapper indexWrapper
     ) {
         List<Map<String, Object>> keyToResult = new ArrayList<>();
         IntStream.range(0, sourceValue.size())

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java
@@ -22,10 +22,10 @@
  * and field_map can be used to indicate which fields needs text embedding and the corresponding keys for the sparse encoding results.
  */
 @Log4j2
-public final class SparseEncodingProcessor extends NLPProcessor {
+public final class SparseEncodingProcessor extends InferenceProcessor {
 
-    public static final String TYPE = "sparse_encoding";
-    public static final String LIST_TYPE_NESTED_MAP_KEY = "sparse_encoding";
+    public static final String TYPE = "neural_sparse";
+    public static final String LIST_TYPE_NESTED_MAP_KEY = "neural_sparse";
 
     public SparseEncodingProcessor(
         String tag,

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java
@@ -21,7 +21,7 @@
  * and field_map can be used to indicate which fields needs text embedding and the corresponding keys for the text embedding results.
  */
 @Log4j2
-public final class TextEmbeddingProcessor extends NLPProcessor {
+public final class TextEmbeddingProcessor extends InferenceProcessor {
 
     public static final String TYPE = "text_embedding";
     public static final String LIST_TYPE_NESTED_MAP_KEY = "knn";

diff --git a/src/main/java/org/opensearch/neuralsearch/query/SparseEncodingQueryBuilder.java b/src/main/java/org/opensearch/neuralsearch/query/SparseEncodingQueryBuilder.java
@@ -21,6 +21,7 @@
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.lang.builder.EqualsBuilder;
 import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.lucene.BoundedLinearFeatureQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BoostQuery;
@@ -44,7 +45,7 @@
 import com.google.common.annotations.VisibleForTesting;
 
 /**
- * SparseEncodingQueryBuilder is responsible for handling "sparse_encoding" query types. It uses an ML SPARSE_ENCODING model
+ * SparseEncodingQueryBuilder is responsible for handling "neural_sparse" query types. It uses an ML neural_sparse model
  * or SPARSE_TOKENIZE model to produce a Map with String keys and Float values for input text. Then it will be transformed
  * to Lucene FeatureQuery wrapped by Lucene BooleanQuery.
  */
@@ -56,7 +57,7 @@
 @NoArgsConstructor
 @AllArgsConstructor
 public class SparseEncodingQueryBuilder extends AbstractQueryBuilder<SparseEncodingQueryBuilder> {
-    public static final String NAME = "sparse_encoding";
+    public static final String NAME = "neural_sparse";
     @VisibleForTesting
     static final ParseField QUERY_TEXT_FIELD = new ParseField("query_text");
     @VisibleForTesting
@@ -104,7 +105,7 @@ protected void doXContent(XContentBuilder xContentBuilder, Params params) throws
         xContentBuilder.startObject(fieldName);
         xContentBuilder.field(QUERY_TEXT_FIELD.getPreferredName(), queryText);
         xContentBuilder.field(MODEL_ID_FIELD.getPreferredName(), modelId);
-        if (null != maxTokenScore) xContentBuilder.field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), maxTokenScore);
+        if (maxTokenScore != null) xContentBuilder.field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), maxTokenScore);
         printBoostAndQueryName(xContentBuilder);
         xContentBuilder.endObject();
         xContentBuilder.endObject();
@@ -153,7 +154,7 @@ public static SparseEncodingQueryBuilder fromXContent(XContentParser parser) thr
             sparseEncodingQueryBuilder.modelId(),
             String.format(Locale.ROOT, "%s field must be provided for [%s] query", MODEL_ID_FIELD.getPreferredName(), NAME)
         );
-        if (null != sparseEncodingQueryBuilder.maxTokenScore && sparseEncodingQueryBuilder.maxTokenScore <= 0) {
+        if (sparseEncodingQueryBuilder.maxTokenScore != null && sparseEncodingQueryBuilder.maxTokenScore <= 0) {
             throw new IllegalArgumentException(MAX_TOKEN_SCORE_FIELD.getPreferredName() + " must be larger than 0.");
         }
 
@@ -227,7 +228,7 @@ protected Query doToQuery(QueryShardContext context) throws IOException {
         Map<String, Float> queryTokens = queryTokensSupplier.get();
         validateQueryTokens(queryTokens);
 
-        final Float scoreUpperBound = null != maxTokenScore ? maxTokenScore : Float.MAX_VALUE;
+        final Float scoreUpperBound = maxTokenScore != null ? maxTokenScore : Float.MAX_VALUE;
 
         BooleanQuery.Builder builder = new BooleanQuery.Builder();
         for (Map.Entry<String, Float> entry : queryTokens.entrySet()) {

diff --git a/src/main/java/org/opensearch/neuralsearch/util/TokenWeightUtil.java b/src/main/java/org/opensearch/neuralsearch/util/TokenWeightUtil.java
@@ -12,7 +12,7 @@
 import java.util.stream.Collectors;
 
 /**
- * Utility class for working with sparse_encoding queries and ingest processor.
+ * Utility class for working with neural_sparse queries and ingest processor.
  * Used to fetch the (token, weight) Map from the response returned by {@link org.opensearch.neuralsearch.ml.MLCommonsClientAccessor}
  *
  */

diff --git a/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessIT.java b/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessIT.java
@@ -25,7 +25,7 @@
 
 public class SparseEncodingProcessIT extends BaseSparseEncodingIT {
 
-    private static final String INDEX_NAME = "sparse_encoding_index";
+    private static final String INDEX_NAME = "neural_sparse_index";
 
     private static final String PIPELINE_NAME = "pipeline-sparse-encoding";