[7.x] [ML] add new multi custom processor for data frame analytics and model inference (#67362) #67595

Merged: 4 commits, Jan 19, 2021
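As a quick orientation before the file-by-file diff, here is a minimal sketch of how the new client-side `Multi` preprocessor could be assembled, chaining an `NGram` into a `OneHotEncoding` over one of the n-gram output fields. The class name, field names, and map values are illustrative, and the `NGram` constructor is assumed to be accessible in the same way the tests further down use it.

```java
import java.util.Arrays;
import java.util.Collections;

import org.elasticsearch.client.ml.inference.preprocessing.Multi;
import org.elasticsearch.client.ml.inference.preprocessing.NGram;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;

public class MultiPreProcessorSketch {
    public static void main(String[] args) {
        // 1-grams and 2-grams over the first 10 characters of "text_field", feature prefix "f"
        // (same constructor shape as in MultiTests below; the values here are made up).
        NGram nGram = new NGram("text_field", Arrays.asList(1, 2), 0, 10, null, "f");

        // One-hot encode one of the n-gram output features, e.g. "f.10".
        OneHotEncoding oneHot = new OneHotEncoding(
            nGram.outputFields().get(0),
            Collections.singletonMap("a", "is_a"),
            null);

        // Chain the two processors with the new Multi preprocessor.
        Multi multi = Multi.builder(Arrays.asList(nGram, oneHot))
            .setCustom(true)
            .build();

        System.out.println(multi.getName()); // multi_encoding
    }
}
```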
MlInferenceNamedXContentProvider.java
@@ -19,6 +19,7 @@
package org.elasticsearch.client.ml.inference;

import org.elasticsearch.client.ml.inference.preprocessing.CustomWordEmbedding;
import org.elasticsearch.client.ml.inference.preprocessing.Multi;
import org.elasticsearch.client.ml.inference.preprocessing.NGram;
import org.elasticsearch.client.ml.inference.trainedmodel.ClassificationConfig;
import org.elasticsearch.client.ml.inference.trainedmodel.InferenceConfig;
@@ -60,6 +61,8 @@ public List<NamedXContentRegistry.Entry> getNamedXContentParsers() {
CustomWordEmbedding::fromXContent));
namedXContent.add(new NamedXContentRegistry.Entry(PreProcessor.class, new ParseField(NGram.NAME),
NGram::fromXContent));
namedXContent.add(new NamedXContentRegistry.Entry(PreProcessor.class, new ParseField(Multi.NAME),
Multi::fromXContent));

// Model
namedXContent.add(new NamedXContentRegistry.Entry(TrainedModel.class, new ParseField(Tree.NAME), Tree::fromXContent));
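Registering `Multi` here is what lets the client resolve a `multi_encoding` entry while parsing. Below is a rough sketch of building that registry and parsing a small document, mirroring `MultiTests.xContentRegistry()` further down; the class name and the JSON payload (including the `one_hot_encoding` body) are illustrative, not taken from this PR.

```java
import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.client.ml.inference.preprocessing.Multi;
import org.elasticsearch.common.xcontent.DeprecationHandler;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;

public class ParseMultiSketch {
    public static void main(String[] args) throws Exception {
        // Registry seeded with all ML inference named objects, now including "multi_encoding".
        NamedXContentRegistry registry =
            new NamedXContentRegistry(new MlInferenceNamedXContentProvider().getNamedXContentParsers());

        // Illustrative multi_encoding body: one one_hot_encoding processor plus the custom flag.
        String json = "{\"processors\":[{\"one_hot_encoding\":"
            + "{\"field\":\"animal\",\"hot_map\":{\"cat\":\"is_cat\"}}}],\"custom\":true}";

        try (XContentParser parser = XContentType.JSON.xContent()
                 .createParser(registry, DeprecationHandler.THROW_UNSUPPORTED_OPERATION, json)) {
            Multi multi = Multi.fromXContent(parser);
            System.out.println(multi.getName()); // multi_encoding
        }
    }
}
```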
Multi.java (new file)
@@ -0,0 +1,119 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.client.ml.inference.preprocessing;


import java.io.IOException;
import java.util.List;
import java.util.Objects;

import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;

/**
* Multi-PreProcessor for chaining together multiple processors
*/
public class Multi implements PreProcessor {

public static final String NAME = "multi_encoding";
public static final ParseField PROCESSORS = new ParseField("processors");
public static final ParseField CUSTOM = new ParseField("custom");

@SuppressWarnings("unchecked")
public static final ConstructingObjectParser<Multi, Void> PARSER = new ConstructingObjectParser<>(
NAME,
true,
a -> new Multi((List<PreProcessor>)a[0], (Boolean)a[1]));
static {
PARSER.declareNamedObjects(ConstructingObjectParser.constructorArg(),
(p, c, n) -> p.namedObject(PreProcessor.class, n, null),
(_unused) -> {/* Does not matter client side*/ },
PROCESSORS);
PARSER.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), CUSTOM);
}

public static Multi fromXContent(XContentParser parser) {
return PARSER.apply(parser, null);
}

private final List<PreProcessor> processors;
private final Boolean custom;

Multi(List<PreProcessor> processors, Boolean custom) {
this.processors = Objects.requireNonNull(processors, PROCESSORS.getPreferredName());
this.custom = custom;
}

@Override
public String getName() {
return NAME;
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
builder.startObject();
NamedXContentObjectHelper.writeNamedObjects(builder, params, true, PROCESSORS.getPreferredName(), processors);
if (custom != null) {
builder.field(CUSTOM.getPreferredName(), custom);
}
builder.endObject();
return builder;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Multi multi = (Multi) o;
return Objects.equals(multi.processors, processors) && Objects.equals(custom, multi.custom);
}

@Override
public int hashCode() {
return Objects.hash(custom, processors);
}

public static Builder builder(List<PreProcessor> processors) {
return new Builder(processors);
}

public static class Builder {
private final List<PreProcessor> processors;
private Boolean custom;

public Builder(List<PreProcessor> processors) {
this.processors = processors;
}

public Builder setCustom(boolean custom) {
this.custom = custom;
return this;
}

public Multi build() {
return new Multi(processors, custom);
}
}

}
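For a sense of the wire format this class produces, here is a small sketch that serializes a `Multi` and prints the JSON; the expected shape in the comment is an approximation and the values are illustrative.

```java
import java.util.Collections;

import org.elasticsearch.client.ml.inference.preprocessing.Multi;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;

public class MultiToXContentSketch {
    public static void main(String[] args) throws Exception {
        OneHotEncoding oneHot =
            new OneHotEncoding("animal", Collections.singletonMap("cat", "is_cat"), null);
        Multi multi = Multi.builder(Collections.singletonList(oneHot)).setCustom(true).build();

        XContentBuilder builder = XContentFactory.jsonBuilder();
        multi.toXContent(builder, ToXContent.EMPTY_PARAMS);
        // Roughly: {"processors":[{"one_hot_encoding":{...}}],"custom":true}
        System.out.println(Strings.toString(builder));
    }
}
```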
NGram.java
@@ -24,8 +24,12 @@
import org.elasticsearch.common.xcontent.XContentParser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.function.IntFunction;
import java.util.stream.IntStream;


/**
@@ -134,6 +138,10 @@ public Boolean getCustom() {
return custom;
}

public List<String> outputFields() {
return allPossibleNGramOutputFeatureNames();
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
@@ -152,6 +160,30 @@ public int hashCode() {
return Objects.hash(field, featurePrefix, start, length, custom, nGrams);
}

private String nGramFeature(int nGram, int pos) {
return featurePrefix
+ "."
+ nGram
+ pos;
}

private List<String> allPossibleNGramOutputFeatureNames() {
int totalNgrams = 0;
for (int nGram : nGrams) {
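// an n-gram of size nGram fits in (length - (nGram - 1)) sliding-window positions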
totalNgrams += (length - (nGram - 1));
}
if (totalNgrams <= 0) {
return Collections.emptyList();
}
List<String> ngramOutputs = new ArrayList<>(totalNgrams);

for (int nGram : nGrams) {
IntFunction<String> func = i -> nGramFeature(nGram, i);
IntStream.range(0, (length - (nGram - 1))).mapToObj(func).forEach(ngramOutputs::add);
}
return ngramOutputs;
}

public static Builder builder(String field) {
return new Builder(field);
}
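A quick worked example of the new `outputFields()` helper: with `length = 5` and n-gram sizes `[1, 2]` it yields `(5 - 0) + (5 - 1) = 9` feature names, each built from the prefix, the n-gram size, and the position. The field name and prefix below are illustrative, and the constructor is assumed to be accessible as in the tests.

```java
import java.util.Arrays;

import org.elasticsearch.client.ml.inference.preprocessing.NGram;

public class NGramOutputFieldsSketch {
    public static void main(String[] args) {
        // 1-grams and 2-grams over a 5-character window starting at offset 0, feature prefix "f".
        NGram nGram = new NGram("text_field", Arrays.asList(1, 2), 0, 5, null, "f");

        // Prints [f.10, f.11, f.12, f.13, f.14, f.20, f.21, f.22, f.23]
        System.out.println(nGram.outputFields());
    }
}
```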
RestHighLevelClientTests.java
@@ -76,6 +76,7 @@
import org.elasticsearch.client.ml.dataframe.stats.regression.RegressionStats;
import org.elasticsearch.client.ml.inference.preprocessing.CustomWordEmbedding;
import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncoding;
import org.elasticsearch.client.ml.inference.preprocessing.Multi;
import org.elasticsearch.client.ml.inference.preprocessing.NGram;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;
import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncoding;
@@ -707,7 +708,7 @@ public void testDefaultNamedXContents() {

public void testProvidedNamedXContents() {
List<NamedXContentRegistry.Entry> namedXContents = RestHighLevelClient.getProvidedNamedXContents();
assertEquals(75, namedXContents.size());
assertEquals(76, namedXContents.size());
Map<Class<?>, Integer> categories = new HashMap<>();
List<String> names = new ArrayList<>();
for (NamedXContentRegistry.Entry namedXContent : namedXContents) {
@@ -794,9 +795,16 @@ public void testProvidedNamedXContents() {
registeredMetricName(Regression.NAME, MeanSquaredLogarithmicErrorMetric.NAME),
registeredMetricName(Regression.NAME, HuberMetric.NAME),
registeredMetricName(Regression.NAME, RSquaredMetric.NAME)));
assertEquals(Integer.valueOf(5), categories.get(org.elasticsearch.client.ml.inference.preprocessing.PreProcessor.class));
assertEquals(Integer.valueOf(6), categories.get(org.elasticsearch.client.ml.inference.preprocessing.PreProcessor.class));
assertThat(names,
hasItems(FrequencyEncoding.NAME, OneHotEncoding.NAME, TargetMeanEncoding.NAME, CustomWordEmbedding.NAME, NGram.NAME));
hasItems(
FrequencyEncoding.NAME,
OneHotEncoding.NAME,
TargetMeanEncoding.NAME,
CustomWordEmbedding.NAME,
NGram.NAME,
Multi.NAME
));
assertEquals(Integer.valueOf(3), categories.get(org.elasticsearch.client.ml.inference.trainedmodel.TrainedModel.class));
assertThat(names, hasItems(Tree.NAME, Ensemble.NAME, LangIdentNeuralNetwork.NAME));
assertEquals(Integer.valueOf(4),
TrainedModelDefinitionTests.java
@@ -19,6 +19,8 @@
package org.elasticsearch.client.ml.inference;

import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.MultiTests;
import org.elasticsearch.client.ml.inference.preprocessing.NGramTests;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests;
import org.elasticsearch.client.ml.inference.trainedmodel.TargetType;
@@ -66,9 +68,12 @@ public static TrainedModelDefinition.Builder createRandomBuilder(TargetType targ
return new TrainedModelDefinition.Builder()
.setPreProcessors(
randomBoolean() ? null :
Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(),
Stream.generate(() -> randomFrom(
FrequencyEncodingTests.createRandom(),
OneHotEncodingTests.createRandom(),
TargetMeanEncodingTests.createRandom()))
TargetMeanEncodingTests.createRandom(),
NGramTests.createRandom(),
MultiTests.createRandom()))
.limit(numberOfProcessors)
.collect(Collectors.toList()))
.setTrainedModel(randomFrom(TreeTests.buildRandomTree(Arrays.asList("foo", "bar"), 6, targetType),
FrequencyEncodingTests.java
@@ -50,14 +50,19 @@ protected FrequencyEncoding createTestInstance() {
}

public static FrequencyEncoding createRandom() {
return createRandom(randomAlphaOfLength(10));
}

public static FrequencyEncoding createRandom(String inputField) {
int valuesSize = randomIntBetween(1, 10);
Map<String, Double> valueMap = new HashMap<>();
for (int i = 0; i < valuesSize; i++) {
valueMap.put(randomAlphaOfLength(10), randomDoubleBetween(0.0, 1.0, false));
}
return new FrequencyEncoding(randomAlphaOfLength(10),
return new FrequencyEncoding(inputField,
randomAlphaOfLength(10),
valueMap,
randomBoolean() ? null : randomBoolean());
}

}
MultiTests.java (new file)
@@ -0,0 +1,88 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.client.ml.inference.preprocessing;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;


public class MultiTests extends AbstractXContentTestCase<Multi> {

@Override
protected Multi doParseInstance(XContentParser parser) throws IOException {
return Multi.fromXContent(parser);
}

@Override
protected Predicate<String> getRandomFieldsExcludeFilter() {
return field -> !field.isEmpty();
}

@Override
protected NamedXContentRegistry xContentRegistry() {
return new NamedXContentRegistry(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
}

@Override
protected boolean supportsUnknownFields() {
return true;
}

@Override
protected Multi createTestInstance() {
return createRandom();
}

public static Multi createRandom() {
final List<PreProcessor> processors;
Boolean isCustom = randomBoolean() ? null : randomBoolean();
if (isCustom == null || isCustom == false) {
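// Non-custom case: generate an NGram first, then encodings whose input fields come from its output fields.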
NGram nGram = new NGram(randomAlphaOfLength(10), Arrays.asList(1, 2), 0, 10, isCustom, "f");
List<PreProcessor> preProcessorList = new ArrayList<>();
preProcessorList.add(nGram);
Stream.generate(() -> randomFrom(
FrequencyEncodingTests.createRandom(randomFrom(nGram.outputFields())),
TargetMeanEncodingTests.createRandom(randomFrom(nGram.outputFields())),
OneHotEncodingTests.createRandom(randomFrom(nGram.outputFields()))
)).limit(randomIntBetween(1, 10)).forEach(preProcessorList::add);
processors = preProcessorList;
} else {
processors = Stream.generate(
() -> randomFrom(
FrequencyEncodingTests.createRandom(),
TargetMeanEncodingTests.createRandom(),
OneHotEncodingTests.createRandom(),
NGramTests.createRandom()
)
).limit(randomIntBetween(2, 10)).collect(Collectors.toList());
}
return new Multi(processors, isCustom);
}

}
NGramTests.java
@@ -44,10 +44,11 @@ protected NGram createTestInstance() {
}

public static NGram createRandom() {
int length = randomIntBetween(1, 10);
return new NGram(randomAlphaOfLength(10),
IntStream.range(1, 5).limit(5).boxed().collect(Collectors.toList()),
IntStream.range(1, Math.min(5, length + 1)).limit(5).boxed().collect(Collectors.toList()),
randomBoolean() ? null : randomIntBetween(0, 10),
randomBoolean() ? null : randomIntBetween(1, 10),
randomBoolean() ? null : length,
randomBoolean() ? null : randomBoolean(),
randomBoolean() ? null : randomAlphaOfLength(10));
}
OneHotEncodingTests.java
@@ -50,12 +50,18 @@ protected OneHotEncoding createTestInstance() {
}

public static OneHotEncoding createRandom() {
return createRandom(randomAlphaOfLength(10));
}

public static OneHotEncoding createRandom(String inputField) {
int valuesSize = randomIntBetween(1, 10);
Map<String, String> valueMap = new HashMap<>();
for (int i = 0; i < valuesSize; i++) {
valueMap.put(randomAlphaOfLength(10), randomAlphaOfLength(10));
}
return new OneHotEncoding(randomAlphaOfLength(10), valueMap, randomBoolean() ? null : randomBoolean());
return new OneHotEncoding(inputField,
valueMap,
randomBoolean() ? null : randomBoolean());
}

}