Commit 010db38

Author: David Roberts
[ML] Replace the implementation of the categorize_text aggregation
This replaces the implementation of the `categorize_text` aggregation with the new algorithm that was added in elastic#80867. The new algorithm works in the same way as the ML C++ code used for categorization jobs. The docs are updated to reflect the workings of the new implementation.
1 parent fede927 commit 010db38

File tree

42 files changed: +367 -3326 lines


docs/reference/aggregations/bucket/categorize-text-aggregation.asciidoc

Lines changed: 57 additions & 67 deletions
@@ -20,53 +20,6 @@ NOTE: If you have considerable memory allocated to your JVM but are receiving ci
 [[bucket-categorize-text-agg-syntax]]
 ==== Parameters
 
-`field`::
-(Required, string)
-The semi-structured text field to categorize.
-
-`max_unique_tokens`::
-(Optional, integer, default: `50`)
-The maximum number of unique tokens at any position up to `max_matched_tokens`.
-Must be larger than 1. Smaller values use less memory and create fewer categories.
-Larger values will use more memory and create narrower categories.
-Max allowed value is `100`.
-
-`max_matched_tokens`::
-(Optional, integer, default: `5`)
-The maximum number of token positions to match on before attempting to merge categories.
-Larger values will use more memory and create narrower categories.
-Max allowed value is `100`.
-
-Example:
-`max_matched_tokens` of 2 would disallow merging of the categories
-[`foo` `bar` `baz`]
-[`foo` `baz` `bozo`]
-As the first 2 tokens are required to match for the category.
-
-NOTE: Once `max_unique_tokens` is reached at a given position, a new `*` token is
-added and all new tokens at that position are matched by the `*` token.
-
-`similarity_threshold`::
-(Optional, integer, default: `50`)
-The minimum percentage of tokens that must match for text to be added to the
-category bucket.
-Must be between 1 and 100. The larger the value the narrower the categories.
-Larger values will increase memory usage and create narrower categories.
-
-`categorization_filters`::
-(Optional, array of strings)
-This property expects an array of regular expressions. The expressions
-are used to filter out matching sequences from the categorization field values.
-You can use this functionality to fine tune the categorization by excluding
-sequences from consideration when categories are defined. For example, you can
-exclude SQL statements that appear in your log files. This
-property cannot be used at the same time as `categorization_analyzer`. If you
-only want to define simple regular expression filters that are applied prior to
-tokenization, setting this property is the easiest method. If you also want to
-customize the tokenizer or post-tokenization filtering, use the
-`categorization_analyzer` property instead and include the filters as
-`pattern_replace` character filters.
-
 `categorization_analyzer`::
 (Optional, object or string)
 The categorization analyzer specifies how the text is analyzed and tokenized before
@@ -95,14 +48,33 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=tokenizer]
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=filter]
 =====
 
-`shard_size`::
+`categorization_filters`::
+(Optional, array of strings)
+This property expects an array of regular expressions. The expressions
+are used to filter out matching sequences from the categorization field values.
+You can use this functionality to fine tune the categorization by excluding
+sequences from consideration when categories are defined. For example, you can
+exclude SQL statements that appear in your log files. This
+property cannot be used at the same time as `categorization_analyzer`. If you
+only want to define simple regular expression filters that are applied prior to
+tokenization, setting this property is the easiest method. If you also want to
+customize the tokenizer or post-tokenization filtering, use the
+`categorization_analyzer` property instead and include the filters as
+`pattern_replace` character filters.
+
+`field`::
+(Required, string)
+The semi-structured text field to categorize.
+
+`max_matched_tokens`::
 (Optional, integer)
-The number of categorization buckets to return from each shard before merging
-all the results.
+This parameter does nothing now, but is permitted for compatibility with the original
+implementation.
 
-`size`::
-(Optional, integer, default: `10`)
-The number of buckets to return.
+`max_unique_tokens`::
+(Optional, integer)
+This parameter does nothing now, but is permitted for compatibility with the original
+implementation.
 
 `min_doc_count`::
 (Optional, integer)
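As a side note on the `categorization_filters` parameter documented in this hunk: the filters behave like `pattern_replace` character filters applied before tokenization. The sketch below is illustrative only (the class and method names are hypothetical, not Elasticsearch API), showing the effect of stripping regex matches from a message before it is categorized.

```java
import java.util.List;
import java.util.regex.Pattern;

// Hypothetical sketch: mimics how categorization_filters strip matching
// sequences from the field value before tokenization, in the manner of
// pattern_replace character filters.
public class CategorizationFiltersSketch {

    static String applyFilters(String text, List<Pattern> filters) {
        for (Pattern filter : filters) {
            // Each matching sequence is removed before the text is tokenized
            text = filter.matcher(text).replaceAll("");
        }
        return text;
    }

    public static void main(String[] args) {
        // Same style of filter as the docs example: strips tokens like "user_123"
        List<Pattern> filters = List.of(Pattern.compile("\\w+_\\d{3}"));
        System.out.println(applyFilters("User user_123 logged in", filters));
    }
}
```

With the variable token removed, messages that differ only in that token collapse into one category.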
@@ -113,8 +85,23 @@ The minimum number of documents for a bucket to be returned to the results.
 The minimum number of documents for a bucket to be returned from the shard before
 merging.
 
-==== Basic use
+`shard_size`::
+(Optional, integer)
+The number of categorization buckets to return from each shard before merging
+all the results.
+
+`similarity_threshold`::
+(Optional, integer, default: `70`)
+The minimum percentage of token weight that must match for text to be added to the
+category bucket.
+Must be between 1 and 100. The larger the value the narrower the categories.
+Larger values will increase memory usage and create narrower categories.
 
+`size`::
+(Optional, integer, default: `10`)
+The number of buckets to return.
+
+==== Basic use
 
 WARNING: Re-analyzing _large_ result sets will require a lot of time and memory. This aggregation should be
 used in conjunction with <<async-search, Async search>>. Additionally, you may consider
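To make the `similarity_threshold` semantics in the hunk above concrete, here is a deliberately simplified sketch. It is not the Elasticsearch implementation: the real algorithm assigns different weights to different tokens (for example, dictionary words versus variable tokens), whereas this sketch gives every token a weight of 1.

```java
import java.util.List;

// Hypothetical sketch: percentage of token weight shared by a category and a
// new message, compared position by position. Every token has weight 1 here;
// the real implementation weights tokens unevenly.
public class SimilarityThresholdSketch {

    static int matchingTokenWeightPercent(List<String> category, List<String> message) {
        int common = Math.min(category.size(), message.size());
        int matched = 0;
        for (int i = 0; i < common; i++) {
            if (category.get(i).equals(message.get(i))) {
                matched++;
            }
        }
        // Fraction of the longer token list that matched, as a percentage
        return 100 * matched / Math.max(category.size(), message.size());
    }

    public static void main(String[] args) {
        // "Node node-1 started" vs "Node node-2 started": 2 of 3 positions match
        int percent = matchingTokenWeightPercent(
            List.of("Node", "node-1", "started"),
            List.of("Node", "node-2", "started")
        );
        System.out.println(percent);
    }
}
```

Under this toy weighting, a `similarity_threshold` of 70 would put the second message in a new category, while a threshold of 50 would let it join the existing one.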
@@ -223,11 +210,15 @@ category results
 --------------------------------------------------
 
 Here is an example using `categorization_filters`.
-The default analyzer is a whitespace analyzer with a custom token filter
-which filters out tokens that start with any number.
+The default analyzer uses the `ml_standard` tokenizer which is similar to a whitespace tokenizer
+but filters out tokens that could be interpreted as hexadecimal numbers. The default analyzer
+also uses the `first_line_with_letters` character filter, so that only the first meaningful line
+of multi-line messages is considered.
 But, it may be that a token is a known highly-variable token (formatted usernames, emails, etc.). In that case, it is good to supply
-custom `categorization_filters` to filter out those tokens for better categories. These filters will also reduce memory usage as fewer
-tokens are held in memory for the categories.
+custom `categorization_filters` to filter out those tokens for better categories. These filters may also reduce memory usage as fewer
+tokens are held in memory for the categories. (If there are sufficient examples of different usernames, emails, etc., then
+categories will form that naturally discard them as variables, but for small input data where only one example exists this won't
+happen.)
 
 [source,console]
 --------------------------------------------------
@@ -238,8 +229,7 @@ POST log-messages/_search?filter_path=aggregations
     "categorize_text": {
       "field": "message",
       "categorization_filters": ["\\w+\\_\\d{3}"], <1>
-      "max_matched_tokens": 2, <2>
-      "similarity_threshold": 30 <3>
+      "similarity_threshold": 30 <2>
     }
   }
 }
@@ -248,12 +238,12 @@ POST log-messages/_search?filter_path=aggregations
 // TEST[setup:categorize_text]
 <1> The filters to apply to the analyzed tokens. It filters
 out tokens like `bar_123`.
-<2> Require at least 2 tokens before the log categories attempt to merge together
-<3> Require 30% of the tokens to match before expanding a log categories
-to add a new log entry
+<2> Require 30% of token weight to match before adding a message to an
+existing category rather than creating a new one.
 
-The resulting categories are now broad, matching the first token
-and merging the log groups.
+The resulting categories are now very broad, merging the log groups.
+(A `similarity_threshold` of 30% is generally too low. Settings over
+50% are usually better.)
 
 [source,console-result]
 --------------------------------------------------
@@ -263,11 +253,11 @@ and merging the log groups.
       "buckets" : [
         {
           "doc_count" : 4,
-          "key" : "Node *"
+          "key" : "Node"
        },
        {
          "doc_count" : 2,
-          "key" : "User *"
+          "key" : "User"
        }
      ]
    }
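One change in the docs above is that the default analyzer now uses the `ml_standard` tokenizer, which drops tokens that could be interpreted as hexadecimal numbers. The sketch below is a rough, hypothetical approximation of that idea only (the regex and length cutoff are invented, not the actual tokenizer rules): hex-looking tokens are usually variable values such as addresses or hashes, so they carry no categorical signal.

```java
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

// Hypothetical sketch: drop tokens that look like hexadecimal numbers, keeping
// only the structural words of a log message. The real ml_standard tokenizer
// rules differ; this is just the underlying idea.
public class HexTokenSketch {

    private static final Pattern HEX_LIKE = Pattern.compile("(0x)?[0-9a-fA-F]{4,}");

    static List<String> keepStructuralTokens(String message) {
        return Arrays.stream(message.split("\\s+"))
            .filter(token -> HEX_LIKE.matcher(token).matches() == false)
            .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        // "deadbeef" looks like a hex value and is dropped; the words are kept
        System.out.println(keepStructuralTokens("segfault at address deadbeef"));
    }
}
```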

x-pack/plugin/ml/src/internalClusterTest/java/org/elasticsearch/xpack/ml/integration/CategorizationAggregationIT.java renamed to x-pack/plugin/ml/src/internalClusterTest/java/org/elasticsearch/xpack/ml/integration/CategorizeTextAggregationIT.java

Lines changed: 4 additions & 4 deletions
@@ -27,7 +27,7 @@
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.notANumber;
 
-public class CategorizationAggregationIT extends BaseMlIntegTestCase {
+public class CategorizeTextAggregationIT extends BaseMlIntegTestCase {
 
     private static final String DATA_INDEX = "categorization-agg-data";
 
@@ -77,17 +77,17 @@ public void testAggregationWithBroadCategories() {
             .setSize(0)
             .setTrackTotalHits(false)
             .addAggregation(
+                // Overriding the similarity threshold to just 11% (default is 70%) results in the
+                // "Node started" and "Node stopped" messages being grouped in the same category
                 new CategorizeTextAggregationBuilder("categorize", "msg").setSimilarityThreshold(11)
-                    .setMaxUniqueTokens(2)
-                    .setMaxMatchedTokens(1)
                     .subAggregation(AggregationBuilders.max("max").field("time"))
                     .subAggregation(AggregationBuilders.min("min").field("time"))
             )
             .get();
         InternalCategorizationAggregation agg = response.getAggregations().get("categorize");
         assertThat(agg.getBuckets(), hasSize(2));
 
-        assertCategorizationBucket(agg.getBuckets().get(0), "Node *", 4);
+        assertCategorizationBucket(agg.getBuckets().get(0), "Node", 4);
         assertCategorizationBucket(agg.getBuckets().get(1), "Failed to shutdown error org.aaaa.bbbb.Cccc line caused by foo exception", 2);
     }

x-pack/plugin/ml/src/internalClusterTest/java/org/elasticsearch/xpack/ml/integration/CategorizeTextDistributedIT.java

Lines changed: 2 additions & 2 deletions
@@ -16,8 +16,8 @@
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.xpack.ml.aggs.categorization2.CategorizeTextAggregationBuilder;
-import org.elasticsearch.xpack.ml.aggs.categorization2.InternalCategorizationAggregation;
+import org.elasticsearch.xpack.ml.aggs.categorization.CategorizeTextAggregationBuilder;
+import org.elasticsearch.xpack.ml.aggs.categorization.InternalCategorizationAggregation;
 import org.elasticsearch.xpack.ml.support.BaseMlIntegTestCase;
 
 import java.util.Arrays;

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

Lines changed: 1 addition & 10 deletions
@@ -1417,16 +1417,7 @@ public List<AggregationSpec> getAggregations() {
                 CategorizeTextAggregationBuilder::new,
                 CategorizeTextAggregationBuilder.PARSER
             ).addResultReader(InternalCategorizationAggregation::new)
-                .setAggregatorRegistrar(s -> s.registerUsage(CategorizeTextAggregationBuilder.NAME)),
-            // TODO: in the long term only keep one or other of these categorization aggregations
-            new AggregationSpec(
-                org.elasticsearch.xpack.ml.aggs.categorization2.CategorizeTextAggregationBuilder.NAME,
-                org.elasticsearch.xpack.ml.aggs.categorization2.CategorizeTextAggregationBuilder::new,
-                org.elasticsearch.xpack.ml.aggs.categorization2.CategorizeTextAggregationBuilder.PARSER
-            ).addResultReader(org.elasticsearch.xpack.ml.aggs.categorization2.InternalCategorizationAggregation::new)
-                .setAggregatorRegistrar(
-                    s -> s.registerUsage(org.elasticsearch.xpack.ml.aggs.categorization2.CategorizeTextAggregationBuilder.NAME)
-                )
+                .setAggregatorRegistrar(s -> s.registerUsage(CategorizeTextAggregationBuilder.NAME))
         );
     }

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizationBytesRefHash.java

Lines changed: 13 additions & 27 deletions
@@ -15,14 +15,6 @@
 
 class CategorizationBytesRefHash implements Releasable {
 
-    /**
-     * Our special wild card value.
-     */
-    static final BytesRef WILD_CARD_REF = new BytesRef("*");
-    /**
-     * For all WILD_CARD references, the token ID is always -1
-     */
-    static final int WILD_CARD_ID = -1;
     private final BytesRefHash bytesRefHash;
 
     CategorizationBytesRefHash(BytesRefHash bytesRefHash) {
@@ -46,34 +38,28 @@ BytesRef[] getDeeps(int[] ids) {
     }
 
     BytesRef getDeep(long id) {
-        if (id == WILD_CARD_ID) {
-            return WILD_CARD_REF;
-        }
         BytesRef shallow = bytesRefHash.get(id, new BytesRef());
         return BytesRef.deepCopyOf(shallow);
     }
 
     int put(BytesRef bytesRef) {
-        if (WILD_CARD_REF.equals(bytesRef)) {
-            return WILD_CARD_ID;
-        }
         long hash = bytesRefHash.add(bytesRef);
         if (hash < 0) {
+            // BytesRefHash returns -1 - hash if the entry already existed, but we just want to return the hash
             return (int) (-1L - hash);
-        } else {
-            if (hash > Integer.MAX_VALUE) {
-                throw new AggregationExecutionException(
-                    LoggerMessageFormat.format(
-                        "more than [{}] unique terms encountered. "
-                            + "Consider restricting the documents queried or adding [{}] in the {} configuration",
-                        Integer.MAX_VALUE,
-                        CategorizeTextAggregationBuilder.CATEGORIZATION_FILTERS.getPreferredName(),
-                        CategorizeTextAggregationBuilder.NAME
-                    )
-                );
-            }
-            return (int) hash;
         }
+        if (hash > Integer.MAX_VALUE) {
+            throw new AggregationExecutionException(
+                LoggerMessageFormat.format(
+                    "more than [{}] unique terms encountered. "
+                        + "Consider restricting the documents queried or adding [{}] in the {} configuration",
+                    Integer.MAX_VALUE,
+                    CategorizeTextAggregationBuilder.CATEGORIZATION_FILTERS.getPreferredName(),
+                    CategorizeTextAggregationBuilder.NAME
+                )
+            );
+        }
+        return (int) hash;
     }
 
     @Override
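The comment added in this hunk notes that `BytesRefHash.add` returns `-1 - hash` when the entry already existed. That single-return-value convention can be sketched in isolation (the class and method here are hypothetical, for illustration only):

```java
// Hypothetical sketch of the "-1 - hash" convention: an add() that returns the
// new id for a fresh key, and -1 - existingId when the key was already present,
// signals both outcomes in one long without a separate lookup.
public class HashAddConventionSketch {

    // Recover the id regardless of whether the key was newly added or already present.
    static long idOf(long addResult) {
        return addResult < 0 ? -1L - addResult : addResult;
    }

    public static void main(String[] args) {
        System.out.println(idOf(7L));       // newly inserted with id 7
        System.out.println(idOf(-1L - 7L)); // already existed; the same id 7 is recovered
    }
}
```

Because valid ids are non-negative, `-1 - id` is always negative, so the sign of the return value distinguishes "inserted" from "already present" while still carrying the id.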

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization2/CategorizationPartOfSpeechDictionary.java renamed to x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizationPartOfSpeechDictionary.java

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
  * 2.0.
  */
 
-package org.elasticsearch.xpack.ml.aggs.categorization2;
+package org.elasticsearch.xpack.ml.aggs.categorization;
 
 import java.io.BufferedReader;
 import java.io.IOException;
@@ -25,7 +25,7 @@
  */
 public class CategorizationPartOfSpeechDictionary {
 
-    static final String DICTIONARY_FILE_PATH = "/org/elasticsearch/xpack/ml/aggs/categorization2/ml-en.dict";
+    static final String DICTIONARY_FILE_PATH = "/org/elasticsearch/xpack/ml/aggs/categorization/ml-en.dict";
 
     static final String PART_OF_SPEECH_SEPARATOR = "@";
