
Commit 5f172b6

synhershko authored and jpountz committed

[Feature] Adding a char_group tokenizer (#24186)
=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a character
which is in a defined set. It is mostly useful for cases where a simple custom
tokenization is desired, and the overhead of the <<analysis-pattern-tokenizer, `pattern` tokenizer>>
is not acceptable.

=== Configuration

The `char_group` tokenizer accepts one parameter:

`tokenize_on_chars`:: A string containing a list of characters to tokenize the string on.
Whenever a character from this list is encountered, a new token is started. Also supports
escaped values like `\\n` and `\\f`, and in addition `\\s` to represent whitespace,
`\\d` to represent digits and `\\w` to represent letters. Defaults to an empty list.

=== Example output

Given the sentence

```
The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2
```

and the configuration `\\s-:<>` for `tokenize_on_chars`, the tokenizer produces the following terms:

```
[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]
```
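The splitting the commit message describes can be reproduced with a plain Lucene `CharTokenizer`, which is also what the factory added in this commit builds on. The following is a minimal sketch, assuming `lucene-analyzers-common` on the classpath; the class name `CharGroupSketch` and the hard-coded break set are illustrative, not part of the commit:

```java
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class CharGroupSketch {
    public static void main(String[] args) throws Exception {
        // Break on whitespace plus '-', ':', '<' and '>', mirroring the
        // `\s-:<>` configuration from the commit message.
        Set<Integer> breakChars =
                new HashSet<>(Arrays.asList((int) '-', (int) ':', (int) '<', (int) '>'));
        CharTokenizer tokenizer = new CharTokenizer() {
            @Override
            protected boolean isTokenChar(int c) {
                // true keeps the character inside the current token.
                return !Character.isWhitespace(c) && !breakChars.contains(c);
            }
        };
        tokenizer.setReader(new StringReader(
                "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term);   // The, 2, QUICK, Brown, Foxes, ..., $2
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```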
1 parent 74474e9 commit 5f172b6

File tree

5 files changed: +297 −0 lines changed

docs/reference/analysis/tokenizers.asciidoc

Lines changed: 7 additions & 0 deletions

```diff
@@ -103,6 +103,11 @@ The `simple_pattern` tokenizer uses a regular expression to capture matching
 text as terms. It uses a restricted subset of regular expression features
 and is generally faster than the `pattern` tokenizer.
 
+<<analysis-chargroup-tokenizer,Char Group Tokenizer>>::
+
+The `char_group` tokenizer is configurable through sets of characters to split
+on, which is usually less expensive than running regular expressions.
+
 <<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::
 
 The `simple_pattern_split` tokenizer uses the same restricted regular expression
@@ -143,6 +148,8 @@ include::tokenizers/keyword-tokenizer.asciidoc[]
 
 include::tokenizers/pattern-tokenizer.asciidoc[]
 
+include::tokenizers/chargroup-tokenizer.asciidoc[]
+
 include::tokenizers/simplepattern-tokenizer.asciidoc[]
 
 include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]
```
docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc

Lines changed: 80 additions & 0 deletions

```asciidoc
[[analysis-chargroup-tokenizer]]
=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a
character which is in a defined set. It is mostly useful for cases where a simple
custom tokenization is desired, and the overhead of the
<<analysis-pattern-tokenizer, `pattern` tokenizer>> is not acceptable.

[float]
=== Configuration

The `char_group` tokenizer accepts one parameter:

[horizontal]
`tokenize_on_chars`::
    A list of characters to tokenize the string on. Whenever a character
    from this list is encountered, a new token is started. This accepts either single
    characters, e.g. `-`, or character groups: `whitespace`, `letter`, `digit`,
    `punctuation`, `symbol`.

[float]
=== Example output

[source,js]
---------------------------
POST _analyze
{
  "tokenizer": {
    "type": "char_group",
    "tokenize_on_chars": [
      "whitespace",
      "-",
      "\n"
    ]
  },
  "text": "The QUICK brown-fox"
}
---------------------------
// CONSOLE

returns

[source,js]
---------------------------
{
  "tokens": [
    {
      "token": "The",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "QUICK",
      "start_offset": 4,
      "end_offset": 9,
      "type": "word",
      "position": 1
    },
    {
      "token": "brown",
      "start_offset": 10,
      "end_offset": 15,
      "type": "word",
      "position": 2
    },
    {
      "token": "fox",
      "start_offset": 16,
      "end_offset": 19,
      "type": "word",
      "position": 3
    }
  ]
}
---------------------------
// TESTRESPONSE
```
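As a rough illustration of what the example configuration amounts to: since `\n` is already covered by the `whitespace` group, the predicate reduces to "not whitespace and not `-`". A hedged, standalone sketch (not part of the commit; `PredicateSketch` is an illustrative name) that hand-rolls the same split:

```java
public class PredicateSketch {
    // The example configuration ["whitespace", "-", "\n"] boils down to this
    // predicate; '\n' is already whitespace, so listing it adds nothing new.
    static boolean isTokenChar(int c) {
        return !Character.isWhitespace(c) && c != '-';
    }

    public static void main(String[] args) {
        StringBuilder token = new StringBuilder();
        for (int c : "The QUICK brown-fox".chars().toArray()) {
            if (isTokenChar(c)) {
                token.append((char) c);
            } else if (token.length() > 0) {
                System.out.println(token); // The, QUICK, brown
                token.setLength(0);
            }
        }
        if (token.length() > 0) {
            System.out.println(token); // fox
        }
    }
}
```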
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java

Lines changed: 135 additions & 0 deletions

```java
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

import java.util.HashSet;
import java.util.Set;

public class CharGroupTokenizerFactory extends AbstractTokenizerFactory {

    // Single break characters, stored as code points.
    private final Set<Integer> tokenizeOnChars = new HashSet<>();
    // Flags for the named character groups accepted in tokenize_on_chars.
    private boolean tokenizeOnSpace = false;
    private boolean tokenizeOnLetter = false;
    private boolean tokenizeOnDigit = false;
    private boolean tokenizeOnPunctuation = false;
    private boolean tokenizeOnSymbol = false;

    public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);

        for (final String c : settings.getAsList("tokenize_on_chars")) {
            if (c == null || c.length() == 0) {
                throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
            }

            if (c.length() == 1) {
                // A literal single character, e.g. "-".
                tokenizeOnChars.add((int) c.charAt(0));
            } else if (c.charAt(0) == '\\') {
                // An escape sequence such as "\\n" or "\\u0024".
                tokenizeOnChars.add((int) parseEscapedChar(c));
            } else {
                // A named character group.
                switch (c) {
                    case "letter":
                        tokenizeOnLetter = true;
                        break;
                    case "digit":
                        tokenizeOnDigit = true;
                        break;
                    case "whitespace":
                        tokenizeOnSpace = true;
                        break;
                    case "punctuation":
                        tokenizeOnPunctuation = true;
                        break;
                    case "symbol":
                        tokenizeOnSymbol = true;
                        break;
                    default:
                        throw new RuntimeException("Invalid escaped char in [" + c + "]");
                }
            }
        }
    }

    private char parseEscapedChar(final String s) {
        int len = s.length();
        char c = s.charAt(0);
        if (c == '\\') {
            if (1 >= len) {
                throw new RuntimeException("Invalid escaped char in [" + s + "]");
            }
            c = s.charAt(1);
            switch (c) {
                case '\\':
                    return '\\';
                case 'n':
                    return '\n';
                case 't':
                    return '\t';
                case 'r':
                    return '\r';
                case 'b':
                    return '\b';
                case 'f':
                    return '\f';
                case 'u':
                    // "\\uXXXX": at most four hex digits after "\\u".
                    if (len > 6) {
                        throw new RuntimeException("Invalid escaped char in [" + s + "]");
                    }
                    return (char) Integer.parseInt(s.substring(2), 16);
                default:
                    throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
            }
        } else {
            throw new RuntimeException("Invalid escaped char [" + s + "]");
        }
    }

    @Override
    public Tokenizer create() {
        return new CharTokenizer() {
            @Override
            protected boolean isTokenChar(int c) {
                // Returning false ends the current token at this character.
                if (tokenizeOnSpace && Character.isWhitespace(c)) {
                    return false;
                }
                if (tokenizeOnLetter && Character.isLetter(c)) {
                    return false;
                }
                if (tokenizeOnDigit && Character.isDigit(c)) {
                    return false;
                }
                if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
                    return false;
                }
                if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
                    return false;
                }
                return !tokenizeOnChars.contains(c);
            }
        };
    }
}
```
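A short worked example of the `\\u` branch in `parseEscapedChar` may help. This standalone sketch is illustrative only (`EscapeDemo` is not part of the commit): it shows how the six-character setting value `\\u0024` resolves to `'$'`, and why the seven-character `\\u00245` used in the tests below is rejected by the `len > 6` check:

```java
public class EscapeDemo {
    public static void main(String[] args) {
        // "\\u0024" in Java source is the six characters \ u 0 0 2 4.
        String s = "\\u0024";
        // Mirrors the 'u' branch above: hex-parse everything after the "\\u" prefix.
        char c = (char) Integer.parseInt(s.substring(2), 16);
        System.out.println(c);                     // prints '$' (code point 36)
        // "\\u00245" has length 7, so the factory throws because len > 6.
        System.out.println("\\u00245".length());   // prints 7
    }
}
```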

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 1 addition & 0 deletions

```diff
@@ -184,6 +184,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
         tokenizers.put("ngram", NGramTokenizerFactory::new);
         tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
         tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
+        tokenizers.put("char_group", CharGroupTokenizerFactory::new);
         tokenizers.put("classic", ClassicTokenizerFactory::new);
         tokenizers.put("letter", LetterTokenizerFactory::new);
         tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
```
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java

Lines changed: 74 additions & 0 deletions

```java
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
    public void testParseTokenChars() {
        final Index index = new Index("test", "_na_");
        final Settings indexSettings = newAnalysisSettingsBuilder().build();
        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
        final String name = "cg";
        // Invalid configurations: an unknown escape, an over-long \\u escape,
        // an unknown group name, and an invalid escaped character.
        for (String[] conf : Arrays.asList(
                new String[] { "\\v" },
                new String[] { "\\u00245" },
                new String[] { "commas" },
                new String[] { "a", "b", "c", "\\$" })) {
            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
            expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).create());
        }

        // Valid configurations: single characters, supported escapes and group names.
        for (String[] conf : Arrays.asList(
                new String[0],
                new String[] { "\\n" },
                new String[] { "\\u0024" },
                new String[] { "whitespace" },
                new String[] { "a", "b", "c" },
                new String[] { "a", "b", "c", "\\r" },
                new String[] { "\\r" },
                new String[] { "f", "o", "o", "symbol" })) {
            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(conf)).build();
            new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
            // no exception
        }
    }

    public void testTokenization() throws IOException {
        final Index index = new Index("test", "_na_");
        final String name = "cg";
        final Settings indexSettings = newAnalysisSettingsBuilder().build();
        // Break on whitespace, ':' and '$' (given as the escape "\\u0024").
        final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", "whitespace", ":", "\\u0024").build();
        Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
                null, name, settings).create();
        tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
        assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
    }
}
```
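Why `$34` becomes `34` in `testTokenization`: the `\\u0024` entry resolves to `'$'`, so the dollar sign is a break character and only the digits survive. A tiny hedged check using `String.split` as a stand-in for the tokenizer (illustrative only, not part of the commit):

```java
import java.util.Arrays;

public class TokenizationWalkthrough {
    public static void main(String[] args) {
        // '$' is code point 0x24, which is what the "\\u0024" setting parses to.
        char dollar = (char) Integer.parseInt("0024", 16);
        System.out.println(dollar == '$');  // true
        // Split on runs of whitespace, ':' and '$', mirroring the test's configuration.
        String[] terms = "foo bar $34 test:test2".split("[\\s:$]+");
        System.out.println(Arrays.toString(terms)); // [foo, bar, 34, test, test2]
    }
}
```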
