
Commit 5f172b6

synhershko authored and jpountz committed

[Feature] Adding a char_group tokenizer (#24186)
=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a character
which is in a defined set. It is mostly useful for cases where a simple custom
tokenization is desired, and the overhead of the <<analysis-pattern-tokenizer, `pattern` tokenizer>>
is not acceptable.

=== Configuration

The `char_group` tokenizer accepts one parameter:

`tokenize_on_chars`:: A string containing a list of characters to tokenize the string on.
Whenever a character from this list is encountered, a new token is started. Also supports
escaped values like `\\n` and `\\f`, and in addition `\\s` to represent whitespace,
`\\d` to represent digits and `\\w` to represent letters. Defaults to an empty list.

=== Example output

Given the sentence

```
The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2
```

and the configuration `\\s-:<>` for `tokenize_on_chars`, the tokenizer produces the following terms:

```
[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]
```
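The splitting the commit message describes can be reproduced with a plain Lucene `CharTokenizer`, which is also what the factory added in this commit builds on. The following is a minimal sketch, assuming `lucene-analyzers-common` on the classpath; the class name `CharGroupSketch` and the hard-coded break set are illustrative, not part of the commit:

```java
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class CharGroupSketch {
    public static void main(String[] args) throws Exception {
        // Break on whitespace plus '-', ':', '<' and '>', mirroring the
        // `\s-:<>` configuration from the commit message.
        Set<Integer> breakChars =
                new HashSet<>(Arrays.asList((int) '-', (int) ':', (int) '<', (int) '>'));
        CharTokenizer tokenizer = new CharTokenizer() {
            @Override
            protected boolean isTokenChar(int c) {
                // true keeps the character inside the current token.
                return !Character.isWhitespace(c) && !breakChars.contains(c);
            }
        };
        tokenizer.setReader(new StringReader(
                "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term);   // The, 2, QUICK, Brown, Foxes, ..., $2
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```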
1 parent 74474e9 commit 5f172b6

File tree

5 files changed: +297 −0 lines changed

docs/reference/analysis/tokenizers.asciidoc

Lines changed: 7 additions & 0 deletions

```diff
@@ -103,6 +103,11 @@ The `simple_pattern` tokenizer uses a regular expression to capture matching
 text as terms. It uses a restricted subset of regular expression features
 and is generally faster than the `pattern` tokenizer.
 
+<<analysis-chargroup-tokenizer,Char Group Tokenizer>>::
+
+The `char_group` tokenizer is configurable through sets of characters to split
+on, which is usually less expensive than running regular expressions.
+
 <<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::
 
 The `simple_pattern_split` tokenizer uses the same restricted regular expression
@@ -143,6 +148,8 @@ include::tokenizers/keyword-tokenizer.asciidoc[]
 
 include::tokenizers/pattern-tokenizer.asciidoc[]
 
+include::tokenizers/chargroup-tokenizer.asciidoc[]
+
 include::tokenizers/simplepattern-tokenizer.asciidoc[]
 
 include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]
```
docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc

Lines changed: 80 additions & 0 deletions

```asciidoc
[[analysis-chargroup-tokenizer]]
=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a
character which is in a defined set. It is mostly useful for cases where a simple
custom tokenization is desired, and the overhead of the
<<analysis-pattern-tokenizer, `pattern` tokenizer>> is not acceptable.

[float]
=== Configuration

The `char_group` tokenizer accepts one parameter:

[horizontal]
`tokenize_on_chars`::
    A list of characters to tokenize the string on. Whenever a character
    from this list is encountered, a new token is started. This accepts either single
    characters, e.g. `-`, or character groups: `whitespace`, `letter`, `digit`,
    `punctuation`, `symbol`.

[float]
=== Example output

[source,js]
---------------------------
POST _analyze
{
  "tokenizer": {
    "type": "char_group",
    "tokenize_on_chars": [
      "whitespace",
      "-",
      "\n"
    ]
  },
  "text": "The QUICK brown-fox"
}
---------------------------
// CONSOLE

returns

[source,js]
---------------------------
{
  "tokens": [
    {
      "token": "The",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "QUICK",
      "start_offset": 4,
      "end_offset": 9,
      "type": "word",
      "position": 1
    },
    {
      "token": "brown",
      "start_offset": 10,
      "end_offset": 15,
      "type": "word",
      "position": 2
    },
    {
      "token": "fox",
      "start_offset": 16,
      "end_offset": 19,
      "type": "word",
      "position": 3
    }
  ]
}
---------------------------
// TESTRESPONSE
```
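As a rough illustration of what the example configuration amounts to: since `\n` is already covered by the `whitespace` group, the predicate reduces to "not whitespace and not `-`". A hedged, standalone sketch (not part of the commit; `PredicateSketch` is an illustrative name) that hand-rolls the same split:

```java
public class PredicateSketch {
    // The example configuration ["whitespace", "-", "\n"] boils down to this
    // predicate; '\n' is already whitespace, so listing it adds nothing new.
    static boolean isTokenChar(int c) {
        return !Character.isWhitespace(c) && c != '-';
    }

    public static void main(String[] args) {
        StringBuilder token = new StringBuilder();
        for (int c : "The QUICK brown-fox".chars().toArray()) {
            if (isTokenChar(c)) {
                token.append((char) c);
            } else if (token.length() > 0) {
                System.out.println(token); // The, QUICK, brown
                token.setLength(0);
            }
        }
        if (token.length() > 0) {
            System.out.println(token); // fox
        }
    }
}
```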
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java

Lines changed: 135 additions & 0 deletions

```java
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

import java.util.HashSet;
import java.util.Set;

public class CharGroupTokenizerFactory extends AbstractTokenizerFactory {

    // Single break characters, stored as code points.
    private final Set<Integer> tokenizeOnChars = new HashSet<>();
    // Flags for the named character groups accepted in tokenize_on_chars.
    private boolean tokenizeOnSpace = false;
    private boolean tokenizeOnLetter = false;
    private boolean tokenizeOnDigit = false;
    private boolean tokenizeOnPunctuation = false;
    private boolean tokenizeOnSymbol = false;

    public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);

        for (final String c : settings.getAsList("tokenize_on_chars")) {
            if (c == null || c.length() == 0) {
                throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
            }

            if (c.length() == 1) {
                // A literal single character, e.g. "-".
                tokenizeOnChars.add((int) c.charAt(0));
            } else if (c.charAt(0) == '\\') {
                // An escape sequence such as "\\n" or "\\u0024".
                tokenizeOnChars.add((int) parseEscapedChar(c));
            } else {
                // A named character group.
                switch (c) {
                    case "letter":
                        tokenizeOnLetter = true;
                        break;
                    case "digit":
                        tokenizeOnDigit = true;
                        break;
                    case "whitespace":
                        tokenizeOnSpace = true;
                        break;
                    case "punctuation":
                        tokenizeOnPunctuation = true;
                        break;
                    case "symbol":
                        tokenizeOnSymbol = true;
                        break;
                    default:
                        throw new RuntimeException("Invalid escaped char in [" + c + "]");
                }
            }
        }
    }

    private char parseEscapedChar(final String s) {
        int len = s.length();
        char c = s.charAt(0);
        if (c == '\\') {
            if (1 >= len) {
                throw new RuntimeException("Invalid escaped char in [" + s + "]");
            }
            c = s.charAt(1);
            switch (c) {
                case '\\':
                    return '\\';
                case 'n':
                    return '\n';
                case 't':
                    return '\t';
                case 'r':
                    return '\r';
                case 'b':
                    return '\b';
                case 'f':
                    return '\f';
                case 'u':
                    // "\\uXXXX": at most four hex digits after "\\u".
                    if (len > 6) {
                        throw new RuntimeException("Invalid escaped char in [" + s + "]");
                    }
                    return (char) Integer.parseInt(s.substring(2), 16);
                default:
                    throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
            }
        } else {
            throw new RuntimeException("Invalid escaped char [" + s + "]");
        }
    }

    @Override
    public Tokenizer create() {
        return new CharTokenizer() {
            @Override
            protected boolean isTokenChar(int c) {
                // Returning false ends the current token at this character.
                if (tokenizeOnSpace && Character.isWhitespace(c)) {
                    return false;
                }
                if (tokenizeOnLetter && Character.isLetter(c)) {
                    return false;
                }
                if (tokenizeOnDigit && Character.isDigit(c)) {
                    return false;
                }
                if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
                    return false;
                }
                if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
                    return false;
                }
                return !tokenizeOnChars.contains(c);
            }
        };
    }
}
```
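A short worked example of the `\\u` branch in `parseEscapedChar` may help. This standalone sketch is illustrative only (`EscapeDemo` is not part of the commit): it shows how the six-character setting value `\\u0024` resolves to `'$'`, and why the seven-character `\\u00245` used in the tests below is rejected by the `len > 6` check:

```java
public class EscapeDemo {
    public static void main(String[] args) {
        // "\\u0024" in Java source is the six characters \ u 0 0 2 4.
        String s = "\\u0024";
        // Mirrors the 'u' branch above: hex-parse everything after the "\\u" prefix.
        char c = (char) Integer.parseInt(s.substring(2), 16);
        System.out.println(c);                     // prints '$' (code point 36)
        // "\\u00245" has length 7, so the factory throws because len > 6.
        System.out.println("\\u00245".length());   // prints 7
    }
}
```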

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 1 addition & 0 deletions

```diff
@@ -184,6 +184,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
         tokenizers.put("ngram", NGramTokenizerFactory::new);
         tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
         tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
+        tokenizers.put("char_group", CharGroupTokenizerFactory::new);
         tokenizers.put("classic", ClassicTokenizerFactory::new);
         tokenizers.put("letter", LetterTokenizerFactory::new);
         tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
```
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java

Lines changed: 74 additions & 0 deletions

```java
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
    public void testParseTokenChars() {
        final Index index = new Index("test", "_na_");
        final Settings indexSettings = newAnalysisSettingsBuilder().build();
        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
        final String name = "cg";
        // Invalid configurations: an unknown escape, an over-long \\u escape,
        // an unknown group name, and an invalid escaped character.
        for (String[] conf : Arrays.asList(
                new String[] { "\\v" },
                new String[] { "\\u00245" },
                new String[] { "commas" },
                new String[] { "a", "b", "c", "\\$" })) {
            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
            expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).create());
        }

        // Valid configurations: single characters, supported escapes and group names.
        for (String[] conf : Arrays.asList(
                new String[0],
                new String[] { "\\n" },
                new String[] { "\\u0024" },
                new String[] { "whitespace" },
                new String[] { "a", "b", "c" },
                new String[] { "a", "b", "c", "\\r" },
                new String[] { "\\r" },
                new String[] { "f", "o", "o", "symbol" })) {
            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(conf)).build();
            new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
            // no exception
        }
    }

    public void testTokenization() throws IOException {
        final Index index = new Index("test", "_na_");
        final String name = "cg";
        final Settings indexSettings = newAnalysisSettingsBuilder().build();
        // Break on whitespace, ':' and '$' (given as the escape "\\u0024").
        final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", "whitespace", ":", "\\u0024").build();
        Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
                null, name, settings).create();
        tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
        assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
    }
}
```
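Why `$34` becomes `34` in `testTokenization`: the `\\u0024` entry resolves to `'$'`, so the dollar sign is a break character and only the digits survive. A tiny hedged check using `String.split` as a stand-in for the tokenizer (illustrative only, not part of the commit):

```java
import java.util.Arrays;

public class TokenizationWalkthrough {
    public static void main(String[] args) {
        // '$' is code point 0x24, which is what the "\\u0024" setting parses to.
        char dollar = (char) Integer.parseInt("0024", 16);
        System.out.println(dollar == '$');  // true
        // Split on runs of whitespace, ':' and '$', mirroring the test's configuration.
        String[] terms = "foo bar $34 test:test2".split("[\\s:$]+");
        System.out.println(Arrays.toString(terms)); // [foo, bar, 34, test, test2]
    }
}
```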
