Skip to content

Remove special-casing of Synonym filters in AnalysisRegistry #34034

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Sep 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,8 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
filters.put("stemmer_override", requiresAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
filters.put("stemmer", StemmerTokenFilterFactory::new);
filters.put("synonym", requiresAnalysisSettings(SynonymTokenFilterFactory::new));
filters.put("synonym_graph", requiresAnalysisSettings(SynonymGraphTokenFilterFactory::new));
filters.put("trim", TrimTokenFilterFactory::new);
filters.put("truncate", requiresAnalysisSettings(TruncateTokenFilterFactory::new));
filters.put("unique", UniqueTokenFilterFactory::new);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
Expand All @@ -26,16 +26,18 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;

import java.io.IOException;
import java.util.List;
import java.util.function.Function;

public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {

public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
String name, Settings settings) throws IOException {
super(indexSettings, env, analysisRegistry, name, settings);
    /**
     * Package-private constructor; simply delegates to the
     * {@code SynonymTokenFilterFactory} superclass constructor with the
     * same index settings, environment, filter name and filter settings.
     */
    SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
                                   String name, Settings settings) {
        super(indexSettings, env, name, settings);
    }

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
Expand All @@ -26,23 +26,28 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.function.Function;

public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

protected final String format;
protected final boolean expand;
protected final boolean lenient;
private final String format;
private final boolean expand;
private final boolean lenient;
protected final Settings settings;
protected final Environment environment;

public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
String name, Settings settings) throws IOException {
SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env,
String name, Settings settings) {
super(indexSettings, name, settings);
this.settings = settings;

Expand Down Expand Up @@ -83,15 +88,15 @@ public TokenStream create(TokenStream tokenStream) {
};
}

protected Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> tokenFilters) {
    /**
     * Builds the {@link Analyzer} used to parse the synonym rules themselves.
     *
     * Wraps the given tokenizer and char filters in a {@code CustomAnalyzer}
     * named "synonyms"; each supplied token filter is first mapped through
     * {@link TokenFilterFactory#getSynonymFilter()} so filters can substitute
     * a synonym-safe variant of themselves for rule parsing.
     */
    Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
                                  List<TokenFilterFactory> tokenFilters) {
        return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]),
            tokenFilters.stream()
                .map(TokenFilterFactory::getSynonymFilter)
                .toArray(TokenFilterFactory[]::new));
    }

protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
Expand All @@ -107,7 +112,7 @@ protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
}
}

protected Reader getRulesFromSettings(Environment env) {
Reader getRulesFromSettings(Environment env) {
Reader rulesReader;
if (settings.getAsList("synonyms", null) != null) {
List<String> rulesList = Analysis.getWordList(env, settings, "synonyms");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;

import java.util.List;
Expand Down Expand Up @@ -106,6 +105,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("stemmeroverride", StemmerOverrideTokenFilterFactory.class);
filters.put("kstem", KStemTokenFilterFactory.class);
filters.put("synonym", SynonymTokenFilterFactory.class);
filters.put("synonymgraph", SynonymGraphTokenFilterFactory.class);
filters.put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class);
filters.put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class);
filters.put("reversestring", ReverseTokenFilterFactory.class);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,31 @@

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.test.ESIntegTestCase;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;

import static org.elasticsearch.client.Requests.searchRequest;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchPhrasePrefixQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight;
import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.startsWith;

Expand Down Expand Up @@ -153,4 +163,165 @@ public void testMultiPhraseCutoff() throws IOException {
+ "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
+ "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
}

public void testSynonyms() throws IOException {
    // Index whose search analyzer applies a synonym filter equating "fast" and "quick".
    Settings.Builder settings = Settings.builder()
            .put(indexSettings())
            .put("index.analysis.analyzer.synonym.tokenizer", "standard")
            .putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase")
            .put("index.analysis.filter.synonym.type", "synonym")
            .putList("index.analysis.filter.synonym.synonyms", "fast,quick");

    assertAcked(prepareCreate("test").setSettings(settings.build())
        .addMapping("type1", "field1",
            "type=text,term_vector=with_positions_offsets,search_analyzer=synonym," +
            "analyzer=standard,index_options=offsets"));
    ensureGreen();

    client().prepareIndex("test", "type1", "0")
            .setSource("field1", "The quick brown fox jumps over the lazy dog")
            .get();
    refresh();

    // Exercise every highlighter implementation against the same index.
    for (String type : Arrays.asList("plain", "fvh", "unified")) {
        logger.info("--> highlighting (type=" + type + ") and searching on field1");

        // Literal match: all query terms occur verbatim in the document.
        SearchSourceBuilder search = searchSource()
                .query(matchQuery("field1", "quick brown fox").operator(Operator.AND))
                .highlighter(highlight()
                        .field("field1")
                        .order("score")
                        .preTags("<x>")
                        .postTags("</x>")
                        .highlighterType(type));
        SearchResponse response = client().search(searchRequest("test").source(search)).actionGet();
        assertHighlight(response, 0, "field1", 0, 1,
            equalTo("The <x>quick</x> <x>brown</x> <x>fox</x> jumps over the lazy dog"));

        // Synonym match: "fast" only matches because the search analyzer expands it to "quick".
        search = searchSource()
                .query(matchQuery("field1", "fast brown fox").operator(Operator.AND))
                .highlighter(highlight().field("field1").order("score").preTags("<x>").postTags("</x>"));
        response = client().search(searchRequest("test").source(search)).actionGet();
        assertHighlight(response, 0, "field1", 0, 1,
            equalTo("The <x>quick</x> <x>brown</x> <x>fox</x> jumps over the lazy dog"));
    }
}

/**
 * Highlighting of match_phrase_prefix queries, both without synonyms (field0/field1
 * of first_test_index, which use the standard analyzer) and with a one-way
 * "quick => fast" synonym rule (field3/field4 of second_test_index, analyzed
 * with the "synonym" analyzer defined below).
 */
public void testPhrasePrefix() throws IOException {
    // Analyzer with a one-directional synonym rule: "quick" is rewritten to "fast" at analysis time.
    Settings.Builder builder = Settings.builder()
        .put(indexSettings())
        .put("index.analysis.analyzer.synonym.tokenizer", "standard")
        .putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase")
        .put("index.analysis.filter.synonym.type", "synonym")
        .putList("index.analysis.filter.synonym.synonyms", "quick => fast");

    // first_test_index does NOT use the synonym analyzer (see type1TermVectorMapping).
    assertAcked(prepareCreate("first_test_index").setSettings(builder.build()).addMapping("type1", type1TermVectorMapping()));

    ensureGreen();

    client().prepareIndex("first_test_index", "type1", "0").setSource(
        "field0", "The quick brown fox jumps over the lazy dog",
        "field1", "The quick brown fox jumps over the lazy dog").get();
    client().prepareIndex("first_test_index", "type1", "1").setSource("field1",
        "The quick browse button is a fancy thing, right bro?").get();
    refresh();
    logger.info("--> highlighting and searching on field0");

    // Single-term prefix: "bro" should highlight "brown".
    SearchSourceBuilder source = searchSource()
        .query(matchPhrasePrefixQuery("field0", "bro"))
        .highlighter(highlight().field("field0").order("score").preTags("<x>").postTags("</x>"));
    SearchResponse searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet();

    assertHighlight(searchResponse, 0, "field0", 0, 1, equalTo("The quick <x>brown</x> fox jumps over the lazy dog"));

    // Two-term phrase prefix: both "quick" and the "bro" prefix match are highlighted.
    source = searchSource()
        .query(matchPhrasePrefixQuery("field0", "quick bro"))
        .highlighter(highlight().field("field0").order("score").preTags("<x>").postTags("</x>"));

    searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet();
    assertHighlight(searchResponse, 0, "field0", 0, 1,
        equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));

    logger.info("--> highlighting and searching on field1");
    // Disjunction of two prefix queries; both indexed docs should match one clause each.
    source = searchSource()
        .query(boolQuery()
            .should(matchPhrasePrefixQuery("field1", "test"))
            .should(matchPhrasePrefixQuery("field1", "bro"))
        )
        .highlighter(highlight().field("field1").order("score").preTags("<x>").postTags("</x>"));

    searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet();
    assertThat(searchResponse.getHits().totalHits, equalTo(2L));
    // Hit order is score-dependent, so accept either doc's highlight at either position.
    for (int i = 0; i < 2; i++) {
        assertHighlight(searchResponse, i, "field1", 0, 1, anyOf(
            equalTo("The quick <x>browse</x> button is a fancy thing, right <x>bro</x>?"),
            equalTo("The quick <x>brown</x> fox jumps over the lazy dog")));
    }

    source = searchSource()
        .query(matchPhrasePrefixQuery("field1", "quick bro"))
        .highlighter(highlight().field("field1").order("score").preTags("<x>").postTags("</x>"));

    searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet();

    // "bro" prefix-matches both "browse" and "brown"; again order is not fixed.
    assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf(
        equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
        equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")));
    assertHighlight(searchResponse, 1, "field1", 0, 1, anyOf(
        equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
        equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")));

    // second_test_index analyzes field3/field4 with the synonym analyzer at index time.
    assertAcked(prepareCreate("second_test_index").setSettings(builder.build()).addMapping("doc",
        "field4", "type=text,term_vector=with_positions_offsets,analyzer=synonym",
        "field3", "type=text,analyzer=synonym"));
    // with synonyms
    client().prepareIndex("second_test_index", "doc", "0").setSource(
        "type", "type2",
        "field4", "The quick brown fox jumps over the lazy dog",
        "field3", "The quick brown fox jumps over the lazy dog").get();
    client().prepareIndex("second_test_index", "doc", "1").setSource(
        "type", "type2",
        "field4", "The quick browse button is a fancy thing, right bro?").get();
    client().prepareIndex("second_test_index", "doc", "2").setSource(
        "type", "type2",
        "field4", "a quick fast blue car").get();
    refresh();

    // Querying "fast" matches the indexed form of "quick" (rewritten by the synonym rule),
    // so "quick" is highlighted in the stored source.
    source = searchSource().postFilter(termQuery("type", "type2")).query(matchPhrasePrefixQuery("field3", "fast bro"))
        .highlighter(highlight().field("field3").order("score").preTags("<x>").postTags("</x>"));

    searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet();

    assertHighlight(searchResponse, 0, "field3", 0, 1,
        equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));

    logger.info("--> highlighting and searching on field4");
    source = searchSource().postFilter(termQuery("type", "type2")).query(matchPhrasePrefixQuery("field4", "the fast bro"))
        .highlighter(highlight().field("field4").order("score").preTags("<x>").postTags("</x>"));
    searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet();

    assertHighlight(searchResponse, 0, "field4", 0, 1, anyOf(
        equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
        equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")));
    assertHighlight(searchResponse, 1, "field4", 0, 1, anyOf(
        equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
        equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")));

    logger.info("--> highlighting and searching on field4");
    // "quick" in the query is also rewritten to "fast", so both terms hit doc 2;
    // the exact snippet shape varies by highlighter, hence the anyOf.
    source = searchSource().postFilter(termQuery("type", "type2"))
        .query(matchPhrasePrefixQuery("field4", "a fast quick blue ca"))
        .highlighter(highlight().field("field4").order("score").preTags("<x>").postTags("</x>"));
    searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet();

    assertHighlight(searchResponse, 0, "field4", 0, 1,
        anyOf(equalTo("<x>a quick fast blue car</x>"),
            equalTo("<x>a</x> <x>quick</x> <x>fast</x> <x>blue</x> <x>car</x>")));
}

/**
 * Mapping for type "type1": two text fields ("field1", "field2") that store
 * term vectors with positions and offsets, as required by the FVH highlighter.
 */
public static XContentBuilder type1TermVectorMapping() throws IOException {
    XContentBuilder mapping = XContentFactory.jsonBuilder();
    mapping.startObject()
           .startObject("type1")
           .startObject("properties");
    // Both fields share an identical definition.
    for (String field : new String[] {"field1", "field2"}) {
        mapping.startObject(field)
               .field("type", "text")
               .field("term_vector", "with_positions_offsets")
               .endObject();
    }
    mapping.endObject()  // properties
           .endObject()  // type1
           .endObject();
    return mapping;
}
}
Loading