Skip to content

Commit

Permalink
[Backport 2.x] Use Lucene provided Persian stem (opensearch-project#1…
Browse files Browse the repository at this point in the history
…4847) (opensearch-project#14894)

The Persian stem filter provided by Lucene is not yet hooked up in OpenSearch; this change
wires it in, following the approach already used for Arabic stem support.

Signed-off-by: Ebrahim Byagowi <ebrahim@gnu.org>
Signed-off-by: Daniel (dB.) Doubrovkine <dblock@amazon.com>
Co-authored-by: Daniel (dB.) Doubrovkine <dblock@amazon.com>
(cherry picked from commit 4e45c9e)

Signed-off-by: Daniel (dB.) Doubrovkine <dblock@amazon.com>
Co-authored-by: Daniel (dB.) Doubrovkine <dblock@amazon.com>
Signed-off-by: kkewwei <kkewwei@163.com>
  • Loading branch information
2 people authored and kkewwei committed Jul 24, 2024
1 parent 93d0983 commit f944c5e
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add SplitResponseProcessor to Search Pipelines (([#14800](https://github.com/opensearch-project/OpenSearch/issues/14800)))
- Optimize TransportNodesAction to not send DiscoveryNodes for NodeStats, NodesInfo and ClusterStats call ([14749](https://github.com/opensearch-project/OpenSearch/pull/14749))
- Refactor remote-routing-table service inline with remote state interfaces([#14668](https://github.com/opensearch-project/OpenSearch/pull/14668))
- Add persian_stem filter ([#14847](https://github.com/opensearch-project/OpenSearch/pull/14847))
- Reduce logging in DEBUG for MasterService:run ([#14795](https://github.com/opensearch-project/OpenSearch/pull/14795))
- Enabling term version check on local state for all ClusterManager Read Transport Actions ([#14273](https://github.com/opensearch-project/OpenSearch/pull/14273))
- Add rest, transport layer changes for hot to warm tiering - dedicated setup ([#13980](https://github.com/opensearch-project/OpenSearch/pull/13980))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
Expand Down Expand Up @@ -308,6 +309,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
filters.put("persian_stem", PersianStemTokenFilterFactory::new);
filters.put("porter_stem", PorterStemTokenFilterFactory::new);
filters.put(
"predicate_token_filter",
Expand Down Expand Up @@ -549,6 +551,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
return new NGramTokenFilter(reader, 1, 2, false);
}));
filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("persian_stem", true, PersianStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;

/**
 * Token filter factory that wraps an incoming token stream in Lucene's
 * {@link PersianStemFilter}.
 */
public class PersianStemTokenFilterFactory extends AbstractTokenFilterFactory {

    /**
     * Creates the factory by forwarding the index settings, filter name and
     * filter settings to the parent factory.
     *
     * @param indexSettings settings of the index this filter belongs to
     * @param environment   node environment; not used by this factory
     * @param name          name under which the filter is configured
     * @param settings      settings of this filter
     */
    PersianStemTokenFilterFactory(
        IndexSettings indexSettings,
        Environment environment,
        String name,
        Settings settings
    ) {
        super(indexSettings, name, settings);
    }

    /**
     * Wraps the given stream in a new {@link PersianStemFilter}.
     *
     * @param tokenStream the stream to wrap
     * @return a {@link PersianStemFilter} reading from {@code tokenStream}
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        final TokenStream stemmedStream = new PersianStemFilter(tokenStream);
        return stemmedStream;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.es.SpanishLightStemFilter;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.apache.lucene.analysis.fi.FinnishLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter;
Expand Down Expand Up @@ -239,6 +240,8 @@ public TokenStream create(TokenStream tokenStream) {
return new NorwegianLightStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);
} else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) {
return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);
} else if ("persian".equalsIgnoreCase(language)) {
return new PersianStemFilter(tokenStream);

// Portuguese stemmers
} else if ("portuguese".equalsIgnoreCase(language)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("brazilianstem", BrazilianStemTokenFilterFactory.class);
filters.put("czechstem", CzechStemTokenFilterFactory.class);
filters.put("germanstem", GermanStemTokenFilterFactory.class);
filters.put("persianstem", PersianStemTokenFilterFactory.class);
filters.put("telugunormalization", TeluguNormalizationFilterFactory.class);
filters.put("telugustem", TeluguStemFilterFactory.class);
// this filter is not exposed and should only be used internally
Expand Down Expand Up @@ -220,6 +221,7 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
filters.put("ngram", null);
filters.put("nGram", null);
filters.put("persian_normalization", null);
filters.put("persian_stem", null);
filters.put("porter_stem", null);
filters.put("reverse", ReverseStringFilterFactory.class);
filters.put("russian_stem", SnowballPorterFilterFactory.class);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1781,6 +1781,37 @@
- length: { tokens: 1 }
- match: { tokens.0.token: abschliess }

---
# Exercises the persian_stem token filter in two ways: as a custom filter
# defined in the settings of index "test", and as the pre-configured filter
# referenced directly by name. In both cases the input is passed through the
# keyword tokenizer and exactly one token with the expected value is asserted.
"persian_stem":
# Create index "test" with a custom filter "my_persian_stem" of type persian_stem.
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_persian_stem:
type: persian_stem
# Analyze against index "test" using the custom filter defined above.
- do:
indices.analyze:
index: test
body:
text: جامدات
tokenizer: keyword
filter: [my_persian_stem]
- length: { tokens: 1 }
- match: { tokens.0.token: جامد }

# Test pre-configured token filter too:
# (no index is given here; the filter is referenced by its built-in name)
- do:
indices.analyze:
body:
text: جامدات
tokenizer: keyword
filter: [persian_stem]
- length: { tokens: 1 }
- match: { tokens.0.token: جامد }

---
"russian_stem":
- do:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase {
.put("patterncapturegroup", MovedToAnalysisCommon.class)
.put("patternreplace", MovedToAnalysisCommon.class)
.put("persiannormalization", MovedToAnalysisCommon.class)
.put("persianstem", MovedToAnalysisCommon.class)
.put("porterstem", MovedToAnalysisCommon.class)
.put("portuguesestem", MovedToAnalysisCommon.class)
.put("portugueselightstem", MovedToAnalysisCommon.class)
Expand Down Expand Up @@ -219,7 +220,6 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase {
.put("spanishpluralstem", Void.class)
// LUCENE-10352
.put("daitchmokotoffsoundex", Void.class)
.put("persianstem", Void.class)
// https://github.com/apache/lucene/pull/12169
.put("word2vecsynonym", Void.class)
// https://github.com/apache/lucene/pull/12915
Expand Down

0 comments on commit f944c5e

Please sign in to comment.