Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LUCENE-9776: Hunspell: allow to inflect the last part of COMPOUNDRULE compounds #2397

Merged
merged 1 commit into from
Feb 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;

import java.util.ArrayList;
Expand Down Expand Up @@ -397,8 +398,7 @@ private boolean checkCompoundRules(
if (forms != null) {
words.add(forms);

if (dictionary.compoundRules != null
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

null check is done by the caller of this method

&& dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
return true;
}
Expand All @@ -417,13 +417,17 @@ private boolean checkCompoundRules(

private boolean checkLastCompoundPart(
char[] wordChars, int start, int length, List<IntsRef> words) {
IntsRef forms = dictionary.lookupWord(wordChars, start, length);
if (forms == null) return false;
IntsRef ref = new IntsRef(new int[1], 0, 1);
words.add(ref);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reuse the "forms" object at the stack top, change its contents for each root candidate


words.add(forms);
boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words));
Stemmer.RootProcessor stopOnMatching =
(stem, formID, morphDataId) -> {
ref.ints[0] = formID;
return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
};
boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching);
words.remove(words.size() - 1);
return result;
return found;
}

private static boolean isNumber(String s) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,19 +247,12 @@ boolean doStem(
if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
continue;
if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "compoundforbid" check has been extracted (with the affected contexts enumerated explicitly); everything else went into a separate extracted method, since the same logic was needed in applyAffix.

&& dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
return false;
}
if (context.isCompound()) {
if (context != WordContext.COMPOUND_END
&& dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
return false;
}
if (!dictionary.hasFlag(entryId, dictionary.compoundFlag)
&& !dictionary.hasFlag(entryId, context.requiredFlag(dictionary))) {
continue;
}
if (!isRootCompatibleWithContext(context, -1, entryId)) {
continue;
}
if (!callProcessor(word, offset, length, processor, forms, i)) {
return false;
Expand Down Expand Up @@ -540,8 +533,8 @@ private boolean isAffixCompatible(
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
return false;
}
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check becomes a bit more complicated with the addition of a new context, so it's extracted to a method

&& !dictionary.hasFlag(append, dictionary.compoundPermit)) {
return false;
}
if (context == WordContext.COMPOUND_END
Expand All @@ -550,18 +543,17 @@ private boolean isAffixCompatible(
&& dictionary.hasFlag(append, dictionary.onlyincompound)) {
return false;
}
} else if (dictionary.hasFlag(append, dictionary.onlyincompound)) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two similar checks from below are now united here

return false;
}

if (recursionDepth == 0) {
// check if affix is allowed in a non-compound word
return context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound);
return true;
}

if (dictionary.isCrossProduct(affix)) {
// cross check incoming continuation class (flag of previous affix) against list.
if (context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound)) {
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
}
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
}

return false;
Expand Down Expand Up @@ -640,18 +632,10 @@ private boolean applyAffix(
}
}

if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
if (!isRootCompatibleWithContext(context, affix, entryId)) {
continue;
}
if (context.isCompound()) {
char cFlag = context.requiredFlag(dictionary);
if (!dictionary.hasFlag(entryId, cFlag)
&& !isFlagAppendedByAffix(affix, cFlag)
&& !dictionary.hasFlag(entryId, dictionary.compoundFlag)
&& !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
continue;
}
}

if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
return false;
}
Expand Down Expand Up @@ -704,6 +688,20 @@ private boolean applyAffix(
return true;
}

/**
 * Decides whether a dictionary entry is acceptable as a root in the given word context.
 *
 * <ul>
 *   <li>Outside of compounds, an entry is rejected only if it carries the ONLYINCOMPOUND flag.
 *   <li>In {@link WordContext#COMPOUND_RULE_END} no compound flag is demanded at all.
 *   <li>In every other compound context, the entry or the last applied affix must supply either
 *       the flag required by that context or the generic compound flag.
 * </ul>
 *
 * @param context the position of the word being analyzed (simple word or some compound part)
 * @param lastAffix the affix whose appended flags are consulted via {@code
 *     isFlagAppendedByAffix}; a caller in this file passes -1 when checking a bare root
 * @param entryId the dictionary entry whose flags are checked
 * @return whether the root may be used in this context
 */
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
  if (!context.isCompound()) {
    // a simple word can't be built from a root that only lives inside compounds
    return !dictionary.hasFlag(entryId, dictionary.onlyincompound);
  }
  if (context == WordContext.COMPOUND_RULE_END) {
    return true;
  }

  char requiredFlag = context.requiredFlag(dictionary);
  if (dictionary.hasFlag(entryId, requiredFlag)
      || isFlagAppendedByAffix(lastAffix, requiredFlag)) {
    return true;
  }
  // fall back to the generic compound flag, again either on the entry or added by the affix
  return dictionary.hasFlag(entryId, dictionary.compoundFlag)
      || isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
}

private boolean callProcessor(
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
CharsRef stem = new CharsRef(word, offset, length);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,35 @@
package org.apache.lucene.analysis.hunspell;

enum WordContext {
/** non-compound */
SIMPLE_WORD,

/** The first root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
COMPOUND_BEGIN,

/** A middle root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
COMPOUND_MIDDLE,
COMPOUND_END;

/** The final root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
COMPOUND_END,

/**
* The final root in a word with COMPOUNDRULE compounding. The difference to {@link #COMPOUND_END}
* is that this context doesn't require COMPOUNDFLAG/COMPOUNDEND flags, but allows ONLYINCOMPOUND.
*/
COMPOUND_RULE_END;

/** @return {@code false} for {@link #SIMPLE_WORD} and {@code true} for every other context */
boolean isCompound() {
  switch (this) {
    case SIMPLE_WORD:
      return false;
    default:
      return true;
  }
}

/**
 * Tells whether an affix of the given kind is acceptable in this context by itself, i.e. without
 * the caller having to look for an additional permitting flag on the affix.
 *
 * @param isPrefix {@code true} when asking about a prefix, {@code false} for a suffix
 * @return for prefixes, whether this is {@link #COMPOUND_BEGIN}; for suffixes, whether this is
 *     {@link #COMPOUND_END} or {@link #COMPOUND_RULE_END}
 */
boolean isAffixAllowedWithoutSpecialPermit(boolean isPrefix) {
  return isPrefix
      ? this == COMPOUND_BEGIN
      : (this == COMPOUND_END || this == COMPOUND_RULE_END);
}

char requiredFlag(Dictionary dictionary) {
switch (this) {
case COMPOUND_BEGIN:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@ ONLYINCOMPOUND c
COMPOUNDRULE 2
COMPOUNDRULE n*1t
COMPOUNDRULE n*mp

SFX S Y 1
SFX S 0 s
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
7/nm
8/nm
9/nm
0th/pt
0th/ptS
1st/p
1th/tc
2nd/p
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,5 @@
10001st
10011th
1ST
42ND
42ND
10ths