Skip to content

Commit

Permalink
LUCENE-9736: Hunspell: support MAP-based suggestions for groups of si…
Browse files Browse the repository at this point in the history
…milar letters (apache#2314)
  • Loading branch information
donnerpeter authored Feb 8, 2021
1 parent 061233c commit 6536263
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ public class Dictionary {
// Groups of keys, filled from the argument of the KEY option split on '|'.
String[] neighborKeyGroups = new String[0];
// NOTE(review): presumably turned off by the NOSPLITSUGS option; the handler body is not visible here.
boolean enableSplitSuggestions = true;
// Replacement pairs; one RepEntry is appended per parsed entry line (presumably of the REP table).
List<RepEntry> repTable = new ArrayList<>();
// One list per entry line of the MAP table; each list holds the interchangeable
// letters or letter sequences parsed from that line.
List<List<String>> mapTable = new ArrayList<>();

// FSTs used for ICONV/OCONV, output ord pointing to replacement text
FST<CharsRef> iconv;
Expand Down Expand Up @@ -399,6 +400,11 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE);
repTable.add(new RepEntry(parts[1], parts[2]));
}
} else if ("MAP".equals(firstWord)) {
int count = parseNum(reader, line);
for (int i = 0; i < count; i++) {
mapTable.add(parseMapEntry(reader, reader.readLine()));
}
} else if ("KEY".equals(firstWord)) {
neighborKeyGroups = singleArgument(reader, line).split("\\|");
} else if ("NOSPLITSUGS".equals(firstWord)) {
Expand Down Expand Up @@ -462,6 +468,25 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
stripOffsets[currentIndex] = currentOffset;
}

/**
 * Parses a single entry line of the MAP table into its group of interchangeable items.
 *
 * <p>The argument of the line is split into items: a parenthesized run such as {@code (ss)}
 * becomes one multi-character item (without the parentheses), and every other character becomes
 * an item of its own. Characters outside the Basic Multilingual Plane are kept together as one
 * item instead of being split into two unpaired surrogates.
 *
 * @param reader the affix file reader, used for line numbers in error reports
 * @param line the raw entry line, as returned by {@code reader.readLine()}
 * @return the items of this MAP group, in the order they appear on the line
 * @throws ParseException if the line is missing (end of file reached) or an opening parenthesis
 *     has no matching closing one
 */
private List<String> parseMapEntry(LineNumberReader reader, String line) throws ParseException {
  // readLine() yields null at end of stream, which happens when the declared MAP count
  // exceeds the number of remaining lines; report it as a parse problem.
  if (line == null) {
    throw new ParseException("Unexpected end of file in MAP table", reader.getLineNumber());
  }

  String unparsed = singleArgument(reader, line);
  List<String> mapEntry = new ArrayList<>();
  int pos = 0;
  while (pos < unparsed.length()) {
    if (unparsed.charAt(pos) == '(') {
      int closing = unparsed.indexOf(')', pos);
      if (closing < 0) {
        throw new ParseException("Unclosed parenthesis: " + line, reader.getLineNumber());
      }

      mapEntry.add(unparsed.substring(pos + 1, closing));
      pos = closing + 1;
    } else {
      // Advance by a whole code point so that a surrogate pair stays a single item.
      int end = pos + Character.charCount(unparsed.codePointAt(pos));
      mapEntry.add(unparsed.substring(pos, end));
      pos = end;
    }
  }
  return mapEntry;
}

private boolean hasLanguage(String... langCodes) {
if (language == null) return false;
String langCode = extractLanguageCode(language);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ private void tryVariationsOf(String word) {
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
hasGoodSuggestions |= tryRep(word);

if (!speller.dictionary.mapTable.isEmpty()) {
enumerateMapReplacements(word, "", 0);
}

trySwappingChars(word);
tryLongSwap(word);
tryNeighborKeys(word);
Expand Down Expand Up @@ -116,6 +120,27 @@ private boolean tryRep(String word) {
return result.size() > before;
}

/**
 * Recursively enumerates variants of {@code word} in which occurrences of MAP group items are
 * replaced by other items of the same group, and passes each complete variant to
 * {@link #trySuggestion}.
 *
 * @param word the word being varied
 * @param accumulated the variant text built so far for {@code word[0, offset)}
 * @param offset the position in {@code word} that is processed next
 */
private void enumerateMapReplacements(String word, String accumulated, int offset) {
  if (offset == word.length()) {
    trySuggestion(accumulated);
    return;
  }

  for (List<String> entries : speller.dictionary.mapTable) {
    for (String entry : entries) {
      // An empty item (e.g. parsed from "()") matches at every position without consuming
      // any input, so recursing on it would never advance the offset. It may still be used
      // as a replacement below, just not as a match.
      if (entry.isEmpty() || !word.regionMatches(offset, entry, 0, entry.length())) {
        continue;
      }
      for (String replacement : entries) {
        if (!entry.equals(replacement)) {
          enumerateMapReplacements(word, accumulated + replacement, offset + entry.length());
        }
      }
    }
  }

  // Also continue with the current character left unchanged.
  enumerateMapReplacements(word, accumulated + word.charAt(offset), offset + 1);
}

private boolean checkSimpleWord(String part) {
return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,10 @@ public void testModifyingSuggestions2() throws Exception {
doTest("sug2");
}

/** Runs the spell checker expectations stored under the {@code map} test resources. */
public void testMapSuggestions() throws Exception {
  final String resourceName = "map";
  doTest(resourceName);
}

protected void doTest(String name) throws Exception {
checkSpellCheckerExpectations(
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# With MAP suggestion, Hunspell can add missing accents to a word.

# switch off ngram suggestion for testing
MAXNGRAMSUGS 0

MAP 3
MAP uúü
MAP oóö
MAP ß(ss)
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
3
Frühstück
tükörfúró
groß
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Frühstück
tükörfúró
groß
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fruhstuck
tukorfuro
gross

0 comments on commit 6536263

Please sign in to comment.