diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index b8a9a676678f..e211111a0647 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -26,6 +26,7 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Objects; +import java.util.PriorityQueue; import java.util.Set; import java.util.TreeSet; import java.util.function.BiConsumer; @@ -59,7 +60,7 @@ List suggest(String word, WordCase originalCase, Set prevSuggest private List>> findSimilarDictionaryEntries( String word, WordCase originalCase) { - TreeSet>> roots = new TreeSet<>(); + PriorityQueue>> roots = new PriorityQueue<>(); processFST( dictionary.words, (key, forms) -> { @@ -80,9 +81,16 @@ private List>> findSimilarDictionaryEntries( ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE)) + commonPrefix(word, root); + if (roots.size() == MAX_ROOTS && sc < roots.peek().score) { + return; + } + entries.forEach(e -> roots.add(new Weighted<>(e, sc))); + while (roots.size() > MAX_ROOTS) { + roots.poll(); + } }); - return roots.stream().limit(MAX_ROOTS).collect(Collectors.toList()); + return roots.stream().sorted().collect(Collectors.toList()); } private void processFST(FST fst, BiConsumer keyValueConsumer) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java index 8ae56420cf51..4cd54ba8d324 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java @@ -25,10 +25,12 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.text.ParseException; import java.util.ArrayList; import java.util.List; import java.util.function.Consumer; import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.apache.lucene.util.LuceneTestCase; import org.junit.Assume; import org.junit.AssumptionViolatedException; @@ -54,24 +56,43 @@ public static void resolveCorpora() { @Test public void en() throws Exception { - checkPerformance("en", 500_000); + checkAnalysisPerformance("en", 1_000_000); + } + + @Test + public void en_suggest() throws Exception { + checkSuggestionPerformance("en", 1_000); } @Test public void de() throws Exception { - checkPerformance("de", 200_000); + checkAnalysisPerformance("de", 200_000); + } + + @Test + public void de_suggest() throws Exception { + checkSuggestionPerformance("de", 30); } @Test public void fr() throws Exception { - checkPerformance("fr", 40_000); + checkAnalysisPerformance("fr", 40_000); } - private void checkPerformance(String code, int wordCount) throws Exception { - Path aff = findAffFile(code); + @Test + public void fr_suggest() throws Exception { + checkSuggestionPerformance("fr", 10); + } + private Dictionary loadDictionary(String code) throws IOException, ParseException { + Path aff = findAffFile(code); Dictionary dictionary = TestAllDictionaries.loadDictionary(aff); System.out.println("Loaded " + aff); + return dictionary; + } + + private void checkAnalysisPerformance(String code, int wordCount) throws Exception { + Dictionary dictionary = loadDictionary(code); List words = loadWords(code, wordCount, dictionary); @@ -94,6 +115,25 @@ private void checkPerformance(String code, int wordCount) throws Exception { System.out.println(); } + private void checkSuggestionPerformance(String code, int wordCount) throws Exception { + Dictionary dictionary = loadDictionary(code); + Hunspell speller = new Hunspell(dictionary); + List words = + loadWords(code, wordCount, dictionary).stream() + .filter(w -> !speller.spell(w)) + .collect(Collectors.toList()); + System.out.println("Checking " + words.size() + " misspelled words"); + + measure( + "Suggestions for " + code, + blackHole -> { + for (String word : words) { + blackHole.accept(speller.suggest(word)); + } + }); + System.out.println(); + } + private Path findAffFile(String code) throws IOException { return TestAllDictionaries.findAllAffixFiles() .filter(