Skip to content

Commit

Permalink
LUCENE-9787: Hunspell: speed up suggesting a bit by not creating a huge TreeSet (apache#2400)
Browse files Browse the repository at this point in the history
  • Loading branch information
donnerpeter authored Feb 19, 2021
1 parent 58e3b7a commit 3ddc3c0
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.BiConsumer;
Expand Down Expand Up @@ -59,7 +60,7 @@ List<String> suggest(String word, WordCase originalCase, Set<String> prevSuggest

private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
String word, WordCase originalCase) {
TreeSet<Weighted<Root<String>>> roots = new TreeSet<>();
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>();
processFST(
dictionary.words,
(key, forms) -> {
Expand All @@ -80,9 +81,16 @@ private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE))
+ commonPrefix(word, root);

if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
return;
}

entries.forEach(e -> roots.add(new Weighted<>(e, sc)));
while (roots.size() > MAX_ROOTS) {
roots.poll();
}
});
return roots.stream().limit(MAX_ROOTS).collect(Collectors.toList());
return roots.stream().sorted().collect(Collectors.toList());
}

private void processFST(FST<IntsRef> fst, BiConsumer<IntsRef, IntsRef> keyValueConsumer) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
import org.junit.AssumptionViolatedException;
Expand All @@ -54,24 +56,43 @@ public static void resolveCorpora() {

// NOTE(review): this listing is a diff rendered without +/- markers; the checkPerformance call
// looks like the removed line and the checkAnalysisPerformance call its replacement — confirm
// against the commit before treating both statements as live code.
@Test
public void en() throws Exception {
checkPerformance("en", 500_000);
checkAnalysisPerformance("en", 1_000_000);
}

/** Runs the suggestion performance check for the "en" code, requesting 1,000 words. */
@Test
public void en_suggest() throws Exception {
checkSuggestionPerformance("en", 1_000);
}

// NOTE(review): this listing is a diff rendered without +/- markers; the checkPerformance call
// looks like the removed line and the checkAnalysisPerformance call its replacement — confirm
// against the commit before treating both statements as live code.
@Test
public void de() throws Exception {
checkPerformance("de", 200_000);
checkAnalysisPerformance("de", 200_000);
}

/** Runs the suggestion performance check for the "de" code, requesting 30 words. */
@Test
public void de_suggest() throws Exception {
checkSuggestionPerformance("de", 30);
}

// NOTE(review): this listing is a diff rendered without +/- markers; the checkPerformance call
// looks like the removed line and the checkAnalysisPerformance call its replacement — confirm
// against the commit before treating both statements as live code.
@Test
public void fr() throws Exception {
checkPerformance("fr", 40_000);
checkAnalysisPerformance("fr", 40_000);
}

private void checkPerformance(String code, int wordCount) throws Exception {
Path aff = findAffFile(code);
/** Runs the suggestion performance check for the "fr" code, requesting 10 words. */
@Test
public void fr_suggest() throws Exception {
checkSuggestionPerformance("fr", 10);
}

/**
 * Resolves the affix file for the given code via {@code findAffFile}, loads a dictionary from it
 * with {@code TestAllDictionaries.loadDictionary}, and prints the loaded file to standard output.
 *
 * @param code the code used to look up the affix file (e.g. "en", "de", "fr" in the tests here)
 * @return the dictionary loaded from the resolved affix file
 * @throws IOException declared by this method; propagated from the lookup/loading calls
 * @throws ParseException declared by this method; propagated from the loading call
 */
private Dictionary loadDictionary(String code) throws IOException, ParseException {
  final Path affixFile = findAffFile(code);
  final Dictionary loaded = TestAllDictionaries.loadDictionary(affixFile);
  // Report only after loading has succeeded.
  System.out.println("Loaded " + affixFile);
  return loaded;
}

private void checkAnalysisPerformance(String code, int wordCount) throws Exception {
Dictionary dictionary = loadDictionary(code);

List<String> words = loadWords(code, wordCount, dictionary);

Expand All @@ -94,6 +115,25 @@ private void checkPerformance(String code, int wordCount) throws Exception {
System.out.println();
}

/**
 * Measures suggestion generation over the words of a corpus that the speller rejects.
 *
 * <p>Loads the dictionary for {@code code}, requests {@code wordCount} words via {@code
 * loadWords}, keeps only those for which {@code Hunspell.spell} returns false, prints how many
 * remain, and then passes every suggestion result to the consumer supplied by {@code measure}.
 *
 * @param code the code used to load both the dictionary and the word list
 * @param wordCount the word count handed to {@code loadWords}; the number of words actually
 *     measured is whatever survives the misspelling filter
 */
private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
  Dictionary dict = loadDictionary(code);
  Hunspell hunspell = new Hunspell(dict);

  // Only words the speller does not accept are interesting for suggestions.
  List<String> candidates = loadWords(code, wordCount, dict);
  List<String> misspelled =
      candidates.stream()
          .filter(candidate -> !hunspell.spell(candidate))
          .collect(Collectors.toList());
  System.out.println("Checking " + misspelled.size() + " misspelled words");

  String label = "Suggestions for " + code;
  measure(
      label,
      sink -> {
        // Hand each result to the sink supplied by measure.
        for (String misspelledWord : misspelled) {
          sink.accept(hunspell.suggest(misspelledWord));
        }
      });
  System.out.println();
}

private Path findAffFile(String code) throws IOException {
return TestAllDictionaries.findAllAffixFiles()
.filter(
Expand Down

0 comments on commit 3ddc3c0

Please sign in to comment.