From 3be6ca688dacb1e24fbdd475e62515e35e201c6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ale=C5=9B=20Bu=C5=82oj=C4=8Dyk?= Date: Tue, 19 Sep 2023 20:00:16 +0300 Subject: [PATCH] Move GrammarFinder to GrammarDB --- .project | 11 ++ build.xml | 2 +- src/org/alex73/grammardb/GrammarFinder.java | 154 ++++++++++++++++++++ 3 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 src/org/alex73/grammardb/GrammarFinder.java diff --git a/.project b/.project index dfa4b62..40d08fc 100644 --- a/.project +++ b/.project @@ -14,4 +14,15 @@ org.eclipse.jdt.core.javanature + + + 1695143686020 + + 10 + + org.eclipse.ui.ide.multiFilter + 1.0-projectRelativePath-matches-false-false-build + + + diff --git a/build.xml b/build.xml index 12b539e..d63ed68 100644 --- a/build.xml +++ b/build.xml @@ -1,7 +1,7 @@ - + diff --git a/src/org/alex73/grammardb/GrammarFinder.java b/src/org/alex73/grammardb/GrammarFinder.java new file mode 100644 index 0000000..6fbee79 --- /dev/null +++ b/src/org/alex73/grammardb/GrammarFinder.java @@ -0,0 +1,154 @@ +package org.alex73.grammardb; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +import org.alex73.grammardb.structures.Fan; +import org.alex73.grammardb.structures.Paradigm; + +/** + * Хутка шукае парадыгмы, ў якіх ёсць форма, падобная на патрэбнае слова. Задача + * не ў тым, каб знайсці дакладны спіс парадыгмаў, а каб абмежаваць далейшы + * пошук некалькімі дзясяткамі парадыгмаў, а не перабіраць усе 260 тысяч. + * + * Для гэтага ствараецца хэш-табліца, якая запаўняецца на старце, і потым пошук + * ідзе толькі па гэтай хэш-табліцы. 
+ *
+ * <p>English summary: quickly finds the paradigms that contain a form resembling the
+ * requested word. The goal is not an exact list of paradigms but a short candidate list
+ * (a few dozen paradigms), so that callers do not have to go through all 260 thousand
+ * of them. A hash table is filled in the constructor and every later lookup reads only
+ * that table.
+ */
+public class GrammarFinder {
+    /** Number of buckets in the lookup table. */
+    private static final int HASHTABLE_SIZE = 256 * 1024;
+    /** Returned for buckets that hold no paradigm. */
+    private static final Paradigm[] EMPTY = new Paradigm[0];
+    /** Bucket index to the paradigms hashed into that bucket; {@code null} for empty buckets. */
+    private final Paradigm[][] table;
+    /** Normalized key (no '-', 'ґ' folded to 'г', lower case) to the original morph string. */
+    private final Map<String, String> morph = new HashMap<>();
+    /** Normalized key (no '+', lower case) to the fan value. */
+    private final Map<String, String> fan = new HashMap<>();
+    /**
+     * Per-character normalization used by {@link #hash(String)}: the index is the char code,
+     * the value is the char that is actually hashed, and 0 means "skip this character".
+     */
+    protected final char[] LETTERS_HASH = new char[0x2020];
+
+    /**
+     * Builds the normalization table and then indexes every lemma, every non-empty form,
+     * every morph and every fan entry of all paradigms in {@code gr}. The paradigms are
+     * processed with a parallel stream; the elapsed time is printed to standard output.
+     *
+     * @param gr grammar database whose paradigms are indexed
+     * @throws RuntimeException if two different morph strings normalize to the same key
+     */
+    public GrammarFinder(GrammarDB2 gr) {
+        // Letters and digits hash as their lower-case form; every other character keeps
+        // the array default 0 and is therefore skipped by hash().
+        for (char c = 0; c < LETTERS_HASH.length; c++) {
+            if (Character.isLetterOrDigit(c)) {
+                LETTERS_HASH[c] = Character.toLowerCase(c);
+            }
+        }
+        // Additionally convert soft letters to hard ones. The array holds (from, to) pairs;
+        // the soft sign maps to 0, so it does not take part in the hash at all.
+        char[] map = new char[] { 'ґ', 'г', 'ў', 'у', 'й', 'і', 'ё', 'о', 'е', 'э', 'я', 'а', 'ю', 'у', 'ь', '\0' };
+        for (int i = 0; i < map.length; i += 2) {
+            LETTERS_HASH[map[i]] = map[i + 1];
+            LETTERS_HASH[Character.toUpperCase(map[i])] = map[i + 1];
+        }
+
+        long be = System.currentTimeMillis();
+        // One growable list per bucket while filling; frozen into arrays afterwards.
+        final List<List<Paradigm>> prepare = new ArrayList<>(HASHTABLE_SIZE);
+        for (int i = 0; i < HASHTABLE_SIZE; i++) {
+            prepare.add(new ArrayList<>());
+        }
+        gr.getAllParadigms().parallelStream().forEach(p -> {
+            p.getVariant().forEach(v -> {
+                putToPrepare(v.getLemma(), prepare, p);
+                v.getForm().forEach(f -> {
+                    if (f.getValue() != null && !f.getValue().isEmpty()) {
+                        putToPrepare(f.getValue(), prepare, p);
+                    }
+                });
+                v.getMorph().forEach(m -> putToMorph(m));
+                v.getFan().forEach(f -> putToFan(f));
+            });
+        });
+        table = prepareToFinal(prepare);
+        long af = System.currentTimeMillis();
+        System.out.println("GrammarFinder prepare time: " + (af - be) + "ms");
+    }
+
+    /**
+     * Maps a word to its bucket in the lookup table.
+     *
+     * <p>The remainder is taken before the absolute value. {@code Math.abs(Integer.MIN_VALUE)}
+     * is still negative, so {@code Math.abs(hash) % size} is only safe while the size divides
+     * 2^31. This form selects the same bucket for every hash with the current size and stays
+     * inside {@code [0, HASHTABLE_SIZE)} for any size.
+     */
+    private int bucketIndex(String w) {
+        return Math.abs(hash(w) % HASHTABLE_SIZE);
+    }
+
+    /**
+     * Adds {@code p} to the bucket of {@code w} unless that very instance is already there.
+     * The bucket list is locked because the constructor calls this from a parallel stream.
+     */
+    private void putToPrepare(String w, List<List<Paradigm>> prepare, Paradigm p) {
+        List<Paradigm> list = prepare.get(bucketIndex(w));
+        synchronized (list) {
+            // Identity comparison on purpose: the same paradigm object is added once per bucket.
+            for (int i = 0; i < list.size(); i++) {
+                if (list.get(i) == p) {
+                    return;
+                }
+            }
+            list.add(p);
+        }
+    }
+
+    /**
+     * Registers a morph string under its normalized key.
+     *
+     * @throws RuntimeException if the key is already bound to a different morph string
+     */
+    private void putToMorph(String m) {
+        String key = m.replace("-", "").replace('ґ', 'г').toLowerCase();
+        synchronized (morph) {
+            String prev = morph.put(key, m);
+            if (prev != null && !prev.equals(m)) {
+                throw new RuntimeException("Different morph for " + key + ": " + m + " / " + prev);
+            }
+        }
+    }
+
+    /**
+     * Registers a fan value under its normalized key. A conflicting earlier value is only
+     * reported on standard error; the newer value stays in the map.
+     */
+    private void putToFan(Fan f) {
+        String key = f.getS().replace("+", "").toLowerCase();
+        String value = f.getValue();
+        synchronized (fan) {
+            String prev = fan.put(key, value);
+            if (prev != null && !prev.equals(value)) {
+                System.err.println("Different fan for " + key + ": " + value + " / " + prev);
+            }
+        }
+    }
+
+    /**
+     * Converts the per-bucket lists into arrays, leaving {@code null} for empty buckets,
+     * and prints the size of the largest bucket to standard output.
+     */
+    private Paradigm[][] prepareToFinal(List<List<Paradigm>> prepare) {
+        Paradigm[][] result = new Paradigm[prepare.size()][];
+        int maxLen = 0;
+        for (int i = 0; i < result.length; i++) {
+            List<Paradigm> list = prepare.get(i);
+            if (!list.isEmpty()) {
+                result[i] = list.toArray(new Paradigm[0]);
+                maxLen = Math.max(maxLen, result[i].length);
+            }
+        }
+        System.out.println("GrammarFinder max table tail: " + maxLen);
+        return result;
+    }
+
+    /**
+     * Find paradigms by lemma or form (lower case).
+     *
+     * <p>The result is the whole bucket the word hashes into, so it can contain paradigms
+     * of other words that share the bucket. The returned array is the table's own array,
+     * not a copy.
+     *
+     * @param word lemma or form to look up; {@code null} hashes to 0
+     * @return candidate paradigms, or an empty array if the bucket is empty; never {@code null}
+     */
+    public Paradigm[] getParadigms(String word) {
+        Paradigm[] result = table[bucketIndex(word)];
+        return result != null ? result : EMPTY;
+    }
+
+    /**
+     * @return a stream over all non-empty buckets of the lookup table
+     */
+    public Stream<Paradigm[]> getSimilarGroups() {
+        return Arrays.stream(table).filter(r -> r != null);
+    }
+
+    /**
+     * Looks up the morph string registered for {@code word}; the word is passed through
+     * {@code StressUtils.unstress}, 'ґ' is folded to 'г' and the result is lower-cased first.
+     *
+     * @return the registered morph string, or {@code null} if there is none
+     */
+    public String getMorph(String word) {
+        word = StressUtils.unstress(word).replace('ґ', 'г').toLowerCase();
+        return morph.get(word);
+    }
+
+    /**
+     * Looks up the fan value registered for {@code word}; the word is passed through
+     * {@code StressUtils.unstress} and lower-cased first.
+     *
+     * @return the registered fan value, or {@code null} if there is none
+     */
+    public String getFan(String word) {
+        word = StressUtils.unstress(word).toLowerCase();
+        return fan.get(word);
+    }
+
+    /**
+     * Hashes a word over its normalized characters only: each char is translated through
+     * {@link #LETTERS_HASH}; chars beyond the table or translated to 0 are ignored.
+     *
+     * @return the 31-based polynomial hash of the kept characters, 0 for {@code null}
+     */
+    private int hash(String word) {
+        if (word == null) {
+            return 0;
+        }
+        int result = 0;
+        for (int i = 0; i < word.length(); i++) {
+            char c = word.charAt(i);
+            c = c < LETTERS_HASH.length ? LETTERS_HASH[c] : 0;
+            if (c > 0) {
+                result = 31 * result + c;
+            }
+        }
+        return result;
+    }
+}