WorksApplications · mh-northlander · Nov 19, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 18, 2024
diff --git a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2024 Works Applications Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.worksap.nlp.sudachi;
+
+import java.util.List;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import com.worksap.nlp.sudachi.dictionary.CharacterCategory;
+import com.worksap.nlp.sudachi.dictionary.Grammar;
+import com.worksap.nlp.sudachi.dictionary.GrammarImpl;
+
+/**
+ * A text normalizer.
+ *
+ * Normalization is done by applying a list of {@link InputTextPlugin}s, in
+ * order, to the given text and returning the rewritten text.
+ */
+public class TextNormalizer {
+    private final Grammar grammar;
+    private final List<InputTextPlugin> inputTextPlugins;
+
+    /**
+     * Create a TextNormalizer from a grammar and input text plugins.
+     *
+     * Grammar must have a
+     * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}. The plugins
+     * are used as given; this constructor does not call
+     * {@link InputTextPlugin#setUp} on them.
+     *
+     * @param grammar
+     *            the grammar handed to the input text builder
+     * @param inputTextPlugins
+     *            the plugins to apply, in list order. The list is copied, so
+     *            later changes to the passed list do not affect this instance
+     * @throws IllegalArgumentException
+     *             if the grammar has no CharacterCategory
+     * @throws NullPointerException
+     *             if {@code grammar} or {@code inputTextPlugins} is null
+     */
+    public TextNormalizer(Grammar grammar, List<InputTextPlugin> inputTextPlugins) {
+        this.grammar = requireCharacterCategory(grammar);
+        if (inputTextPlugins == null) {
+            throw new NullPointerException("inputTextPlugins must not be null.");
+        }
+        // defensive copy: do not share a mutable list with the caller
+        this.inputTextPlugins = new ArrayList<>(inputTextPlugins);
+    }
+
+    /**
+     * Create a TextNormalizer from a grammar.
+     *
+     * Grammar must have a
+     * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}.
+     * {@link DefaultInputTextPlugin} will be used.
+     *
+     * @param grammar
+     *            the grammar used to set up the plugin and to build input text
+     * @throws IOException
+     *             if the default plugin cannot be configured or set up
+     * @throws IllegalArgumentException
+     *             if the grammar has no CharacterCategory
+     * @throws NullPointerException
+     *             if {@code grammar} is null
+     */
+    public TextNormalizer(Grammar grammar) throws IOException {
+        // arguments are evaluated left to right: the grammar is validated
+        // before any plugin is instantiated or set up with it
+        this(requireCharacterCategory(grammar), setupDefaultInputTextPlugins(grammar));
+    }
+
+    /**
+     * Create a default TextNormalizer that uses default
+     * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory} and
+     * {@link DefaultInputTextPlugin}.
+     *
+     * @return a new TextNormalizer backed by a fresh {@link GrammarImpl}
+     * @throws IOException
+     *             if the default character category or plugin cannot be loaded
+     */
+    public static TextNormalizer defaultTextNormalizer() throws IOException {
+        Grammar grammar = new GrammarImpl();
+        grammar.setCharacterCategory(CharacterCategory.loadDefault());
+        return new TextNormalizer(grammar);
+    }
+
+    /**
+     * Create TextNormalizer based on the {@link JapaneseDictionary}.
+     *
+     * The grammar and the input text plugins of the dictionary are used.
+     *
+     * @param dictionary
+     *            the dictionary to take the grammar and plugins from
+     * @return a new TextNormalizer
+     * @throws IllegalArgumentException
+     *             if the grammar of the dictionary has no CharacterCategory
+     */
+    public static TextNormalizer fromDictionary(JapaneseDictionary dictionary) {
+        return new TextNormalizer(dictionary.getGrammar(), dictionary.inputTextPlugins);
+    }
+
+    /**
+     * Check the precondition on the grammar and return it unchanged.
+     *
+     * @throws NullPointerException
+     *             if {@code grammar} is null
+     * @throws IllegalArgumentException
+     *             if the grammar has no CharacterCategory
+     */
+    private static Grammar requireCharacterCategory(Grammar grammar) {
+        if (grammar == null) {
+            throw new NullPointerException("grammar must not be null.");
+        }
+        if (grammar.getCharacterCategory() == null) {
+            throw new IllegalArgumentException("grammar for TextNormalizer must have CharacterCategory.");
+        }
+        return grammar;
+    }
+
+    /**
+     * Setup {@link DefaultInputTextPlugin} using a grammar.
+     *
+     * The plugin is declared through a JSON config string and resolved against
+     * the classpath anchor, then set up with the given grammar.
+     */
+    private static List<InputTextPlugin> setupDefaultInputTextPlugins(Grammar grammar) throws IOException {
+        PathAnchor anchor = PathAnchor.classpath();
+        List<Config.PluginConf<InputTextPlugin>> pconfs = Config.fromJsonString(
+                "{\"inputTextPlugin\":[{\"class\":\"com.worksap.nlp.sudachi.DefaultInputTextPlugin\"}]}", anchor)
+                .getInputTextPlugins();
+
+        List<InputTextPlugin> plugins = new ArrayList<>();
+        for (Config.PluginConf<InputTextPlugin> pconf : pconfs) {
+            InputTextPlugin p = pconf.instantiate(anchor);
+            p.setUp(grammar);
+            plugins.add(p);
+        }
+
+        return plugins;
+    }
+
+    /**
+     * Normalize given text.
+     *
+     * @param text
+     *            the text to normalize
+     * @return the text after every input text plugin has rewritten it
+     */
+    public String normalize(CharSequence text) {
+        UTF8InputTextBuilder builder = new UTF8InputTextBuilder(text, grammar);
+        for (InputTextPlugin plugin : inputTextPlugins) {
+            plugin.rewrite(builder);
+        }
+        UTF8InputText input = builder.build();
+        return input.getText();
+    }
+}
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java
@@ -17,6 +17,7 @@
 package com.worksap.nlp.sudachi.dictionary;
 
 import com.worksap.nlp.sudachi.Config;
+import com.worksap.nlp.sudachi.PathAnchor;
 
 import java.io.*;
 import java.nio.charset.StandardCharsets;
@@ -157,4 +158,9 @@ public static CharacterCategory load(Config.Resource<CharacterCategory> resource
             return result;
         });
     }
+
+    public static CharacterCategory loadDefault() throws IOException {
+        Config.Resource<CharacterCategory> defaultResource = PathAnchor.classpath().resource("char.def");
+        return load(defaultResource);
+    }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
@@ -25,17 +25,19 @@
 import java.io.Console;
 
 import com.worksap.nlp.sudachi.WordId;
+import com.worksap.nlp.sudachi.TextNormalizer;
 import com.worksap.nlp.sudachi.SudachiCommandLine.FileOrStdoutPrintStream;
 
 public class DictionaryPrinter {
     private final PrintStream output;
+    private final TextNormalizer textNormalizer;
     private final GrammarImpl grammar;
     private final LexiconSet lexicon;
     private final List<String> posStrings;
     private final boolean isUser;
     private final int entrySize;
 
-    DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) {
+    DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) throws IOException {
         if (dic.getDictionaryHeader().isUserDictionary() && base == null) {
             throw new IllegalArgumentException("System dictionary is required to print user dictionary");
         }
@@ -57,6 +59,10 @@ public class DictionaryPrinter {
             }
         }
 
+        // set default char category for text normalizer
+        grammar.setCharacterCategory(CharacterCategory.loadDefault());
+        textNormalizer = new TextNormalizer(grammar);
+
         List<String> poss = new ArrayList<>();
         for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) {
             poss.add(String.join(",", grammar.getPartOfSpeechString(pid)));
@@ -86,7 +92,7 @@ private void printEntry(int wordId) {
         short cost = lexicon.getCost(wordId);
         WordInfo wordInfo = lexicon.getWordInfo(wordId);
 
-        field(maybeEscapeString(wordInfo.getSurface()));
+        field(maybeEscapeString(textNormalizer.normalize(wordInfo.getSurface())));
         field(leftId);
         field(rightId);
         field(cost);

diff --git a/src/test/dict/lex.csv b/src/test/dict/lex.csv
@@ -35,5 +35,5 @@
 いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,*
 いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,*
 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,2478,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,*
-特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,*
+特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,*
 な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,A,*,*,*,*
diff --git a/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt b/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2022 Works Applications Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.worksap.nlp.sudachi
+
+import com.worksap.nlp.sudachi.dictionary.CharacterCategory
+import com.worksap.nlp.sudachi.dictionary.GrammarImpl
+import kotlin.test.*
+
+/** Tests for [TextNormalizer]: its construction paths and the text it produces. */
+class TextNormalizerTest {
+
+  // dictionary built from the user2 test config, with the default character
+  // definition supplied explicitly
+  private val dic =
+      DictionaryFactory()
+          .create(TestDictionary.user2Cfg().characterDefinition(CharacterCategory.loadDefault()))
+          as JapaneseDictionary
+
+  /** Every public way of creating a TextNormalizer completes without throwing. */
+  @Test
+  fun instantiation() {
+    TextNormalizer.fromDictionary(dic)
+    TextNormalizer(dic.getGrammar())
+    TextNormalizer(dic.getGrammar(), dic.inputTextPlugins)
+    TextNormalizer.defaultTextNormalizer()
+  }
+
+  /** A grammar without a character category is rejected. */
+  @Test
+  fun failToInstantiateWithoutCharCategory() {
+    val grammar = GrammarImpl()
+    assertFails { TextNormalizer(grammar) }
+  }
+
+  /** The default normalizer applies the DefaultInputTextPlugin rewriting. */
+  @Test
+  fun normalizeText() {
+    val tn = TextNormalizer.defaultTextNormalizer()
+
+    // from DefaultInputTextPlugin test
+    assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂＢΓД㈱ｶﾞウ゛⼼Ⅲ"))
+  }
+
+  /** A normalizer created from a dictionary applies that dictionary's plugins. */
+  @Test
+  fun normalizeTextWithDefaultConfig() {
+    // will use default config, which has InputTextPlugins of
+    // [Default, ProlongedSoundMark, IgnoreYomigana]
+    val tn = TextNormalizer.fromDictionary(dic)
+
+    assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂＢΓД㈱ｶﾞウ゛⼼Ⅲ")) // default
+    assertEquals("うわーい", tn.normalize("うわーーーい")) // prolonged sound mark
+    assertEquals("小鳥遊", tn.normalize("小鳥遊（タカナシ）")) // ignore yomigana
+  }
+}
diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.java
@@ -75,6 +75,7 @@ public void printWithSystemDict() throws IOException {
         }
         assertThat(actuals.length, is(40));
         assertThat(actuals[0], is("た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,*"));
+        assertThat(actuals[37], is("特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,*"));
     }
 
     @Test

diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv
@@ -35,6 +35,6 @@
 いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,*
 いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,*
 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,*
-特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,*
+特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,*
 隠し,-1,-1,0,隠し,名詞,普通名詞,一般,*,*,*,カクシ,隠し,*,A,*,*,*,*
 な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,C,11,11,*,*
-Original file line number
+Diff line change
@@ Expand Up / @@ -75,6 +75,7 @@ public void printWithSystemDict() throws IOException { @@
             }
             assertThat(actuals.length, is(40));
             assertThat(actuals[0], is("た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,*"));
+            assertThat(actuals[37], is("特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,*"));
         }
         @Test
@@ Expand Down @@