Skip to content

Commit

Permalink
Improvement for accepting generic ARPA language-model data (2)
Browse files Browse the repository at this point in the history
data.arpa needs both a BOS (<s>) and an EOS (</s>) entry.
To avoid a libkkc crash, add fake BOS/EOS entries if data.arpa is missing
one or both of them.
  • Loading branch information
jg1uaa committed Sep 16, 2023
1 parent e84eb9c commit 6d10054
Showing 1 changed file with 25 additions and 1 deletion.
26 changes: 25 additions & 1 deletion data/templates/libkkc-data/tools/sortlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __read_tries(self):
if line.startswith("\\1-grams"):
break

unigram_count = 0
add_bos = add_eos = True
while True:
line = self.__infile.readline()
if line == "":
Expand All @@ -66,12 +66,22 @@ def __read_tries(self):
continue
strv = match.groups()
self.__vocab_keyset.push_back(strv[1])
if strv[1] in ("<s>", "<S>"):
add_bos = False
if strv[1] in ("</s>", "</S>"):
add_eos = False
if not strv[1] in ("<s>", "</s>", "<unk>", "<S>", "</S>", "<UNK>"):
if "/" not in strv[1]:
continue
(input, output) = strv[1].split("/")
self.__input_keyset.push_back(input)

# add fake BOS/EOS if file has no BOS/EOS
if add_bos:
self.__vocab_keyset.push_back("<s>")
if add_eos:
self.__vocab_keyset.push_back("</s>")

self.__vocab_trie.build(self.__vocab_keyset)
self.__input_trie.build(self.__input_keyset)

Expand Down Expand Up @@ -112,6 +122,20 @@ def __read_ngrams(self):
backoff = float(strv[2])
self.__ngram_entries[n - 1][tuple(ids)] = (cost, backoff)

# cost/backoff for fake BOS/EOS
if n == 1:
for word in ("<s>", "</s>"):
agent = marisa.Agent()
agent.set_query(word)
if not self.__vocab_trie.lookup(agent):
continue
id = tuple([agent.key_id()])
try:
self.__ngram_entries[0][id]
except KeyError:
self.__ngram_entries[0][id] = (-99, 0.0)
pass

def write(self):
self.__min_cost = -8.0
self.__write_tries()
Expand Down

0 comments on commit 6d10054

Please sign in to comment.