From 49aab7e65846d71f3efe9978f68402a3ab02e3e5 Mon Sep 17 00:00:00 2001 From: huangruizhe Date: Sun, 2 Jan 2022 00:14:27 -0800 Subject: [PATCH 1/5] Update make_kn_lm.py Fixed issue #163 --- icefall/shared/make_kn_lm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/icefall/shared/make_kn_lm.py b/icefall/shared/make_kn_lm.py index 58b721d219..a52e8da711 100755 --- a/icefall/shared/make_kn_lm.py +++ b/icefall/shared/make_kn_lm.py @@ -165,7 +165,9 @@ def cal_discounting_constants(self): n1 += stat[1] n2 += stat[2] assert n1 + 2 * n2 > 0 - self.d.append(n1 * 1.0 / (n1 + 2 * n2)) + self.d.append(max(0.001, n1 * 1.0) / (n1 + 2 * n2)) # We are doing this max(0.001, xxx) to avoid zero discounting constant D, + # which could happen if the number of symbols is small and all w in the vocab + # has been seen after certain h. This can cause division by zero in computing BOW. def cal_f(self): # f(a_z) is a probability distribution of word sequence a_z. From 0a67015d63ff28566a5956185709f0bfb958744e Mon Sep 17 00:00:00 2001 From: huangruizhe Date: Sun, 2 Jan 2022 00:27:27 -0800 Subject: [PATCH 2/5] Update make_kn_lm.py --- icefall/shared/make_kn_lm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/icefall/shared/make_kn_lm.py b/icefall/shared/make_kn_lm.py index a52e8da711..8d0170f8b8 100755 --- a/icefall/shared/make_kn_lm.py +++ b/icefall/shared/make_kn_lm.py @@ -165,9 +165,9 @@ def cal_discounting_constants(self): n1 += stat[1] n2 += stat[2] assert n1 + 2 * n2 > 0 - self.d.append(max(0.001, n1 * 1.0) / (n1 + 2 * n2)) # We are doing this max(0.001, xxx) to avoid zero discounting constant D, - # which could happen if the number of symbols is small and all w in the vocab - # has been seen after certain h. This can cause division by zero in computing BOW. 
+ self.d.append(max(0.001, n1 * 1.0) / (n1 + 2 * n2)) # We are doing this max(0.001, xxx) to avoid zero discounting constant D due to n1=0, + # which could happen if the number of symbols is small. + # Otherwise, zero discounting constant can cause division by zero in computing BOW. def cal_f(self): # f(a_z) is a probability distribution of word sequence a_z. From 82c8fac6ee898dc479e4e665dd0b69d26129dba6 Mon Sep 17 00:00:00 2001 From: huangruizhe Date: Sun, 2 Jan 2022 15:29:50 -0800 Subject: [PATCH 3/5] fixed a case where BOW can have problem to compute (ZeroDivisionError) --- icefall/shared/make_kn_lm.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/icefall/shared/make_kn_lm.py b/icefall/shared/make_kn_lm.py index 8d0170f8b8..fe514e52bf 100755 --- a/icefall/shared/make_kn_lm.py +++ b/icefall/shared/make_kn_lm.py @@ -165,9 +165,9 @@ def cal_discounting_constants(self): n1 += stat[1] n2 += stat[2] assert n1 + 2 * n2 > 0 - self.d.append(max(0.001, n1 * 1.0) / (n1 + 2 * n2)) # We are doing this max(0.001, xxx) to avoid zero discounting constant D due to n1=0, - # which could happen if the number of symbols is small. - # Otherwise, zero discounting constant can cause division by zero in computing BOW. + self.d.append(max(0.1, n1 * 1.0) / (n1 + 2 * n2)) # We are doing this max(0.1, xxx) to avoid zero discounting constant D due to n1=0, + # which could happen if the number of symbols is small. + # Otherwise, zero discounting constant can cause division by zero in computing BOW. def cal_f(self): # f(a_z) is a probability distribution of word sequence a_z. 
@@ -243,7 +243,10 @@ def cal_bow(self): for u in a_counts_for_hist.word_to_count.keys(): # Should be careful here: what is Z1 sum_z1_f_z += _counts_for_hist.word_to_f[u] - counts_for_hist.word_to_bow[w] = (1.0 - sum_z1_f_a_z) / (1.0 - sum_z1_f_z) + if 1.0 - sum_z1_f_z == 0: + counts_for_hist.word_to_bow[w] = (1.0 - sum_z1_f_a_z) / (1.0 - sum_z1_f_z) + else: + counts_for_hist.word_to_bow[w] = None def print_raw_counts(self, info_string): # these are useful for debug. From 7577b08bed580bd2d73802c8db50f0a4cadaa61f Mon Sep 17 00:00:00 2001 From: huangruizhe Date: Sun, 2 Jan 2022 23:32:43 -0800 Subject: [PATCH 4/5] fixed the mistake --- icefall/shared/make_kn_lm.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/icefall/shared/make_kn_lm.py b/icefall/shared/make_kn_lm.py index fe514e52bf..a8b58dc300 100755 --- a/icefall/shared/make_kn_lm.py +++ b/icefall/shared/make_kn_lm.py @@ -243,7 +243,10 @@ def cal_bow(self): for u in a_counts_for_hist.word_to_count.keys(): # Should be careful here: what is Z1 sum_z1_f_z += _counts_for_hist.word_to_f[u] - if 1.0 - sum_z1_f_z == 0: + # I assume the following to be true: + # assert sum_z1_f_z <= 1.0 + # assert sum_z1_f_a_z <= 1.0 + if sum_z1_f_z < 1: counts_for_hist.word_to_bow[w] = (1.0 - sum_z1_f_a_z) / (1.0 - sum_z1_f_z) else: counts_for_hist.word_to_bow[w] = None From 298faabb90d52e04a586c911bef75902a3aa3c2b Mon Sep 17 00:00:00 2001 From: huangruizhe Date: Sun, 2 Jan 2022 23:38:33 -0800 Subject: [PATCH 5/5] minor fixes --- icefall/shared/make_kn_lm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/icefall/shared/make_kn_lm.py b/icefall/shared/make_kn_lm.py index a8b58dc300..c2edd823ee 100755 --- a/icefall/shared/make_kn_lm.py +++ b/icefall/shared/make_kn_lm.py @@ -243,10 +243,8 @@ def cal_bow(self): for u in a_counts_for_hist.word_to_count.keys(): # Should be careful here: what is Z1 sum_z1_f_z += _counts_for_hist.word_to_f[u] - # I assume the following to be true: - # assert sum_z1_f_z 
<= 1.0 - # assert sum_z1_f_a_z <= 1.0 if sum_z1_f_z < 1: + # assert sum_z1_f_a_z < 1 counts_for_hist.word_to_bow[w] = (1.0 - sum_z1_f_a_z) / (1.0 - sum_z1_f_z) else: counts_for_hist.word_to_bow[w] = None