Commit 701c69e

add good-turing smoothing
1 parent afdaf0a commit 701c69e

File tree: 2 files changed (+299 -10 lines)

ngram/ngram.py

Lines changed: 270 additions & 3 deletions
@@ -7,6 +7,7 @@
 
 import numpy as np
 
+from linear_models.lm import LinearRegression
 from preprocessing.nlp import tokenize_words, ngrams
 
 
@@ -46,11 +47,19 @@ def train(self, corpus_fp, vocab=None, encoding=None):
             The path to a newline-separated text corpus file
         vocab : `preprocessing.nlp.Vocabulary` instance (default: None)
             If not `None`, only the words in `vocab` will be used to construct
-            the language model
+            the language model; all out-of-vocabulary words will either be
+            mapped to <unk> (if self.unk = True) or removed (if self.unk =
+            False).
         encoding : str (default: None)
             Specifies the text encoding for corpus. Common entries are 'utf-8',
             'utf-8-sig', 'utf-16'.
         """
+        return self._train(corpus_fp, vocab=vocab, encoding=encoding)
+
+    def _train(self, corpus_fp, vocab=None, encoding=None):
+        """
+        Actual N-gram training logic
+        """
         H = self.hyperparameters
         grams = {N: [] for N in range(1, self.N + 1)}
         counts = {N: Counter() for N in range(1, self.N + 1)}
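The out-of-vocabulary behaviour described in the `vocab` docstring above amounts to a simple filter-or-replace step. The sketch below is a standalone illustration with a toy vocabulary and token list; it does not use the library's actual `Vocabulary` class or the commit's internal helpers:

# hypothetical illustration of the documented OOV handling (not from the commit)
vocab = {"the", "cat", "sat"}          # stand-in for a Vocabulary instance
tokens = ["the", "dog", "sat"]
unk = True

if unk:
    # map out-of-vocabulary words to <unk>
    tokens = [w if w in vocab else "<unk>" for w in tokens]
else:
    # drop out-of-vocabulary words entirely
    tokens = [w for w in tokens if w in vocab]

print(tokens)  # ['the', '<unk>', 'sat'] when unk is True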
@@ -146,13 +155,15 @@ def generate(self, N, seed_words=["<bol>"], n_sentences=5):
         words = seed_words.copy()
         while counter < n_sentences:
             nextw, probs = zip(*self.completions(words, N))
-            next_word = np.random.choice(nextw, p=np.exp(probs))
+            probs = np.exp(probs) / np.exp(probs).sum()  # renormalize probs if smoothed
+            next_word = np.random.choice(nextw, p=probs)
 
             # if we reach the end of a sentence, save it and start a new one
             if next_word == "<eol>":
                 S = " ".join([w for w in words if w != "<bol>"])
                 S = textwrap.fill(S, 90, initial_indent="", subsequent_indent="   ")
                 print(S)
+                words.append(next_word)
                 sentences.append(words)
                 words = seed_words.copy()
                 counter += 1
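A note on the renormalization added above: smoothed models reserve probability mass for unseen continuations, so the exponentiated log probabilities returned by `completions` need not sum exactly to one, and `np.random.choice` rejects such a vector. A minimal standalone sketch with made-up numbers:

import numpy as np

candidates = ["the", "a", "<unk>"]
log_probs = np.log([0.5, 0.3, 0.1])   # hypothetical smoothed log-probs; exp sums to 0.9

# np.random.choice(candidates, p=np.exp(log_probs)) would raise
# "probabilities do not sum to 1"; renormalizing fixes this
probs = np.exp(log_probs) / np.exp(log_probs).sum()
next_word = np.random.choice(candidates, p=probs)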
@@ -161,6 +172,64 @@ def generate(self, N, seed_words=["<bol>"], n_sentences=5):
             words.append(next_word)
         return sentences
 
+    def perplexity(self, words, N):
+        """
+        Calculate the model perplexity on a sequence of words. Perplexity,
+        PP, is defined as
+
+            PP(W)     = ( 1 / p(W) ) ^ (1 / n)
+            log PP(W) = (1 / n) * log(1 / p(W))
+                      = -(1 / n) * log p(W)
+            PP(W)     = np.exp(-(1 / n) * log p(W))
+                      = np.exp(cross_entropy(W))
+
+        where n is the number of `N`-grams in W.
+
+        The higher the conditional probability of the word sequence, the lower
+        the perplexity. Thus, minimizing perplexity is equivalent to maximizing
+        the probability of `words` under the `N`-gram model.
+
+        Perplexity is equivalent to the average branching factor in predicting
+        the next word.
+
+        Parameters
+        ----------
+        N : int
+            The gram-size of the model to calculate perplexity with
+        words : list or tuple of strings
+            The sequence of words to compute perplexity on
+
+        Returns
+        -------
+        perplexity : float
+            The model perplexity for the words in `words`
+        """
+        return np.exp(self.cross_entropy(words, N))
+
+    def cross_entropy(self, words, N):
+        """
+        Calculate the model cross-entropy on a sequence of words.
+        Cross-entropy, XE, is defined as
+
+            XE(W) = -(1 / n) * log p(W)
+
+        where n is the number of N-grams in W.
+
+        Parameters
+        ----------
+        N : int
+            The gram-size of the model to calculate cross-entropy on
+        words : list or tuple of strings
+            The sequence of words to compute cross-entropy on
+
+        Returns
+        -------
+        cross_entropy : float
+            The model cross-entropy for the words in `words`
+        """
+        n_ngrams = len(ngrams(words, N))
+        return -(1 / n_ngrams) * self.log_prob(words, N)
+
     def _log_prob(self, words, N):
         """Calculate the log probability of a sequence of words under the `N`-gram model"""
         assert N in self.counts, "You do not have counts for {}-grams".format(N)
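The perplexity/cross-entropy identity documented in the new methods above can be checked numerically. The sketch below uses toy numbers and is independent of any trained model:

import numpy as np

# suppose the model assigns probability 1/8 to each of n = 4 N-grams in W,
# so log p(W) = 4 * log(1/8)
n = 4
log_p_W = n * np.log(1 / 8)

cross_entropy = -(1 / n) * log_p_W     # = log(8) ~ 2.079 nats
perplexity = np.exp(cross_entropy)     # = 8.0, the average branching factor

assert np.isclose(perplexity, 8.0)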
@@ -334,7 +403,205 @@ def _log_ngram_prob(self, ngram):
         counts, n_words, n_tokens = self.counts, self.n_words[1], self.n_tokens[1]
 
         ctx = ngram[:-1]
-        ctx_count = counts[N - 1][ctx] if N > 1 else n_words
         num = counts[N][ngram] + K
+        ctx_count = counts[N - 1][ctx] if N > 1 else n_words
         den = ctx_count + K * n_tokens
         return np.log(num / den) if den != 0 else -np.inf
+
+
+class GoodTuringNGram(NGramBase):
+    def __init__(
+        self, N, conf=1.96, unk=True, filter_stopwords=True, filter_punctuation=True
+    ):
+        """
+        An N-Gram model with smoothed probabilities calculated with the simple
+        Good-Turing estimator from Gale (2001).
+
+        Parameters
+        ----------
+        N : int
+            The maximum length (in words) of the context-window to use in the
+            language model. Model will compute all n-grams from 1, ..., N
+        conf : float (default: 1.96)
+            The multiplier of the standard deviation of the empirical smoothed
+            count (the default, 1.96, corresponds to a 95% confidence
+            interval). Controls how many datapoints are smoothed using the
+            log-linear model.
+        unk : bool (default: True)
+            Whether to include the <unk> (unknown) token in the LM
+        filter_stopwords : bool (default: True)
+            Whether to remove stopwords before training
+        filter_punctuation : bool (default: True)
+            Whether to remove punctuation before training
+        """
+        super().__init__(N, unk, filter_stopwords, filter_punctuation)
+        self.hyperparameters["id"] = "GoodTuringNGram"
+        self.hyperparameters["conf"] = conf
+
+    def train(self, corpus_fp, vocab=None, encoding=None):
+        """
+        Compile the n-gram counts for the text(s) in `corpus_fp`. Upon
+        completion the `self.counts` attribute will store dictionaries of the
+        N, N-1, ..., 1-gram counts.
+
+        Parameters
+        ----------
+        corpus_fp : str
+            The path to a newline-separated text corpus file
+        vocab : `preprocessing.nlp.Vocabulary` instance (default: None)
+            If not `None`, only the words in `vocab` will be used to construct
+            the language model; all out-of-vocabulary words will either be
+            mapped to <unk> (if self.unk = True) or removed (if self.unk =
+            False).
+        encoding : str (default: None)
+            Specifies the text encoding for corpus. Common entries are 'utf-8',
+            'utf-8-sig', 'utf-16'.
+        """
+        self._train(corpus_fp, vocab=vocab, encoding=encoding)
+        self._calc_smoothed_counts()
+
+    def log_prob(self, words, N):
+        """
+        Compute the smoothed log probability of a sequence of words under the
+        `N`-gram language model with Good-Turing smoothing. For a bigram,
+        this amounts to:
+
+            P(w_i | w_{i-1}) = C* / Count(w_{i-1})
+
+        where C* is the Good-Turing smoothed estimate of the bigram count:
+
+            C* = [ (c + 1) * NumCounts(c + 1, 2) ] / NumCounts(c, 2)
+
+        where
+
+            c = Count(w_{i-1}, w_i)
+            NumCounts(r, k) = |{ k-gram : Count(k-gram) = r }|
+
+        In words, the probability of an N-gram that occurs r times in the
+        corpus is estimated by dividing up the probability mass occupied by
+        N-grams that occur r+1 times.
+
+        For large values of r, NumCounts becomes unreliable. In this case, we
+        compute a smoothed version of NumCounts using a power law function,
+        log(NumCounts(r)) = a * log(r) + b.
+
+        Under the Good-Turing estimator, the total probability assigned to
+        unseen N-grams is equal to the relative occurrence of N-grams that
+        appear only once.
+
+        Parameters
+        ----------
+        words : list of strings
+            A sequence of words
+        N : int
+            The gram-size of the language model to use when calculating the log
+            probabilities of the sequence
+
+        Returns
+        -------
+        total_prob : float
+            The total log-probability of the sequence `words` under the
+            `N`-gram language model
+        """
+        return self._log_prob(words, N)
+
+    def _calc_smoothed_counts(self):
+        use_interp = False
+        counts = self.counts
+        NC = self._num_grams_with_count
+        conf = self.hyperparameters["conf"]
+
+        totals = {N: 0 for N in range(1, self.N + 1)}
+        smooth_counts = {N: {} for N in range(1, self.N + 1)}
+
+        # calculate the probability of all <unk> (i.e., unseen) n-grams
+        self._p0 = {n: NC(1, n) / sum(counts[n].values()) for n in range(1, self.N + 1)}
+
+        # fit log-linear models for predicting smoothed counts in absence of
+        # real data
+        self._fit_count_models()
+
+        LM = self._count_models
+        for N in range(1, self.N + 1):
+            for C in sorted(set(counts[N].values())):
+
+                # estimate the interpolated count using the log-linear model
+                c1_lm = np.exp(LM[N].predict(np.c_[np.log(C + 1)])).item()
+                c0_lm = np.exp(LM[N].predict(np.c_[np.log(C)])).item()
+                count_interp = ((C + 1) * c1_lm) / c0_lm
+
+                # if we have previously been using the interpolated count, or
+                # if the number of occurrences of C+1 is 0, use the interpolated
+                # count as the smoothed count value C*
+                c1, c0 = NC(C + 1, N), NC(C, N)
+                if use_interp or c1 == 0:
+                    use_interp = True
+                    smooth_counts[N][C] = count_interp
+                    totals[N] += c0 * smooth_counts[N][C]
+                    continue
+
+                # estimate the smoothed count C* empirically if the number of
+                # terms with count C + 1 > 0
+                count_emp = ((C + 1) * c1) / c0
+
+                # compute the approximate variance of the empirical smoothed
+                # count C* given C
+                t = conf * np.sqrt((C + 1) ** 2 * (c1 / c0 ** 2) * (1 + c1 / c0))
+
+                # if the difference between the empirical and interpolated
+                # smoothed counts is greater than t, the empirical estimate
+                # tends to be more accurate. otherwise, use interpolated
+                if np.abs(count_interp - count_emp) > t:
+                    smooth_counts[N][C] = count_emp
+                    totals[N] += c0 * smooth_counts[N][C]
+                    continue
+
+                use_interp = True
+                smooth_counts[N][C] = count_interp
+                totals[N] += c0 * smooth_counts[N][C]
+
+        self._smooth_totals = totals
+        self._smooth_counts = smooth_counts
+
+    def _log_ngram_prob(self, ngram):
+        """Return the smoothed log probability of the ngram"""
+        N = len(ngram)
+        sc, T = self._smooth_counts[N], self._smooth_totals[N]
+        n_tokens, n_seen = self.n_tokens[N], len(self.counts[N])
+
+        # approx. prob of an out-of-vocab ngram (i.e., a fraction of p0)
+        n_unseen = max((n_tokens ** N) - n_seen, 1)
+        prob = np.log(self._p0[N] / n_unseen)
+
+        if ngram in self.counts[N]:
+            C = self.counts[N][ngram]
+            prob = np.log(1 - self._p0[N]) + np.log(sc[C]) - np.log(T)
+        return prob
+
+    def _fit_count_models(self):
+        """
+        Perform the averaging transform proposed by Church and Gale (1991):
+        estimate the expected count-of-counts by the *density* of
+        count-of-count values.
+        """
+        self._count_models = {}
+        NC = self._num_grams_with_count
+        for N in range(1, self.N + 1):
+            X, Y = [], []
+            sorted_counts = sorted(set(self.counts[N].values()))  # r
+
+            for ix, j in enumerate(sorted_counts):
+                i = 0 if ix == 0 else sorted_counts[ix - 1]
+                k = 2 * j - i if ix == len(sorted_counts) - 1 else sorted_counts[ix + 1]
+                y = 2 * NC(j, N) / (k - i)
+                X.append(j)
+                Y.append(y)
+
+            # fit log-linear model: log(counts) ~ log(average_transform(counts))
+            self._count_models[N] = LinearRegression(fit_intercept=True)
+            self._count_models[N].fit(np.log(X), np.log(Y))
+            b, a = self._count_models[N].beta
+
+            if a > -1:
+                fstr = "[Warning] Log-log averaging transform has slope > -1 for N={}"
+                print(fstr.format(N))
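To make the Good-Turing recipe documented in the class above concrete, here is a self-contained sketch of the unsmoothed estimator on a toy bigram table. It uses only the formulas stated in the `log_prob` docstring; the class additionally falls back to the log-linear fit of NumCounts when high counts become unreliable, which is omitted here. The counts are made up:

from collections import Counter

# toy bigram counts (made-up data)
bigram_counts = Counter({("a", "b"): 3, ("a", "c"): 1, ("b", "a"): 1, ("c", "a"): 2})

# NumCounts(r): number of distinct bigrams observed exactly r times
num_counts = Counter(bigram_counts.values())   # {3: 1, 1: 2, 2: 1}

# Good-Turing adjusted count for a bigram seen c times:
#     C* = (c + 1) * NumCounts(c + 1) / NumCounts(c)
c = 1
c_star = (c + 1) * num_counts[c + 1] / num_counts[c]   # 2 * 1 / 2 = 1.0

# total probability mass reserved for unseen bigrams equals the relative
# frequency of bigrams that occur exactly once
p0 = num_counts[1] / sum(bigram_counts.values())       # 2 / 7 ~ 0.286

print(c_star, p0)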
