 import numpy as np
 
+from linear_models.lm import LinearRegression
 from preprocessing.nlp import tokenize_words, ngrams
@@ -46,11 +47,19 @@ def train(self, corpus_fp, vocab=None, encoding=None):
             The path to a newline-separated text corpus file
         vocab : `preprocessing.nlp.Vocabulary` instance (default: None)
             If not `None`, only the words in `vocab` will be used to construct
-            the language model
+            the language model; all out-of-vocabulary words will either be
+            mapped to <unk> (if self.unk = True) or removed (if self.unk =
+            False).
         encoding : str (default: None)
             Specifies the text encoding for corpus. Common entries are 'utf-8',
             'utf-8-sig', 'utf-16'.
         """
+        return self._train(corpus_fp, vocab=vocab, encoding=encoding)
+
+    def _train(self, corpus_fp, vocab=None, encoding=None):
+        """
+        Actual N-gram training logic
+        """
         H = self.hyperparameters
         grams = {N: [] for N in range(1, self.N + 1)}
         counts = {N: Counter() for N in range(1, self.N + 1)}
@@ -146,13 +155,15 @@ def generate(self, N, seed_words=["<bol>"], n_sentences=5):
         words = seed_words.copy()
         while counter < n_sentences:
             nextw, probs = zip(*self.completions(words, N))
-            next_word = np.random.choice(nextw, p=np.exp(probs))
+            probs = np.exp(probs) / np.exp(probs).sum()  # renormalize probs if smoothed
+            next_word = np.random.choice(nextw, p=probs)
 
             # if we reach the end of a sentence, save it and start a new one
             if next_word == "<eol>":
                 S = " ".join([w for w in words if w != "<bol>"])
                 S = textwrap.fill(S, 90, initial_indent="", subsequent_indent=" ")
                 print(S)
+                words.append(next_word)
                 sentences.append(words)
                 words = seed_words.copy()
                 counter += 1
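The renormalization added in this hunk matters because np.random.choice requires p to sum to exactly 1, while exponentiating smoothed log-probabilities generally does not produce a normalized distribution. A minimal sketch with made-up log-probabilities:

import numpy as np

# Made-up log-probabilities for three candidate next-words; with smoothing,
# exponentiating them does not yield a distribution that sums to 1.
log_probs = np.array([-1.2, -1.9, -2.6])

raw = np.exp(log_probs)
print(raw.sum())  # ~0.53; np.random.choice(..., p=raw) would raise ValueError

probs = raw / raw.sum()  # renormalize, as the patched generate() does
next_word = np.random.choice(["the", "a", "dog"], p=probs)
print(next_word)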
@@ -161,6 +172,64 @@ def generate(self, N, seed_words=["<bol>"], n_sentences=5):
             words.append(next_word)
         return sentences
 
+    def perplexity(self, words, N):
+        """
+        Calculate the model perplexity on a sequence of words. Perplexity,
+        PP, is defined as
+
+            PP(W)     = ( 1 / p(W) ) ^ (1 / n)
+            log PP(W) = (1 / n) * log(1 / p(W))
+                      = -(1 / n) * log p(W)
+            PP(W)     = np.exp(-(1 / n) * log p(W))
+                      = np.exp(cross_entropy(W))
+
+        where n is the number of `N`-grams in W.
+
+        The higher the conditional probability of the word sequence, the lower
+        the perplexity. Thus, minimizing perplexity is equivalent to maximizing
+        the probability of `words` under the `N`-gram model.
+
+        Perplexity is equivalent to the average branching factor in predicting
+        the next word.
+
+        Parameters
+        ----------
+        N : int
+            The gram-size of the model to calculate perplexity with
+        words : list or tuple of strings
+            The sequence of words to compute perplexity on
+
+        Returns
+        -------
+        perplexity : float
+            The model perplexity for the words in `words`
+        """
+        return np.exp(self.cross_entropy(words, N))
+
+    def cross_entropy(self, words, N):
+        """
+        Calculate the model cross-entropy on a sequence of words. Cross-entropy,
+        XE, is defined as
+
+            XE(W) = -(1 / n) * log p(W)
+
+        where n is the number of N-grams in W.
+
+        Parameters
+        ----------
+        N : int
+            The gram-size of the model to calculate cross-entropy on
+        words : list or tuple of strings
+            The sequence of words to compute cross-entropy on
+
+        Returns
+        -------
+        cross_entropy : float
+            The model cross-entropy for the words in `words`
+        """
+        n_ngrams = len(ngrams(words, N))
+        return -(1 / n_ngrams) * self.log_prob(words, N)
+
     def _log_prob(self, words, N):
         """Calculate the log probability of a sequence of words under the `N`-gram model"""
         assert N in self.counts, "You do not have counts for {}-grams".format(N)
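To make the new perplexity and cross-entropy methods concrete, here is a toy calculation (the N-gram count and log-probability are made up) showing the relationship PP(W) = exp(XE(W)) used by perplexity():

import numpy as np

# Suppose a sequence contains n = 4 N-grams and the model assigns it a total
# log-probability of log p(W) = -9.2 (both numbers are invented).
n, log_p_W = 4, -9.2

cross_entropy = -(1 / n) * log_p_W  # XE(W) = -(1/n) * log p(W) = 2.3
perplexity = np.exp(cross_entropy)  # PP(W) = exp(XE(W)) ~= 9.97

# A higher sequence probability gives a lower cross-entropy and hence a lower
# perplexity, which is why minimizing perplexity maximizes p(W).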
@@ -334,7 +403,205 @@ def _log_ngram_prob(self, ngram):
         counts, n_words, n_tokens = self.counts, self.n_words[1], self.n_tokens[1]
 
         ctx = ngram[:-1]
-        ctx_count = counts[N - 1][ctx] if N > 1 else n_words
         num = counts[N][ngram] + K
+        ctx_count = counts[N - 1][ctx] if N > 1 else n_words
         den = ctx_count + K * n_tokens
         return np.log(num / den) if den != 0 else -np.inf
+
+
+class GoodTuringNGram(NGramBase):
+    def __init__(
+        self, N, conf=1.96, unk=True, filter_stopwords=True, filter_punctuation=True
+    ):
+        """
+        An N-Gram model with smoothed probabilities calculated with the simple
+        Good-Turing estimator from Gale (2001).
+
+        Parameters
+        ----------
+        N : int
+            The maximum length (in words) of the context-window to use in the
+            language model. Model will compute all n-grams from 1, ..., N
+        conf : float (default: 1.96)
+            The multiplier of the standard deviation of the empirical smoothed
+            count (the default, 1.96, corresponds to a 95% confidence
+            interval). Controls how many datapoints are smoothed using the
+            log-linear model.
+        unk : bool (default: True)
+            Whether to include the <unk> (unknown) token in the LM
+        filter_stopwords : bool (default: True)
+            Whether to remove stopwords before training
+        filter_punctuation : bool (default: True)
+            Whether to remove punctuation before training
+        """
+        super().__init__(N, unk, filter_stopwords, filter_punctuation)
+        self.hyperparameters["id"] = "GoodTuringNGram"
+        self.hyperparameters["conf"] = conf
+
+    def train(self, corpus_fp, vocab=None, encoding=None):
+        """
+        Compile the n-gram counts for the text(s) in `corpus_fp`. Upon
+        completion the `self.counts` attribute will store dictionaries of the
+        N, N-1, ..., 1-gram counts.
+
+        Parameters
+        ----------
+        corpus_fp : str
+            The path to a newline-separated text corpus file
+        vocab : `preprocessing.nlp.Vocabulary` instance (default: None)
+            If not `None`, only the words in `vocab` will be used to construct
+            the language model; all out-of-vocabulary words will either be
+            mapped to <unk> (if self.unk = True) or removed (if self.unk =
+            False).
+        encoding : str (default: None)
+            Specifies the text encoding for corpus. Common entries are 'utf-8',
+            'utf-8-sig', 'utf-16'.
+        """
+        self._train(corpus_fp, vocab=vocab, encoding=encoding)
+        self._calc_smoothed_counts()
+
+    def log_prob(self, words, N):
+        """
+        Compute the smoothed log probability of a sequence of words under the
+        `N`-gram language model with Good-Turing smoothing. For a bigram,
+        this amounts to:
+
+            P(w_i | w_{i-1}) = C* / Count(w_{i-1})
+
+        where C* is the Good-Turing smoothed estimate of the bigram count:
+
+            C* = [ (c + 1) * NumCounts(c + 1, 2) ] / NumCounts(c, 2)
+
+        where
+
+            c = Count(w_{i-1}, w_i)
+            NumCounts(r, k) = |{ k-gram : Count(k-gram) = r }|
+
+        In words, the probability of an N-gram that occurs r times in the
+        corpus is estimated by dividing up the probability mass occupied by
+        N-grams that occur r+1 times.
+
+        For large values of r, NumCounts becomes unreliable. In this case, we
+        compute a smoothed version of NumCounts using a power law function,
+        log(NumCounts(r)) = a * log(r) + b.
+
+        Under the Good-Turing estimator, the total probability assigned to
+        unseen N-grams is equal to the relative occurrence of N-grams that
+        appear only once.
+
+        Parameters
+        ----------
+        words : list of strings
+            A sequence of words
+        N : int
+            The gram-size of the language model to use when calculating the log
+            probabilities of the sequence
+
+        Returns
+        -------
+        total_prob : float
+            The total log-probability of the sequence `words` under the
+            `N`-gram language model
+        """
+        return self._log_prob(words, N)
+
+    def _calc_smoothed_counts(self):
+        use_interp = False
+        counts = self.counts
+        NC = self._num_grams_with_count
+        conf = self.hyperparameters["conf"]
+
+        totals = {N: 0 for N in range(1, self.N + 1)}
+        smooth_counts = {N: {} for N in range(1, self.N + 1)}
+
+        # calculate the probability of all <unk> (i.e., unseen) n-grams
+        self._p0 = {n: NC(1, n) / sum(counts[n].values()) for n in range(1, self.N + 1)}
+
+        # fit log-linear models for predicting smoothed counts in absence of
+        # real data
+        self._fit_count_models()
+
+        LM = self._count_models
+        for N in range(1, self.N + 1):
+            for C in sorted(set(counts[N].values())):
+
+                # estimate the interpolated count using the log-linear model
+                c1_lm = np.exp(LM[N].predict(np.c_[np.log(C + 1)])).item()
+                c0_lm = np.exp(LM[N].predict(np.c_[np.log(C)])).item()
+                count_interp = ((C + 1) * c1_lm) / c0_lm
+
+                # if we have previously been using the interpolated count, or
+                # if the number of occurrences of C+1 is 0, use the interpolated
+                # count as the smoothed count value C*
+                c1, c0 = NC(C + 1, N), NC(C, N)
+                if use_interp or c1 == 0:
+                    use_interp = True
+                    smooth_counts[N][C] = count_interp
+                    totals[N] += c0 * smooth_counts[N][C]
+                    continue
+
+                # estimate the smoothed count C* empirically if the number of
+                # terms with count C + 1 > 0
+                count_emp = ((C + 1) * c1) / c0
+
+                # compute the approximate variance of the empirical smoothed
+                # count C* given C
+                t = conf * np.sqrt((C + 1) ** 2 * (c1 / c0 ** 2) * (1 + c1 / c0))
+
+                # if the difference between the empirical and interpolated
+                # smoothed counts is greater than t, the empirical estimate
+                # tends to be more accurate. otherwise, use interpolated
+                if np.abs(count_interp - count_emp) > t:
+                    smooth_counts[N][C] = count_emp
+                    totals[N] += c0 * smooth_counts[N][C]
+                    continue
+
+                use_interp = True
+                smooth_counts[N][C] = count_interp
+                totals[N] += c0 * smooth_counts[N][C]
+
+        self._smooth_totals = totals
+        self._smooth_counts = smooth_counts
+
+    def _log_ngram_prob(self, ngram):
+        """Return the smoothed log probability of the ngram"""
+        N = len(ngram)
+        sc, T = self._smooth_counts[N], self._smooth_totals[N]
+        n_tokens, n_seen = self.n_tokens[N], len(self.counts[N])
+
+        # approx. prob of an out-of-vocab ngram (i.e., a fraction of p0)
+        n_unseen = max((n_tokens ** N) - n_seen, 1)
+        prob = np.log(self._p0[N] / n_unseen)
+
+        if ngram in self.counts[N]:
+            C = self.counts[N][ngram]
+            prob = np.log(1 - self._p0[N]) + np.log(sc[C]) - np.log(T)
+        return prob
+
+    def _fit_count_models(self):
+        """
+        Perform the averaging transform proposed by Church and Gale (1991):
+        estimate the expected count-of-counts by the *density* of
+        count-of-count values.
+        """
+        self._count_models = {}
+        NC = self._num_grams_with_count
+        for N in range(1, self.N + 1):
+            X, Y = [], []
+            sorted_counts = sorted(set(self.counts[N].values()))  # r
+
+            for ix, j in enumerate(sorted_counts):
+                i = 0 if ix == 0 else sorted_counts[ix - 1]
+                k = 2 * j - i if ix == len(sorted_counts) - 1 else sorted_counts[ix + 1]
+                y = 2 * NC(j, N) / (k - i)
+                X.append(j)
+                Y.append(y)
+
+            # fit log-linear model: log(counts) ~ log(average_transform(counts))
+            self._count_models[N] = LinearRegression(fit_intercept=True)
+            self._count_models[N].fit(np.log(X), np.log(Y))
+            b, a = self._count_models[N].beta
+
+            if a > -1:
+                fstr = "[Warning] Log-log averaging transform has slope > -1 for N={}"
+                print(fstr.format(N))
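As a toy sketch of the simple Good-Turing update that GoodTuringNGram implements (all counts below are made up; the real class additionally falls back to the log-linear fit from _fit_count_models when NumCounts(c + 1) is zero or unreliable):

# Made-up bigram count-of-counts: NC[r] = number of distinct bigrams that
# occur exactly r times in the corpus.
NC = {1: 100, 2: 40, 3: 20, 4: 10}

# Good-Turing smoothed count for a bigram observed c = 2 times:
#   C* = (c + 1) * NumCounts(c + 1, 2) / NumCounts(c, 2) = 3 * 20 / 40 = 1.5
c = 2
c_star = (c + 1) * NC[c + 1] / NC[c]

# Probability mass reserved for unseen bigrams is the relative frequency of
# bigrams that occur exactly once: p0 = NumCounts(1, 2) / (total bigram tokens)
total = sum(r * n_r for r, n_r in NC.items())  # 1*100 + 2*40 + 3*20 + 4*10 = 280
p0 = NC[1] / total                             # = 100 / 280 ~= 0.357

print(c_star, p0)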