|
20 | 20 | # that is just barely an answer, for which we use the entropy of the meta
|
21 | 21 | # answer "OUI, PAREE'S GAY". (Our probability metric considers that a worse
|
22 | 22 | # answer than "TURKMENHOWAYOLLARY" or "ATZERODT OR VOLOKH EG".)
|
23 |
| -NULL_HYPOTHESIS_ENTROPY = -4.195522303459861 |
| 23 | +NULL_HYPOTHESIS_ENTROPY = -4.2 |
24 | 24 | DECIBEL_SCALE = 20 / log(10)
|
25 | 25 |
|
26 | 26 |
|
@@ -93,20 +93,33 @@ def segment_logprob(self, slug):
|
93 | 93 | if found is None:
|
94 | 94 | return None
|
95 | 95 | freq, text = found
|
96 |
| - logprob = log(freq) - self.logtotal |
| 96 | + logprob = (log(freq) - self.logtotal) |
97 | 97 | return logprob, text
|
98 | 98 |
|
99 |
| - def logprob(self, word): |
| 99 | + def freq(self, word): |
100 | 100 | """
|
101 |
| - Get the log probability of a single item in the wordlist. |
102 |
| - Always returns just a number, which is -1000 if it's not found. |
| 101 | + Get the frequency of a single item in the wordlist. |
| 102 | + Always returns just a number, which is 0 if it's not found. |
103 | 103 | """
|
104 | 104 | found = self.lookup_slug(slugify(word))
|
105 | 105 | if found is None:
|
106 |
| - return -1000. |
| 106 | + return 0. |
107 | 107 | else:
|
108 | 108 | return found[0]
|
109 | 109 |
|
| 110 | + def logprob(self, word): |
| 111 | + """ |
| 112 | + Get the log probability of a single word, or -1000 if it's not found. |
| 113 | + """ |
| 114 | + if self.logtotal is None: |
| 115 | + totalfreq, _ = self.lookup_slug('') |
| 116 | + self.logtotal = log(totalfreq) |
| 117 | + freq = self.freq(word) |
| 118 | + if freq == 0.: |
| 119 | + return -1000 |
| 120 | + logprob = log(freq) - self.logtotal |
| 121 | + return logprob |
| 122 | + |
110 | 123 | def text_logprob(self, text):
|
111 | 124 | """
|
112 | 125 | Get the log probability of this text, along with its most likely
|
@@ -588,6 +601,13 @@ def combine_wordlists(weighted_lists, out_name):
|
588 | 601 | print("Combining %s" % weighted_lists)
|
589 | 602 | for name, weight in weighted_lists:
|
590 | 603 | for i, slug, freq, text in read_wordlist(name):
|
| 604 | + # Turns out that things that just barely make our cutoff from |
| 605 | + # Google Books are worse than you'd think, so discount every Google Books frequency by 1000. |
| 606 | + if name == 'google-books': |
| 607 | + freq -= 1000 |
| 608 | + if freq <= 0: |
| 609 | + break |
| 610 | + |
591 | 611 | # Replace an existing text if this spelling of it has a solid
|
592 | 612 | # majority of the frequency so far. Avoids weirdness such as
|
593 | 613 | # spelling "THE" as "T'HE".
|
|
0 commit comments