Commit 0ba25ab

Rob Speer committed
fixes to logprob and cromulence
1 parent b3a824f commit 0ba25ab

2 files changed: +36 −15 lines

scripts/build_search_index.py

Lines changed: 10 additions & 9 deletions
@@ -1,5 +1,5 @@
 from solvertools.wordlist import WORDS
-from solvertools.normalize import slugify
+from solvertools.normalize import slugify, sanitize
 from solvertools.util import data_path, corpus_path
 from whoosh.fields import Schema, ID, TEXT, KEYWORD, NUMERIC
 from whoosh.analysis import StandardAnalyzer
@@ -39,15 +39,16 @@ def get_adjacent(synset):
     for line in tqdm(open(data_path('wordlists/raw/big/en-wp-2word-summaries.txt')), desc='wikipedia'):
         title, summary = line.split('\t', 1)
         summary = summary.rstrip()
-        title = title.split(" (")[0]
+        title = sanitize(title.split(" (")[0])
         if title and summary:
-            slug = slugify(title)
-            writer.add_document(
-                slug=slug,
-                text=title,
-                definition=summary,
-                length=len(slug)
-            )
+            if ' ' not in title or title in WORDS:
+                slug = slugify(title)
+                writer.add_document(
+                    slug=slug,
+                    text=title,
+                    definition=summary,
+                    length=len(slug)
+                )
 except FileNotFoundError:
     print("Skipping Wikipedia search index: en-wp-2word-summaries.txt not found")
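
For context on the writer.add_document calls above: a minimal, self-contained sketch of the Whoosh indexing API this script drives, using the same field names that appear in the diff. The index directory, the exact field options, and the sample document are illustrative assumptions, not the script's actual schema or data.

import os
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, ID, TEXT, NUMERIC
from whoosh.index import create_in

# Assumed schema: mirrors the field names used by writer.add_document above,
# but the real build_search_index.py defines its own fields and paths.
schema = Schema(
    slug=ID(stored=True, unique=True),
    text=TEXT(stored=True, analyzer=StandardAnalyzer()),
    definition=TEXT(stored=True),
    length=NUMERIC(stored=True),
)

os.makedirs("demo_index", exist_ok=True)   # hypothetical index location
ix = create_in("demo_index", schema)
writer = ix.writer()
writer.add_document(slug="example", text="Example",
                    definition="A stand-in summary line.", length=7)
writer.commit()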

solvertools/wordlist.py

Lines changed: 26 additions & 6 deletions
@@ -20,7 +20,7 @@
 # that is just barely an answer, for which we use the entropy of the meta
 # answer "OUI, PAREE'S GAY". (Our probability metric considers that a worse
 # answer than "TURKMENHOWAYOLLARY" or "ATZERODT OR VOLOKH EG".)
-NULL_HYPOTHESIS_ENTROPY = -4.195522303459861
+NULL_HYPOTHESIS_ENTROPY = -4.2
 DECIBEL_SCALE = 20 / log(10)

@@ -93,20 +93,33 @@ def segment_logprob(self, slug):
         if found is None:
             return None
         freq, text = found
-        logprob = log(freq) - self.logtotal
+        logprob = (log(freq) - self.logtotal)
         return logprob, text

-    def logprob(self, word):
+    def freq(self, word):
         """
-        Get the log probability of a single item in the wordlist.
-        Always returns just a number, which is -1000 if it's not found.
+        Get the frequency of a single item in the wordlist.
+        Always returns just a number, which is 0 if it's not found.
         """
         found = self.lookup_slug(slugify(word))
         if found is None:
-            return -1000.
+            return 0.
         else:
             return found[0]

+    def logprob(self, word):
+        """
+        Get the log probability of a single word, or -1000 if it's not found.
+        """
+        if self.logtotal is None:
+            totalfreq, _ = self.lookup_slug('')
+            self.logtotal = log(totalfreq)
+        freq = self.freq(word)
+        if freq == 0.:
+            return -1000
+        logprob = log(freq) - self.logtotal
+        return logprob
+
     def text_logprob(self, text):
         """
         Get the log probability of this text, along with its most likely
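
The refactor above splits the raw frequency lookup (freq) out of logprob, lazily derives logtotal from the empty-slug row that stores the wordlist's total count, and keeps -1000 as the sentinel for unknown words. Below is a self-contained sketch of the same arithmetic over a plain dict, plus one way the result could feed the decibel-scaled cromulence score implied by NULL_HYPOTHESIS_ENTROPY and DECIBEL_SCALE; the toy frequencies and the exact cromulence normalization are assumptions, not solvertools' real formula.

from math import log

# Toy frequency table; the '' entry plays the role of lookup_slug(''),
# which stores the wordlist's total count.
FREQS = {'': 1_000_000, 'the': 60_000, 'cromulent': 12}
LOGTOTAL = log(FREQS[''])        # analogous to the lazily cached self.logtotal

NULL_HYPOTHESIS_ENTROPY = -4.2   # per-letter log prob of a barely-acceptable answer
DECIBEL_SCALE = 20 / log(10)     # natural-log difference -> decibels

def freq(word):
    """Raw frequency, 0. if the word is unknown."""
    return FREQS.get(word.lower(), 0.)

def logprob(word):
    """Log probability, or the -1000 sentinel for unknown words."""
    f = freq(word)
    if f == 0.:
        return -1000
    return log(f) - LOGTOTAL

def cromulence_sketch(word):
    """Hypothetical: per-letter log prob in decibels above the null hypothesis.
    The real solvertools formula may normalize length differently."""
    per_letter = logprob(word) / len(word)
    return (per_letter - NULL_HYPOTHESIS_ENTROPY) * DECIBEL_SCALE

print(round(logprob('the'), 3))                 # -2.813
print(logprob('embiggen'))                      # -1000
print(round(cromulence_sketch('cromulent'), 1)) # about 25.5 with these toy numbers
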
@@ -588,6 +601,13 @@ def combine_wordlists(weighted_lists, out_name):
     print("Combining %s" % weighted_lists)
     for name, weight in weighted_lists:
         for i, slug, freq, text in read_wordlist(name):
+            # Turns out that things that just barely make our cutoff from
+            # Google Books are worse than you'd think
+            if name == 'google-books':
+                freq -= 1000
+                if freq <= 0:
+                    break
+
             # Replace an existing text if this spelling of it has a solid
             # majority of the frequency so far. Avoids weirdness such as
             # spelling "THE" as "T'HE".
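
The Google Books adjustment above is a flat penalty, so it only wipes out entries that barely cleared the original frequency cutoff. A quick illustration with made-up rows, assuming (as the break implies) that read_wordlist yields entries in descending frequency order:

# Hypothetical google-books rows (slug, freq), sorted by descending frequency.
rows = [("the", 2_000_000), ("cromulence", 1_800), ("zzyzx", 1_050), ("qwzjk", 900)]

kept = []
for slug, freq in rows:
    freq -= 1000          # flat penalty for google-books entries
    if freq <= 0:
        break             # every later row is even rarer, so stop reading
    kept.append((slug, freq))

print(kept)   # [('the', 1999000), ('cromulence', 800), ('zzyzx', 50)]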
