Commit 0ba25ab

Rob Speer committed
fixes to logprob and cromulence
1 parent b3a824f commit 0ba25ab

2 files changed: +36 −15 lines

scripts/build_search_index.py

Lines changed: 10 additions & 9 deletions
@@ -1,5 +1,5 @@
 from solvertools.wordlist import WORDS
-from solvertools.normalize import slugify
+from solvertools.normalize import slugify, sanitize
 from solvertools.util import data_path, corpus_path
 from whoosh.fields import Schema, ID, TEXT, KEYWORD, NUMERIC
 from whoosh.analysis import StandardAnalyzer
@@ -39,15 +39,16 @@ def get_adjacent(synset):
     for line in tqdm(open(data_path('wordlists/raw/big/en-wp-2word-summaries.txt')), desc='wikipedia'):
         title, summary = line.split('\t', 1)
         summary = summary.rstrip()
-        title = title.split(" (")[0]
+        title = sanitize(title.split(" (")[0])
         if title and summary:
-            slug = slugify(title)
-            writer.add_document(
-                slug=slug,
-                text=title,
-                definition=summary,
-                length=len(slug)
-            )
+            if ' ' not in title or title in WORDS:
+                slug = slugify(title)
+                writer.add_document(
+                    slug=slug,
+                    text=title,
+                    definition=summary,
+                    length=len(slug)
+                )
 except FileNotFoundError:
     print("Skipping Wikipedia search index: en-wp-2word-summaries.txt not found")
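
For context on the writer.add_document calls above: a minimal, self-contained sketch of the Whoosh indexing API this script drives, using the same field names that appear in the diff. The index directory, the exact field options, and the sample document are illustrative assumptions, not the script's actual schema or data.

import os
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, ID, TEXT, NUMERIC
from whoosh.index import create_in

# Assumed schema: mirrors the field names used by writer.add_document above,
# but the real build_search_index.py defines its own fields and paths.
schema = Schema(
    slug=ID(stored=True, unique=True),
    text=TEXT(stored=True, analyzer=StandardAnalyzer()),
    definition=TEXT(stored=True),
    length=NUMERIC(stored=True),
)

os.makedirs("demo_index", exist_ok=True)   # hypothetical index location
ix = create_in("demo_index", schema)
writer = ix.writer()
writer.add_document(slug="example", text="Example",
                    definition="A stand-in summary line.", length=7)
writer.commit()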

solvertools/wordlist.py

Lines changed: 26 additions & 6 deletions
@@ -20,7 +20,7 @@
 # that is just barely an answer, for which we use the entropy of the meta
 # answer "OUI, PAREE'S GAY". (Our probability metric considers that a worse
 # answer than "TURKMENHOWAYOLLARY" or "ATZERODT OR VOLOKH EG".)
-NULL_HYPOTHESIS_ENTROPY = -4.195522303459861
+NULL_HYPOTHESIS_ENTROPY = -4.2
 DECIBEL_SCALE = 20 / log(10)

@@ -93,20 +93,33 @@ def segment_logprob(self, slug):
         if found is None:
             return None
         freq, text = found
-        logprob = log(freq) - self.logtotal
+        logprob = (log(freq) - self.logtotal)
         return logprob, text

-    def logprob(self, word):
+    def freq(self, word):
         """
-        Get the log probability of a single item in the wordlist.
-        Always returns just a number, which is -1000 if it's not found.
+        Get the frequency of a single item in the wordlist.
+        Always returns just a number, which is 0 if it's not found.
         """
         found = self.lookup_slug(slugify(word))
         if found is None:
-            return -1000.
+            return 0.
         else:
             return found[0]

+    def logprob(self, word):
+        """
+        Get the log probability of a single word, or -1000 if it's not found.
+        """
+        if self.logtotal is None:
+            totalfreq, _ = self.lookup_slug('')
+            self.logtotal = log(totalfreq)
+        freq = self.freq(word)
+        if freq == 0.:
+            return -1000
+        logprob = log(freq) - self.logtotal
+        return logprob
+
     def text_logprob(self, text):
         """
         Get the log probability of this text, along with its most likely
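
The refactor above splits the raw frequency lookup (freq) out of logprob, lazily derives logtotal from the empty-slug row that stores the wordlist's total count, and keeps -1000 as the sentinel for unknown words. Below is a self-contained sketch of the same arithmetic over a plain dict, plus one way the result could feed the decibel-scaled cromulence score implied by NULL_HYPOTHESIS_ENTROPY and DECIBEL_SCALE; the toy frequencies and the exact cromulence normalization are assumptions, not solvertools' real formula.

from math import log

# Toy frequency table; the '' entry plays the role of lookup_slug(''),
# which stores the wordlist's total count.
FREQS = {'': 1_000_000, 'the': 60_000, 'cromulent': 12}
LOGTOTAL = log(FREQS[''])        # analogous to the lazily cached self.logtotal

NULL_HYPOTHESIS_ENTROPY = -4.2   # per-letter log prob of a barely-acceptable answer
DECIBEL_SCALE = 20 / log(10)     # natural-log difference -> decibels

def freq(word):
    """Raw frequency, 0. if the word is unknown."""
    return FREQS.get(word.lower(), 0.)

def logprob(word):
    """Log probability, or the -1000 sentinel for unknown words."""
    f = freq(word)
    if f == 0.:
        return -1000
    return log(f) - LOGTOTAL

def cromulence_sketch(word):
    """Hypothetical: per-letter log prob in decibels above the null hypothesis.
    The real solvertools formula may normalize length differently."""
    per_letter = logprob(word) / len(word)
    return (per_letter - NULL_HYPOTHESIS_ENTROPY) * DECIBEL_SCALE

print(round(logprob('the'), 3))                 # -2.813
print(logprob('embiggen'))                      # -1000
print(round(cromulence_sketch('cromulent'), 1)) # about 25.5 with these toy numbers
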
@@ -588,6 +601,13 @@ def combine_wordlists(weighted_lists, out_name):
     print("Combining %s" % weighted_lists)
     for name, weight in weighted_lists:
         for i, slug, freq, text in read_wordlist(name):
+            # Turns out that things that just barely make our cutoff from
+            # Google Books are worse than you'd think
+            if name == 'google-books':
+                freq -= 1000
+                if freq <= 0:
+                    break
+
             # Replace an existing text if this spelling of it has a solid
             # majority of the frequency so far. Avoids weirdness such as
             # spelling "THE" as "T'HE".
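
The Google Books adjustment above is a flat penalty, so it only wipes out entries that barely cleared the original frequency cutoff. A quick illustration with made-up rows, assuming (as the break implies) that read_wordlist yields entries in descending frequency order:

# Hypothetical google-books rows (slug, freq), sorted by descending frequency.
rows = [("the", 2_000_000), ("cromulence", 1_800), ("zzyzx", 1_050), ("qwzjk", 900)]

kept = []
for slug, freq in rows:
    freq -= 1000          # flat penalty for google-books entries
    if freq <= 0:
        break             # every later row is even rarer, so stop reading
    kept.append((slug, freq))

print(kept)   # [('the', 1999000), ('cromulence', 800), ('zzyzx', 50)]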
