-
Notifications
You must be signed in to change notification settings - Fork 84
/
speech_lex_missing.py
executable file
·123 lines (90 loc) · 2.46 KB
/
speech_lex_missing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2016, 2017 Guenter Bartsch
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
# compute the most frequent words (up to NUM_WORDS) used in submissions that are missing from the lexicon
#
import os
import sys
import logging
import traceback
import curses
import curses.textpad
import locale
import codecs
from optparse import OptionParser
from nltools import misc
from nltools.tokenizer import tokenize
from nltools.phonetics import ipa2xsampa, xsampa2ipa
from speech_transcripts import Transcripts
from speech_lexicon import Lexicon
# cut-off for the list of most frequent missing words printed at the end
NUM_WORDS = 50

# verbose output is enabled only when '-v' is the sole command line argument
verbose = sys.argv[1:] == ['-v']

logging.basicConfig(level=logging.DEBUG)

#
# init terminal
#

misc.init_app('speech_lex_missing')

#
# load transcripts
#

transcripts = Transcripts()

#
# load lexicon
#

lex = Lexicon()
#
# find missing words
#
missing = {}  # word -> number of occurrences in the prompts examined below

# progress counters (position / total); the counting itself does not use them
num = len(transcripts)
cnt = 0

# examined transcripts with / without at least one word absent from the lexicon
num_ts_lacking = 0
num_ts_complete = 0

for cfn in transcripts:

    entry = transcripts[cfn]
    cnt += 1

    # transcripts with a quality above 0 are not examined
    # NOTE(review): presumably quality > 0 marks already reviewed entries - confirm
    if entry['quality'] > 0:
        continue

    # every token of the prompt that the lexicon does not contain (duplicates kept)
    unknown = [token for token in tokenize(entry['prompt']) if token not in lex]

    for token in unknown:
        missing[token] = missing.get(token, 0) + 1

    if unknown:
        num_ts_lacking += 1
    else:
        num_ts_complete += 1
cnt = 0
for item in reversed(sorted(missing.items(), key=lambda x: x[1])):
cnt += 1
if verbose:
print u"Missing %4d times: %s" % (item[1], item[0])
else:
print item[0].encode('utf8'),
if cnt > NUM_WORDS:
break
if verbose:
print
print "%d missing words total. %d submissions lack at least one word, %d are covered fully by the lexicon." % (len(missing), num_ts_lacking, num_ts_complete)
print