wiktionary_sequitur_gen.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2018 Guenter Bartsch
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
# use the pre-trained wiktionary sequitur model to generate candidate lex entries,
# validate them against our regular sequitur model, add ones that match to our lex
#
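#
# pipeline overview (summarized from the code below):
#
#   DICTFN ("word;ipa" pairs extracted from wiktionary)
#     -> chunked input files (CHUNKINRFN: tokens, CHUNKINWFN: wiktionary IPA)
#     -> parallel g2p.py runs with REGULAR_MODEL and WIKTIONARY_MODEL
#        (outputs: CHUNKOUTRFN / CHUNKOUTWFN)
#     -> merge_check() merges and validates the two predictions per token
#     -> OUTDICTFN (accepted entries), OUTREJFN (rejected entries)
#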
import os
import sys
import string
import codecs
import logging
import traceback
from optparse import OptionParser
from nltools import misc
from speech_lexicon import Lexicon
from nltools.sequiturclient import sequitur_gen_ipa, sequitur_gen_ipa_multi
from nltools.phonetics import xsampa2ipa
PROC_TITLE = 'wiktionary_sequitur_gen'
DICTFN = 'data/dst/speech/de/dict_wiktionary_de.txt'
OUTDICTFN = 'data/dst/speech/de/dict_wiktionary_gen.txt'
OUTREJFN = 'data/dst/speech/de/dict_wiktionary_rej.txt'
CHUNKINRFN = 'data/dst/speech/de/dict_wiktionary_chinr_%04d.txt'
CHUNKINWFN = 'data/dst/speech/de/dict_wiktionary_chinw_%04d.txt'
CHUNKOUTRFN = 'data/dst/speech/de/dict_wiktionary_choutr_%04d.txt'
CHUNKOUTWFN = 'data/dst/speech/de/dict_wiktionary_choutw_%04d.txt'
SCRIPTFN = 'data/dst/speech/de/dict_wiktionary_run_parallel.sh'
REGULAR_MODEL = 'data/models/sequitur-dict-de.ipa-latest'
WIKTIONARY_MODEL = 'data/dst/speech/de/wiktionary_sequitur/model-6'
TEST_TOKEN = u'aalbestand'
# CHUNK_SIZE = 1000
CHUNK_SIZE = 256
DEFAULT_NUM_CPUS = 4
# DEBUG_CHUNK_LIMIT = 12
DEBUG_CHUNK_LIMIT = 0
ALPHABET = set([ u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'o', u'p', u'q', u'r', u's', u't', u'u', u'v', u'w', u'x', u'y', u'z', u'ü', u'ö', u'ä', u'ß'])
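# illustrative example: the wiktionary headword u'Aal-Bestand' would be
# normalized to the token u'aalbestand' by the alphabet filter below
# (lowercased, all characters outside ALPHABET dropped)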
#
# init
#
misc.init_app(PROC_TITLE)
#
# commandline parsing
#
parser = OptionParser("usage: %prog [options]")

parser.add_option ("-F", "--filter", dest="filter_file", type="str",
                   help="limit extraction to tokens listed in this file, default: no filtering")
parser.add_option ("-n", "--num-cpus", dest="num_cpus", type="int", default=DEFAULT_NUM_CPUS,
                   help="number of cpus to use in parallel, default: %d" % DEFAULT_NUM_CPUS)
parser.add_option ("-v", "--verbose", action="store_true", dest="verbose",
                   help="enable verbose logging")
(options, args) = parser.parse_args()
if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)
#
# load config, set up global variables
#
config = misc.load_config ('.speechrc')
wikfn = config.get("speech", "wiktionary_de")
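# the .speechrc config is expected to provide the wiktionary dump location,
# roughly like this (the path value is a placeholder):
#
#   [speech]
#   wiktionary_de = /path/to/dewiktionary-dump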
#                     IPA_R  IPA_W
TOLERATABLE_DIFFS = [ (u'p', u'b'),
                      (u'b', u'p'),
                      (u'ɛ', u'ə'),
                      (u'ə', u'ɛ'),
                      (u'e', u'ɛ'),
                      (u'ɛ', u'e'),
                      (u'r', u'ʁ'),
                      (u'ʁ', u'r'),
                      (u'ɐ', u'ʁ'),
                      (u'ʁ', u'ɐ'),
                      (u'd', u't'),
                      (u't', u'd'),
                      (u'z', u's'),
                      (u's', u'z'),
                      (u'ʊ', u'u'),
                      (u'u', u'ʊ'),
                    ]
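# each pair (IPA_R, IPA_W) means: where the regular model predicts IPA_R and
# the wiktionary model predicts IPA_W, the difference is tolerated and the
# merged result keeps the wiktionary-side character (see merge_check below)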
def merge_check(token, ipa_r, ipa_w):
    """ merge ipa_r and ipa_w into a "best-of" ipa_m:
        - take hyphenation from ipa_r since wiktionary does not have hyphens
        - take stress from ipa_w unless there is no stress marker in ipa_w, then use ipa_r
        - tolerate vowel length differences
        - tolerate the differences listed in TOLERATABLE_DIFFS above """

    ir       = 0
    iw       = 0
    ipa_m    = u""
    stress_w = u"'" in ipa_w

    while (ir < len(ipa_r)) and (iw < len(ipa_w)):

        # print ir, iw, ipa_m

        # stress marker in ipa_r: keep it only if ipa_w carries no stress marker
        if ipa_r[ir] == u"'":
            ir += 1
            if not stress_w:
                ipa_m += u"'"
            continue

        # vowel length: tolerate a length marker on either side, keep ipa_w's
        if ipa_r[ir] == u"ː" and ipa_w[iw] != u"ː":
            ir += 1
            continue
        if ipa_r[ir] != u"ː" and ipa_w[iw] == u"ː":
            iw += 1
            ipa_m += u"ː"
            continue

        # glottal stop: tolerate on either side, keep ipa_w's
        if ipa_r[ir] == u"ʔ" and ipa_w[iw] != u"ʔ":
            ir += 1
            continue
        if ipa_r[ir] != u"ʔ" and ipa_w[iw] == u"ʔ":
            iw += 1
            ipa_m += u"ʔ"
            continue

        # hyphenation comes from ipa_r only
        if ipa_r[ir] == u"-":
            ir += 1
            ipa_m += u"-"
            continue

        # stress marker in ipa_w: keep it
        if ipa_w[iw] == u"'":
            iw += 1
            if stress_w:
                ipa_m += u"'"
            continue

        # any remaining difference must be listed in TOLERATABLE_DIFFS
        if ipa_w[iw] != ipa_r[ir]:
            tolerate = False
            for tr, tw in TOLERATABLE_DIFFS:
                if (ipa_r[ir] == tr) and (ipa_w[iw] == tw):
                    tolerate = True
                    break
            if not tolerate:
                break

        ipa_m += ipa_w[iw]
        ir += 1
        iw += 1

    # accept only if both transcriptions were consumed completely
    if ir == len(ipa_r) and iw == len(ipa_w):
        if token == TEST_TOKEN:
            print token
            print ipa_r
            print ipa_w
            print ipa_m
        return ipa_m

    return None
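# illustrative examples (toy symbols, not from the original source; traced
# against the logic above): hyphenation comes from ipa_r, stress from ipa_w
# if present, and any untolerated difference rejects the pair:
#
#   merge_check(u'x', u"'a-b", u"ab")   -> u"'a-b"   (no stress in ipa_w: keep ipa_r's)
#   merge_check(u'x', u"'a-b", u"a'b")  -> u"a-'b"   (stress position taken from ipa_w)
#   merge_check(u'x', u"'a-b", u"ac")   -> None      (b vs c is not tolerated)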
#
# load lexicon
#
logging.info("loading lexicon...")
lex = Lexicon('dict-de.ipa')
logging.info("loading lexicon...done.")
#
# read filter
#
filter_set = None
if options.filter_file:
    logging.info("reading filter file %s ..." % options.filter_file)
    filter_set = set()
    with codecs.open(options.filter_file, 'r', 'utf8') as filterf:
        for line in filterf:
            filter_set.add(line.strip())
#
# load wiktionary
#
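# DICTFN lines are expected to be "word;ipa" pairs, e.g. (hypothetical entry):
#
#   Aalbestand;ˈaːlbəˌʃtant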
logging.info("loading wiktionary...")
wiktionary = {}
wiktionary_reverse = {}
with codecs.open(DICTFN, 'r', 'utf8') as dictf:
    for line in dictf:
        parts = line.strip().split(';')
        if len(parts) != 2:
            # print "Failed to parse line %s" % line.strip()
            continue

        word = parts[0]
        ipa  = parts[1].strip()

        # skip multi-word entries and entries containing wiki markup
        if u" " in word:
            continue
        if u"''" in word:
            continue

        # normalize the headword to a lowercase token over ALPHABET
        token = u''
        for c in word.lower():
            if c in ALPHABET:
                token += c

        if token in lex:
            logging.debug("%05d ignoring %s as it is already in our dict." % (len(wiktionary), token))
            continue

        if filter_set and not (token in filter_set):
            logging.debug("%05d ignoring %s as it is not in the filter file." % (len(wiktionary), token))
            continue

        wiktionary[token] = (word, ipa)
        wiktionary_reverse[ipa] = token
logging.info("loading wiktionary... done. %d entries." % len(wiktionary))
#
# predict missing entries
#
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
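# e.g. list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]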
logging.info ('predicting missing entries in parallel...')

num_chunks = 0

with open(SCRIPTFN, 'w') as scriptf:

    for i, chunk in enumerate(chunks(sorted(wiktionary), CHUNK_SIZE)):

        # write one grapheme input file and one wiktionary-ipa input file per chunk
        with codecs.open(CHUNKINRFN % i, 'w', 'utf8') as chunkinrf:
            with codecs.open(CHUNKINWFN % i, 'w', 'utf8') as chunkinwf:
                for token in chunk:
                    w, ipa = wiktionary[token]
                    chunkinrf.write('%s\n' % token)
                    chunkinwf.write('%s\n' % ipa)

        # two background g2p jobs per chunk, one per model
        scriptf.write('echo %04d\n' % i)
        scriptf.write('g2p.py -e utf-8 --model %s --apply %s > %s &\n' % (REGULAR_MODEL,    CHUNKINRFN % i, CHUNKOUTRFN % i))
        scriptf.write('g2p.py -e utf-8 --model %s --apply %s > %s &\n' % (WIKTIONARY_MODEL, CHUNKINWFN % i, CHUNKOUTWFN % i))

        num_chunks += 1
        if DEBUG_CHUNK_LIMIT and num_chunks > DEBUG_CHUNK_LIMIT:
            logging.warn('debug limit reached.')
            break

        # throttle: let the background jobs finish after every num_cpus chunks
        if i % options.num_cpus == (options.num_cpus-1):
            scriptf.write('wait\n')

    scriptf.write('wait\n')

logging.info ('%s written.' % SCRIPTFN)

os.system('chmod 700 %s' % SCRIPTFN)
os.system(SCRIPTFN)
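# for illustration, the generated shell script contains one block like this per
# chunk (hypothetical chunk 0, paths abbreviated), plus a 'wait' line after
# every num_cpus chunks:
#
#   echo 0000
#   g2p.py -e utf-8 --model data/models/sequitur-dict-de.ipa-latest --apply .../dict_wiktionary_chinr_0000.txt > .../dict_wiktionary_choutr_0000.txt &
#   g2p.py -e utf-8 --model .../wiktionary_sequitur/model-6 --apply .../dict_wiktionary_chinw_0000.txt > .../dict_wiktionary_choutw_0000.txt &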
ipa_r_map = {}
ipa_w_map = {}

for chunkidx in range(num_chunks):

    # regular model output, expected format per line: token TAB x-sampa
    with codecs.open(CHUNKOUTRFN % chunkidx, 'r', 'utf8') as chunkf:
        for line in chunkf:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
            try:
                word = parts[0]
                if word in wiktionary:
                    xs  = parts[1]
                    ipa = xsampa2ipa(word, xs)
                    ipa_r_map[word] = ipa
            except:
                logging.error("Error processing line %s:" % line)
                logging.error(traceback.format_exc())

    # wiktionary model output, expected format per line: wiktionary-ipa TAB x-sampa;
    # map the ipa back to its token via wiktionary_reverse
    with codecs.open(CHUNKOUTWFN % chunkidx, 'r', 'utf8') as chunkf:
        for line in chunkf:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
            try:
                ipan = parts[0]
                if ipan in wiktionary_reverse:
                    word = wiktionary_reverse[ipan]
                    xs   = parts[1]
                    ipa  = xsampa2ipa(word, xs)
                    ipa_w_map[word] = ipa
            except:
                logging.error("Error processing line %s:" % line)
                logging.error(traceback.format_exc())
# print repr(ipa_r_map)
# print repr(ipa_w_map)
with codecs.open(OUTDICTFN, 'w', 'utf8') as outdictf, \
     codecs.open(OUTREJFN,  'w', 'utf8') as outrejf:

    cnt_matched = 0
    cnt         = 0

    for token in ipa_r_map:

        if not token in ipa_w_map:
            continue

        cnt += 1

        try:
            ipa_r = ipa_r_map[token]
            ipa_w = ipa_w_map[token]

            ipa_m = merge_check(token, ipa_r, ipa_w)

            # at least one stress marker is required
            if ipa_m and (not u"'" in ipa_m):
                ipa_m = None

            if ipa_m:
                logging.info("%6d/%6d %6d %-30s: %s vs %s MATCHED!" % (cnt, len(wiktionary), cnt_matched, token, ipa_r, ipa_w))
                cnt_matched += 1
                outdictf.write(u"%s;%s\n" % (token, ipa_m))
            else:
                logging.info("%6d/%6d %6d %-30s: %s vs %s" % (cnt, len(wiktionary), cnt_matched, token, ipa_r, ipa_w))
                outrejf.write(u"\n%s\nIPA_R %s\nIPA_W %s\n" % (token, ipa_r.replace(u"-", u""), ipa_w))
        except:
            logging.error(traceback.format_exc())

logging.info (" %s written." % OUTDICTFN)
logging.info (" %s written." % OUTREJFN)
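# usage sketch (assumes a prepared .speechrc, the sequitur models above and
# g2p.py on the PATH; tokens.txt is a hypothetical filter file):
#
#   ./wiktionary_sequitur_gen.py -n 8
#   ./wiktionary_sequitur_gen.py -F tokens.txt -v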