project_foreign_words_to_english_brown_clusters.py
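"""Project English Brown clusters onto foreign words via word-alignment probabilities.

For every foreign word in the alignment table, pick the most likely English
translation that has a Brown cluster and copy that cluster, writing the result in
the same three-column format as the input clusters file
(cluster <TAB> word <TAB> frequency), with a made-up frequency of 10.

Expected input formats (see the inline comments below):
  * --input-english-brown-clusters: tab-delimited lines "cluster<TAB>english_word<TAB>frequency"
  * --input-alignment-probs: space-delimited lines "foreign_word english_word p(english_word|foreign_word)"
"""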
import io
import sys
import argparse
from collections import defaultdict
# parse/validate arguments
argparser = argparse.ArgumentParser()
# TODO(wammar): implement the "robust projection" method of Guo et al. (ACL 2015), which assigns "OOV" foreign words to the cluster of edit-1-distant words (see the sketch at the end of this file).
argparser.add_argument("-ib", "--input-english-brown-clusters", required=True, help="input english brown clusters")
argparser.add_argument("-ob", "--output-foreign-brown-clusters", required=True, help="output foreign brown clusters")
argparser.add_argument("-ia", "--input-alignment-probs", required=True, help="input alignment probabilities; each line is formatted: foreign_word english_word p(english_word|foreign_word)")
args = argparser.parse_args()
# read brown clusters
english_to_cluster = {}
with io.open(args.input_english_brown_clusters, encoding='utf8') as english_clusters:
  for line in english_clusters:
    # each line consists of three tab-delimited fields: brown cluster, english word, frequency
    splits = line.rstrip('\n').split('\t')
    if len(splits) != 3:
      print('the following line in the clusters file is malformatted:', line)
      sys.exit(1)
    cluster, english, frequency = splits[0], splits[1], splits[2]
    english_to_cluster[english] = cluster
# read word alignment probabilities
alignments = defaultdict(lambda: defaultdict(float))
with io.open(args.input_alignment_probs, encoding='utf8') as alignments_file:
  for line in alignments_file:
    # each line consists of three space-delimited fields: foreign word, english word, alignment probability
    splits = line.strip().split()
    if len(splits) != 3:
      print('the following line in the alignments file is malformatted:', line)
      sys.exit(1)
    foreign, english, prob = splits[0], splits[1], float(splits[2])
    alignments[foreign][english] = prob
# for each foreign word, write the brown cluster of the most likely english translation
# (among translations for which we have a brown cluster)
with io.open(args.output_foreign_brown_clusters, encoding='utf8', mode='w') as output_file:
  for foreign in alignments.keys():
    best_cluster = ''
    best_prob = 0.0
    for english, prob in alignments[foreign].items():
      # skip english translations for which we don't have an assigned cluster
      if english not in english_to_cluster: continue
      # skip english translations which are less likely than the best one found so far
      if prob < best_prob: continue
      # otherwise, update best_cluster and best_prob
      best_cluster, best_prob = english_to_cluster[english], prob
    # skip foreign words we can't project
    if len(best_cluster) == 0: continue
    # write the projected cluster of this foreign word, with a made-up frequency of 10
    output_file.write(u'{}\t{}\t{}\n'.format(best_cluster, foreign, 10))
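
# --- Sketch (not wired into the script above): the "robust projection" mentioned in the TODO ---
# The TODO near the top references Guo et al. (ACL 2015), where foreign words missing from the
# alignment table ("OOV" words) are assigned the cluster of an edit-distance-1 neighbor that did
# receive a projected cluster. The two helpers below are only an illustrative sketch of that idea;
# the function names and the brute-force neighbor search are assumptions, not part of the original
# script.
def within_one_edit(a, b):
  """Return True if strings a and b are at most one insertion/deletion/substitution apart."""
  if a == b:
    return True
  if abs(len(a) - len(b)) > 1:
    return False
  if len(a) > len(b):
    a, b = b, a  # make a the shorter (or equal-length) string
  i = j = 0
  edited = False
  while i < len(a) and j < len(b):
    if a[i] != b[j]:
      if edited:
        return False
      edited = True
      if len(a) == len(b):
        i += 1  # substitution: advance both strings
      j += 1    # insertion/deletion: only advance the longer string
    else:
      i += 1
      j += 1
  return True

def robust_cluster_for_oov(oov_foreign_word, foreign_to_cluster):
  """Return the projected cluster of some edit-distance-1 neighbor of oov_foreign_word, or None."""
  for known_word, cluster in foreign_to_cluster.items():
    if within_one_edit(oov_foreign_word, known_word):
      return cluster
  return None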