-
Notifications
You must be signed in to change notification settings - Fork 3
/
induce-code-switches-in-conll.py
66 lines (62 loc) · 2.99 KB
/
induce-code-switches-in-conll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import io
import argparse
from collections import defaultdict
# Induce code switches in a CoNLL file: tokens of language l1 are replaced by
# their l2 dictionary translations, either only where the resulting
# (previous l1 word, l2 word) bigram was observed in code-switched text, or,
# when no such text is given, wherever the dictionary has a translation.
# The share of replaced tokens is capped by --frequency.
#
# The code below is written to run unchanged on Python 2.7 and Python 3.


def parse_args(argv=None):
    """Parse the command line.

    argv -- list of argument strings; None lets argparse read sys.argv[1:].

    Returns the argparse namespace with the attributes
    input_conll_filename, output_conll_filename, dictionary_filename,
    code_switched_filename, frequency, l1 and l2.
    """
    argparser = argparse.ArgumentParser(
        description="Replace l1 tokens of a CoNLL file with l2 dictionary "
                    "translations to induce code switching.")
    argparser.add_argument(
        "-i", "--input_conll_filename", required=True,
        help="tab-separated CoNLL file to read; the token is the 2nd column")
    argparser.add_argument(
        "-o", "--output_conll_filename", required=True,
        help="CoNLL file to write")
    argparser.add_argument(
        "-d", "--dictionary_filename", required=True,
        help="dictionary file with one 'l1_word ||| l2_word' pair per line")
    argparser.add_argument(
        "-c", "--code_switched_filename",
        help="optional space-tokenized code-switched text used to restrict "
             "replacements to observed l1->l2 bigrams")
    argparser.add_argument(
        "-f", "--frequency", type=float, default=1.0,
        help="upper bound on the fraction of tokens replaced (default: 1.0)")
    argparser.add_argument(
        "-l1", default='en',
        help="prefix of l1 tokens in the code-switched text (default: en)")
    argparser.add_argument(
        "-l2", default='es',
        help="prefix of l2 tokens in the code-switched text (default: es)")
    return argparser.parse_args(argv)


def load_dictionary(dictionary_lines):
    """Build the l1 -> [l2 translations] map from 'l1 ||| l2' lines.

    dictionary_lines -- iterable of text lines (e.g. an open file).

    Blank lines are skipped. Translations keep the order in which they
    appear. Returns (dictionary, entry_count) where dictionary is a
    defaultdict(list) and entry_count is the number of pairs read.

    Raises ValueError for a non-blank line that does not consist of exactly
    two ' ||| '-separated fields.
    """
    dictionary = defaultdict(list)
    dictionary_size = 0
    for line_number, line in enumerate(dictionary_lines, 1):
        entry = line.strip()
        if not entry:
            continue
        fields = entry.split(' ||| ')
        if len(fields) != 2:
            raise ValueError(
                'dictionary line {}: expected "l1 ||| l2", got {!r}'.format(
                    line_number, entry))
        l1_word, l2_word = fields
        dictionary[l1_word].append(l2_word)
        dictionary_size += 1
    return dictionary, dictionary_size


def load_l1_to_l2_transitions(code_switched_lines, l1, l2):
    """Map each l1 token to the set of l2 tokens seen directly after it.

    code_switched_lines -- iterable of text lines, tokens separated by ' '.
    l1, l2 -- string prefixes; a token belongs to a language when it starts
              with that language's prefix.

    Returns (l1_word_to_l2_next_words, transition_count) where the first is a
    defaultdict(set) and the second counts every l1->l2 adjacent pair
    (repeats included).
    """
    l1_word_to_l2_next_words = defaultdict(set)
    l1_to_l2_transitions = 0
    for line in code_switched_lines:
        tokens = line.strip().split(' ')
        for current, following in zip(tokens, tokens[1:]):
            if current.startswith(l1) and following.startswith(l2):
                l1_word_to_l2_next_words[current].add(following)
                l1_to_l2_transitions += 1
    return l1_word_to_l2_next_words, l1_to_l2_transitions


def pick_l2_translation(token, previous_word, dictionary,
                        l1_word_to_l2_next_words, require_seen_bigram):
    """Return the l2 word that should replace token, or None to keep it.

    token -- the current CoNLL token.
    previous_word -- the original (unreplaced) previous token of the
                     sentence, or '' at the start of a sentence.
    dictionary -- mapping l1 word -> list of l2 translations.
    l1_word_to_l2_next_words -- mapping l1 word -> set of l2 words observed
                                right after it in code-switched text.
    require_seen_bigram -- True when code-switched text was supplied: a
        translation is accepted only if it was observed after previous_word.
        False means frequency-only mode: the first listed translation is
        used.
    """
    if token not in dictionary:
        return None
    translations = dictionary[token]
    if not require_seen_bigram:
        # No code-switched evidence available: any dictionary token may be
        # replaced. (The original script still demanded a seen bigram here,
        # so this mode silently replaced nothing.)
        return translations[0] if translations else None
    if not previous_word:
        return None
    # .get() avoids inserting empty sets into a defaultdict on lookup.
    seen_next_words = l1_word_to_l2_next_words.get(previous_word, ())
    for l2_translation in translations:
        if l2_translation in seen_next_words:
            return l2_translation
    return None


def induce_code_switches(input_conll_file, output_conll_file, dictionary,
                         l1_word_to_l2_next_words, require_seen_bigram,
                         frequency):
    """Copy CoNLL lines to output_conll_file, replacing eligible tokens.

    input_conll_file -- iterable of text lines.
    output_conll_file -- object with a write(text) method.
    frequency -- a token is replaced only while
                 replaced_so_far / tokens_seen_including_this_one < frequency.
    The remaining parameters are passed to pick_l2_translation().

    Blank lines are written as a bare newline and reset the sentence context.
    Non-blank lines are stripped, split on tabs and re-joined; the token is
    the second column. Lines with fewer than two columns carry no token and
    are copied through (stripped) without being counted.

    Returns (replaced_count, tokens_count) as integers.
    """
    tokens_count, replaced_count = 0, 0
    previous_word = u''
    for line in input_conll_file:
        stripped = line.strip()
        if not stripped:
            output_conll_file.write(u'\n')
            previous_word = u''
            continue
        conll_fields = stripped.split(u'\t')
        if len(conll_fields) < 2:
            # No token column to inspect; previously this raised IndexError.
            output_conll_file.write(stripped + u'\n')
            continue
        token = conll_fields[1]
        tokens_count += 1
        # float() keeps true division on Python 2 as well.
        if float(replaced_count) / tokens_count < frequency:
            l2_translation = pick_l2_translation(
                token, previous_word, dictionary,
                l1_word_to_l2_next_words, require_seen_bigram)
            if l2_translation is not None:
                conll_fields[1] = l2_translation
                replaced_count += 1
        output_conll_file.write(u'\t'.join(conll_fields) + u'\n')
        # Context for the next token is the original token, not its
        # replacement.
        previous_word = token
    return replaced_count, tokens_count


def main(argv=None):
    """Run the script: load resources, rewrite the CoNLL file, print stats."""
    args = parse_args(argv)

    # load dictionary
    with io.open(args.dictionary_filename, encoding='utf8') as dictionary_file:
        dictionary, dictionary_size = load_dictionary(dictionary_file)
    print('# of dictionary entries: {}'.format(dictionary_size))
    print('# of l1 words in the dictionary: {}'.format(len(dictionary)))
    print('')

    # map each word in l1 to words which follow it in l2 according to the
    # code switched text
    l1_word_to_l2_next_words, l1_to_l2_transitions = {}, 0
    if args.code_switched_filename:
        with io.open(args.code_switched_filename,
                     encoding='utf8') as code_switched_file:
            l1_word_to_l2_next_words, l1_to_l2_transitions = (
                load_l1_to_l2_transitions(code_switched_file,
                                          args.l1, args.l2))
    print('# of transitions from l1 to l2: {}'.format(l1_to_l2_transitions))
    print('# of unique l1 words which has transitions: {}'.format(
        len(l1_word_to_l2_next_words)))
    print('')

    # substitute each word in l1 with its translation in l2 if this results
    # in a bigram which has been seen in the code switched text; if no code
    # switched text is available, replace words with translations at the
    # specified frequency
    with io.open(args.input_conll_filename,
                 encoding='utf8') as input_conll_file:
        with io.open(args.output_conll_filename, mode='w',
                     encoding='utf8') as output_conll_file:
            replaced_count, tokens_count = induce_code_switches(
                input_conll_file, output_conll_file, dictionary,
                l1_word_to_l2_next_words,
                bool(args.code_switched_filename), args.frequency)

    rate = float(replaced_count) / tokens_count if tokens_count else 0.0
    print('# of replacements: {} out of {} tokens. code switching rate = {}'
          .format(replaced_count, tokens_count, rate))


if __name__ == '__main__':
    main()