-
Notifications
You must be signed in to change notification settings - Fork 0
/
miamiCorpusLID.py
138 lines (112 loc) · 4.39 KB
/
miamiCorpusLID.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import lidCall
import eval
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# --- Configuration: cluster-local model paths and output locations ---
# model_name = '/scratch/gpfs/ca2992/robertuito-base-cased'
# Token-level Spanish/English LID model fine-tuned on LinCE (local checkout).
model_name = '/scratch/gpfs/ca2992/codeswitch-spaeng-lid-lince'
tokenizer_name = '/scratch/gpfs/ca2992/codeswitch-spaeng-lid-lince'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# File the final metrics are appended to (opened in "a" mode below).
out_dir = '/scratch/gpfs/ca2992/jpLLM/jpLLM/lid_out'
# Directory of annotated Bangor Miami transcripts to evaluate on.
data_dir = '/scratch/gpfs/ca2992/jpLLM/bangor/crowdsourced_bangor'
# HuggingFace token-classification ("ner") pipeline used for LID.
lid_model = pipeline('ner', model=model, tokenizer=tokenizer)
# Parallel accumulators: one single-element [label] list per word.
# lid_truth holds gold corpus tags, lid_pred holds model predictions.
lid_truth = []
lid_pred = []
# given a token with the '#' symbol,
# remove the symbol for preprocessing
def cleanPoundSign(word):
    """Return *word* with every '#' character removed.

    The subword tokenizer marks continuation pieces with '#'; stripping
    them reconstructs the original surface form of the word.
    """
    # str.replace does the whole pass in C — replaces the original
    # character-by-character loop with quadratic string concatenation.
    return word.replace('#', '')
# words in the annotated Bangor Corpus
# contain ' if a contraction. Check to allow
# concatenation
def isContraction(word):
    """Return True if *word* contains an apostrophe.

    The annotated Bangor corpus marks contractions with a literal
    apostrophe; such tokens are glued onto the preceding word.
    """
    # Membership test replaces the manual character scan.
    return "'" in word
# convert token predictions to word predictions
def tokenToWordPred(message, trueWords):
    """Run the LID pipeline on *message* and map its sub-token predictions
    back onto the whitespace-level words in *trueWords*.

    Side effect: appends one single-element [label] list per word to the
    module-level lid_pred accumulator. Returns None.
    """
    lidResult = lid_model(message)
    index = 0
    for word in trueWords:
        lidToken = lidResult[index].get('word')
        # get the lid predicted for this token and append
        # to the lid word level predictions
        # NOTE(review): the label of the FIRST sub-token is taken for the
        # whole word, even when the word spans several sub-tokens.
        lid = lidResult[index].get('entity')
        lid_pred.append([lid])
        # if token/word mismatch, impossible to handle — skip this word
        # (the prediction above was already recorded for it)
        if (word != lidToken and word[0] != lidToken[0]):
            print("MISMATCH", word, lidToken)
            continue
        # Re-assemble a multi-sub-token word: keep concatenating the next
        # pipeline tokens until the rebuilt string equals the true word.
        # NOTE(review): assumes lidResult always has enough entries to
        # complete the word — a tokenizer/word desync could raise
        # IndexError on lidResult[index]; confirm against real output.
        while (word != lidToken and word[0] == lidToken[0]):
            index += 1
            lidToken = lidToken + lidResult[index].get('word')
            # get rid of # symbols added by tokenizer
            lidToken = cleanPoundSign(lidToken)
        index += 1
# Main driver: walk every transcript in the Bangor corpus, accumulate gold
# labels (lid_truth) and model predictions (lid_pred) word by word, then
# normalize the tag sets and append the computed metrics to the output file.
with open(out_dir, "a") as output:
    for file in os.listdir(data_dir):
        if os.path.isdir(data_dir + '/' + file):
            # Skip directories and readme
            continue
        if(file == "README.md"):
            continue
        # open the current file in the directory
        with open(data_dir + '/' + file, "r") as read:
            numWords = 0
            words = []
            message = ""
            for line in read:
                values = line.split()
                # skip blank lines or placeholder lines
                # (annotation rows have at least 4 whitespace-separated fields)
                if (len(values) <= 3):
                    # print(line)
                    continue
                # print(values[0], values[1], values[2], values[3])
                # print(line)
                lid = values[2]  # lid at index 2 of each line
                word = values[1]  # word at index 1 of each line
                numWords += 1
                # print(lid)
                if isContraction(word):
                    # Contraction: glue onto the previous word so it counts
                    # as ONE word; the previous word's truth tag implicitly
                    # covers it (no new lid_truth entry is appended).
                    message = message + word
                    lastWord = words.pop()
                    words.append(lastWord + word)
                else:
                    # Ordinary word: add it to the running sentence and
                    # record its gold label.
                    message = message + " " + word
                    words.append(word)
                    lid_truth.append([lid])
                # at the end of each sentence, pass into the model
                if (word == '.'):
                    tokenToWordPred(message, words)
                    numWords = 0
                    lid = []
                    words = []
                    message = ""
            # Flush any trailing partial sentence left at end of file.
            if (len(message) != 0):
                tokenToWordPred(message, words)
                numWords = 0
                lid = []
                words = []
                message = ""
    # print(lid_truth, file = output)
    # print(lid_pred, file = output)
    # print(len(lid_truth), len(lid_pred), file = output)
    # Normalize tags and keep only English/Spanish words: corpus 'eng' is
    # mapped to the model's 'en' tag; other tags (ambiguous, punctuation,
    # etc.) are dropped from both lists in lockstep.
    # NOTE(review): initializing with [[]] seeds BOTH lists with one empty
    # entry — possibly unintentional; it keeps the lengths equal but may
    # affect eval.getMetrics. Confirm whether [] was intended.
    truth_cleaned = [[]]
    pred_cleaned = [[]]
    assert len(lid_truth) == len(lid_pred)
    for i in range(len(lid_truth)):
        if (lid_truth[i][0] == 'eng'):
            truth_cleaned.append(['en'])
            pred_cleaned.append(lid_pred[i])
        elif (lid_truth[i][0] == 'spa'):
            truth_cleaned.append(['spa'])
            pred_cleaned.append(lid_pred[i])
    assert len(truth_cleaned) == len(pred_cleaned)
    print(eval.getMetrics(truth_cleaned, pred_cleaned), file = output)