-
Notifications
You must be signed in to change notification settings - Fork 0
/
mistralPOSLID.py
105 lines (86 loc) · 3 KB
/
mistralPOSLID.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import lidCall
import posCall
import lidInterpreter
# Input corpus files (despite the "_dir" names these are file paths; each one
# is passed to open() by the reading loop further down).
read_dir = "/scratch/gpfs/ca2992/jpLLM/jpLLM_Data/out_415.tsv"
read_dir_1 = "/scratch/gpfs/ca2992/jpLLM/jpLLM_Data/out_415_1.tsv"
read_list = [read_dir, read_dir_1]
# Output files for the collected POS and LID tagging results (opened in
# append mode by the writing code at the end of the script).
out_pos = "/scratch/gpfs/ca2992/jpLLM/jpLLM_Data/mistral_greedy_pos.tsv"
out_lid = "/scratch/gpfs/ca2992/jpLLM/jpLLM_Data/mistral_greedy_lid.tsv"
# Switch counts conditioned on the part of speech of the PREVIOUS word.
# NOTE(review): none of the counters below are updated in the visible part of
# this script - presumably reserved for a later counting step; confirm.
EN_SPA_prev_verb = 0 # English to Spanish, conditional on prev. word being a verb
SPA_EN_prev_verb = 0 # Spanish to English, conditional on prev. word being a verb
EN_SPA_prev_noun = 0 # English to Spanish, conditional on prev. word being a noun
SPA_EN_prev_noun = 0 # Spanish to English, conditional on prev. word being a noun
EN_SPA_prev_conj = 0 # English to Spanish, conditional on prev. word being a conjunction
SPA_EN_prev_conj = 0 # Spanish to English, conditional on prev. word being a conjunction
# Switch counts conditioned on the part of speech of the CURRENT word.
EN_SPA_verb = 0 # English to Spanish, conditional on the word being a verb
SPA_EN_verb = 0 # Spanish to English, conditional on the word being a verb
EN_SPA_noun = 0 # English to Spanish, conditional on the word being a noun
SPA_EN_noun = 0 # Spanish to English, conditional on the word being a noun
EN_SPA_conj = 0 # English to Spanish, conditional on the word being a conjunction
SPA_EN_conj = 0 # Spanish to English, conditional on the word being a conjunction
# Totals per part of speech.
verb_Count = 0
noun_Count = 0
conj_Count = 0
# Last word seen, with its POS and LID tags, kept for counting purposes.
last_word = ""
last_pos = ""
last_lid = ""
# Plan for reading the corpus data:
#   feed each line into the POS and LID taggers,
#   then estimate the probability of switching based on POS and/or LID.
#
# for each file:
#     for each line:
#         feed into the LID model, tag each word
#         feed into the POS model, tag each word
#         count code-switching (CS) occurrences
# Accumulators: pos_lid() appends one tagging result per processed text.
lid_out = []
pos_out = []
# Instructions are wrapped in bracketed tags, e.g. [INST] ... [/INST].
def extractText(text):
    """Return *text* with every ``[...]`` tag removed.

    The string is scanned once, character by character:

    * ``[`` seen outside a tag opens a tag; everything up to and including
      the next ``]`` is dropped (a ``[`` inside an open tag is dropped too).
    * Characters that follow a closed tag are kept verbatim, including a
      stray ``]`` that does not close any tag.
    * Characters that appear before the first ``[`` are ignored.

    Empty input, or input that contains no tag at all, yields ``""`` instead
    of tripping an assertion, so callers may pass an empty buffer safely.

    Args:
        text: Raw prompt/response text, possibly containing bracketed tags.

    Returns:
        The concatenation of all characters found outside the tags and
        after the first tag.
    """
    in_bracket = False
    in_message = False
    kept = []  # collected pieces; joined once at the end (linear time)
    for char in text:
        if in_bracket:
            # Inside a tag only "]" matters: it closes the tag and switches
            # to message mode. Every other character is discarded.
            if char == ']':
                in_bracket = False
                in_message = True
        elif char == '[':
            in_bracket = True
            in_message = False
        elif in_message:
            kept.append(char)
        # else: before the first tag - nothing to keep yet.
    return "".join(kept)
def pos_lid(input):
    """Tag one text with both models and record the results.

    The bracketed tags are stripped from *input* with ``extractText``; the
    remaining text is passed to ``posCall.pos`` and ``lidCall.lid``, and
    each result is appended to the module-level ``pos_out`` / ``lid_out``
    lists. Nothing is returned.

    Args:
        input: Raw text of one instruction, including its bracketed tags.
    """
    cleaned = extractText(input)
    # Run both taggers first (POS, then LID), then store the results, so
    # the two lists only grow when both calls have succeeded.
    tagged_pos, tagged_lid = posCall.pos(cleaned), lidCall.lid(cleaned)
    pos_out.append(tagged_pos)
    lid_out.append(tagged_lid)
# Read every corpus file and tag it one instruction at a time. A line that
# starts with "[" opens a new instruction; the lines that follow belong to
# that instruction until the next "[" line.
for file in read_list:
    with open(file, "r") as f:
        text = ""
        # Iterate over the open handle `f`. Iterating over `file` would walk
        # the characters of the path string, not the lines of the file.
        for line in f:
            if line.startswith('['):
                # Start of a new instruction: tag what was collected for the
                # previous one. `text` is still empty at the first
                # instruction of a file, so there is nothing to tag yet.
                if text:
                    pos_lid(text)
                text = line
            else:
                text = text + " " + line
        # Account for the last instruction of the file, which is never
        # followed by another "[" line. Skipped when the file was empty.
        if text:
            pos_lid(text)

# Append the collected results to the output files: each item on its own
# line, followed by a line holding a single tab as a separator.
with open(out_pos, "a") as f:
    for item in pos_out:
        print(item, file=f)
        print('\t', file=f)
with open(out_lid, "a") as f:
    for item in lid_out:
        print(item, file=f)
        print('\t', file=f)