forked from cgpotts/cs224u
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
322 lines (264 loc) · 9.3 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
from collections import Counter
import csv
import logging
import numpy as np
import pandas as pd
import random
from scipy import stats
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
import sys
import os
# Module metadata.
__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2020"
# Special token strings. The same literal values are the default
# `required_tokens` of `create_pretrained_embedding`, and `get_vocab`
# always adds "$UNK" to the vocabulary it returns.
# NOTE(review): presumably sequence-start, sequence-end and unknown-word
# markers respectively -- confirm against the callers.
START_SYMBOL = "<s>"
END_SYMBOL = "</s>"
UNK_SYMBOL = "$UNK"
def glove2dict(src_filename):
    """Read a GloVe vectors file into a dictionary.

    Parameters
    ----------
    src_filename : str
        Full path to the GloVe file to be processed.

    Returns
    -------
    dict
        Mapping words to their GloVe vectors as `np.array` of floats.

    Notes
    -----
    Blank lines are skipped. A `UnicodeDecodeError` raised while reading
    is ignored and reading continues (best effort).
    """
    # The 840B.300d distribution has some words with spaces, so we have
    # to assume its dimensionality and split the 300 values off from the
    # right-hand side of each line:
    has_spaced_words = '840B.300d' in src_filename
    data = {}
    with open(src_filename, encoding='utf8') as f:
        while True:
            # Only the read itself can raise the exceptions handled here,
            # so parsing is kept outside of the `try`.
            try:
                line = next(f)
            except StopIteration:
                break
            except UnicodeDecodeError:
                continue
            if not line.strip():
                # A blank line has no word to use as a key.
                continue
            if has_spaced_words:
                fields = line.rsplit(" ", 300)
            else:
                fields = line.strip().split()
            # The builtin `float` is what the removed `np.float` alias
            # pointed to, so the resulting dtype is float64.
            data[fields[0]] = np.array(fields[1:], dtype=float)
    return data
def d_tanh(z):
    """Return `1.0 - z**2` for a float or np.array `z`.

    This is the derivative of tanh expressed in terms of its output:
    it equals tanh'(x) when `z` is already `np.tanh(x)`.
    """
    squared = z ** 2
    return 1.0 - squared
def softmax(z):
    """Softmax over all entries of `z` (a float or np.array).

    The maximum of `z` is subtracted before exponentiating, which
    leaves the result unchanged but avoids overflow in `np.exp`.
    """
    shifted = z - np.max(z)
    exps = np.exp(shifted)
    total = np.sum(exps)
    return exps / total
def relu(z):
    """Rectified linear unit: the element-wise maximum of 0 and `z`."""
    floor = 0
    return np.maximum(floor, z)
def d_relu(z):
    """Element-wise indicator of `z > 0`: 1 where positive, 0 elsewhere."""
    is_positive = z > 0
    return np.where(is_positive, 1, 0)
def randvec(n=50, lower=-0.5, upper=0.5):
    """Returns a random vector of length `n`, each value drawn with
    `random.uniform(lower, upper)`."""
    values = []
    for _ in range(n):
        values.append(random.uniform(lower, upper))
    return np.array(values)
def randmatrix(m, n, lower=-0.5, upper=0.5):
    """Creates an m x n matrix whose entries are drawn with
    `random.uniform(lower, upper)`, filled in row-major order."""
    flat = []
    for _ in range(m * n):
        flat.append(random.uniform(lower, upper))
    return np.array(flat).reshape(m, n)
def safe_macro_f1(y, y_pred):
    """Macro-averaged F1 via `sklearn.metrics.f1_score`.

    `y` is the list of gold labels and `y_pred` is the list of
    predicted labels. Passing `average='macro'` with `pos_label=None`
    asks `sklearn` to treat the task as multiclass even when there are
    just two classes.
    """
    macro_options = {'average': 'macro', 'pos_label': None}
    return f1_score(y, y_pred, **macro_options)
def progress_bar(msg):
    """Simple over-writing progress bar.

    Writes a carriage return followed by `msg` to standard error and
    flushes, so that successive calls overwrite the same line.
    """
    stream = sys.stderr
    stream.write('\r')
    stream.write(msg)
    stream.flush()
def log_of_array_ignoring_zeros(M):
    """Returns a copy of `M` in which every strictly positive element
    is replaced by its natural log. All other elements (zeros, and any
    negative values) are left alone since their log isn't defined.

    Parameters
    ----------
    M : np.array (or another object supporting `.copy()` and boolean
        mask assignment)

    Returns
    -------
    Same kind of object as `M`; `M` itself is not modified. Integer
    `np.array` inputs come back as float arrays.
    """
    if isinstance(M, np.ndarray) and np.issubdtype(M.dtype, np.integer):
        # Writing logs back into an integer array would silently
        # truncate them, so work on a float copy instead.
        log_M = M.astype(float)
    else:
        log_M = M.copy()
    mask = log_M > 0
    log_M[mask] = np.log(log_M[mask])
    return log_M
def mcnemar(y_true, pred_a, pred_b):
    """McNemar's test (with continuity correction) using the chi2
    distribution.

    Parameters
    ----------
    y_true : list of actual labels
    pred_a, pred_b : lists
        Predictions from the two systems being evaluated.
        Assumed to have the same length as `y_true`.

    Returns
    -------
    float, float (the test statistic and p value)
        If the two systems are never discordant (there is no example
        that exactly one of them gets right), the statistic is
        undefined (0/0); `(0.0, 1.0)` is returned in that case, i.e.,
        no evidence of a difference.
    """
    # c01: only system A is correct; c10: only system B is correct.
    c01 = 0
    c10 = 0
    for y, a, b in zip(y_true, pred_a, pred_b):
        if a == y and b != y:
            c01 += 1
        elif a != y and b == y:
            c10 += 1
    n_discordant = c01 + c10
    if n_discordant == 0:
        # Avoid dividing by zero, which would yield an infinite
        # statistic and a spurious p value of 0.
        return 0.0, 1.0
    stat = ((abs(c10 - c01) - 1.0)**2) / n_discordant
    df = 1
    pval = stats.chi2.sf(stat, df)
    return stat, pval
def fit_classifier_with_crossvalidation(
        X, y, basemod, cv, param_grid, scoring='f1_macro', verbose=True):
    """Grid-search the hyperparameters of a classifier with
    cross-validation and return the best fitted model.

    Parameters
    ----------
    X : 2d np.array
        The matrix of features, one example per row.
    y : list
        The list of labels for rows in `X`.
    basemod : an sklearn model class instance
        The basic model-type to optimize.
    cv : int
        Number of cross-validation folds.
    param_grid : dict
        Keys name parameters of `basemod`; values are lists of the
        values to try for that parameter.
    scoring : value to optimize for (default: f1_macro)
        Other options include 'accuracy' and 'f1_micro'. See
        http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    verbose : bool
        Whether to print some summary information to standard output.

    Prints
    ------
    To standard output (if `verbose=True`)
        The best parameters found.
        The best score obtained under `scoring`.

    Returns
    -------
    An instance of the same class as `basemod`.
        A trained model instance, the best model found.
    """
    # Run the search over `param_grid`:
    search = GridSearchCV(basemod, param_grid, cv=cv, scoring=scoring)
    search.fit(X, y)
    if not verbose:
        return search.best_estimator_
    # Summarize the winning configuration before handing it back:
    params_report = "Best params: {}".format(search.best_params_)
    print(params_report)
    score_report = "Best score: {0:0.03f}".format(search.best_score_)
    print(score_report)
    return search.best_estimator_
def get_vocab(X, n_words=None, mincount=1):
    """Build the sorted vocabulary of an RNN example matrix `X`,
    always including "$UNK".

    Parameters
    ----------
    X : list of lists of str
    n_words : int or None
        If this is `int > 0`, keep only the top `n_words` by frequency.
    mincount : int
        Only words with at least this many tokens are kept.

    Returns
    -------
    list of str
    """
    counts = Counter(w for ex in X for w in ex)
    if n_words:
        pairs = counts.most_common(n_words)
    else:
        pairs = counts.items()
    # The frequency threshold only applies when it exceeds 1:
    apply_threshold = mincount > 1
    vocab = set()
    for word, count in pairs:
        if apply_threshold and not count >= mincount:
            continue
        vocab.add(word)
    vocab.add("$UNK")
    return sorted(vocab)
def create_pretrained_embedding(
        lookup, vocab, required_tokens=('$UNK', "<s>", "</s>")):
    """Create an embedding matrix from a lookup and a specified vocab.
    Words from `vocab` that are not in `lookup` are given random
    representations.

    Parameters
    ----------
    lookup : dict
        Must map words to their vector representations. The
        dimensionality is taken from the first value in the dict.
    vocab : list of str
        Words to create embeddings for. Missing `required_tokens` are
        appended to this list in place.
    required_tokens : tuple of str
        Tokens that must have embeddings. If they are not available
        in the look-up, they will be given random representations.

    Returns
    -------
    np.array, list
        The np.array is an embedding for `vocab`, one row per word,
        and the `list` is the potentially expanded version of `vocab`
        that came in.
    """
    dim = len(next(iter(lookup.values())))
    # `randvec(dim)` is evaluated for every word, including those found
    # in `lookup`, so the amount of randomness consumed depends only on
    # the number of words processed.
    rows = [lookup.get(w, randvec(dim)) for w in vocab]
    # Force a 2d (n, dim) shape so that an empty `vocab` can still be
    # stacked with the rows for the required tokens.
    embedding = np.array(rows).reshape(-1, dim)
    for tok in required_tokens:
        if tok not in vocab:
            vocab.append(tok)
            # Prefer the pretrained vector when the lookup has one:
            tok_vec = lookup.get(tok, randvec(dim))
            embedding = np.vstack((embedding, tok_vec))
    return embedding, vocab
def fix_random_seeds(
        seed=42,
        set_system=True,
        set_torch=True,
        set_tensorflow=True,
        set_torch_cudnn=True):
    """Fix random seeds for reproducibility.

    Parameters
    ----------
    seed : int
        Random seed to be set.
    set_system : bool
        Whether to set `np.random.seed(seed)` and `random.seed(seed)`
    set_tensorflow : bool
        Whether to set the TensorFlow seed, using
        `tensorflow.compat.v1.set_random_seed(seed)` if it can be
        imported and `tensorflow.random.set_seed(seed)` otherwise.
    set_torch : bool
        Whether to set `torch.manual_seed(seed)`
    set_torch_cudnn: bool
        Flag for whether to enable cudnn deterministic mode.
        Note that deterministic mode can have a performance impact,
        depending on your model.
        https://pytorch.org/docs/stable/notes/randomness.html

    Notes
    -----
    The function checks that PyTorch and TensorFlow are installed
    where the user asks to set seeds for them. If they are not
    installed, the seed-setting instruction is ignored. The intention
    is to make it easier to use this function in environments that lack
    one or both of these libraries.

    Even though the random seeds are explicitly set,
    the behavior may still not be deterministic (especially when a
    GPU is enabled), due to:

    * CUDA: There are some PyTorch functions that use CUDA functions
    that can be a source of non-determinism:
    https://pytorch.org/docs/stable/notes/randomness.html

    * PYTHONHASHSEED: On Python 3.3 and greater, hash randomization is
    turned on by default. This seed could be fixed before calling the
    python interpreter (PYTHONHASHSEED=0 python test.py). However, it
    seems impossible to set it inside the python program:
    https://stackoverflow.com/questions/30585108/disable-hash-randomization-from-within-python-program
    """
    # set system seed
    if set_system:
        np.random.seed(seed)
        random.seed(seed)

    # set torch seed and/or the cudnn backend; torch is only imported
    # when at least one of the two was requested
    if set_torch or set_torch_cudnn:
        try:
            import torch
        except ImportError:
            pass
        else:
            if set_torch:
                torch.manual_seed(seed)
            if set_torch_cudnn:
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.benchmark = False

    # set tf seed
    if set_tensorflow:
        # The fallback import needs its own `try`: an ImportError raised
        # inside an `except` handler is not caught by a sibling `except`
        # clause of the same `try` statement.
        try:
            from tensorflow.compat.v1 import set_random_seed as set_tf_seed
        except ImportError:
            try:
                from tensorflow.random import set_seed as set_tf_seed
            except ImportError:
                set_tf_seed = None
        if set_tf_seed is not None:
            set_tf_seed(seed)