forked from cgpotts/cs224u
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrel_ext.py
662 lines (571 loc) · 24.3 KB
/
rel_ext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
from collections import Counter, defaultdict, namedtuple
import gzip
import numpy as np
import os
import random
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
# Module metadata.
__author__ = "Bill MacCartney and Christopher Potts"
__version__ = "CS224u, Stanford, Fall 2020"
# Record type for one corpus example. It has twelve fields: the two entity
# identifiers, five text fields (`left`, `mention_1`, `middle`, `mention_2`,
# `right`), and five parallel `*_POS` fields (presumably part-of-speech-tagged
# versions of the same spans -- confirm against the corpus documentation).
# `Corpus.read_examples` fills the fields positionally from one tab-separated
# line of the corpus file.
Example = namedtuple('Example',
'entity_1, entity_2, left, mention_1, middle, mention_2, right, '
'left_POS, mention_1_POS, middle_POS, mention_2_POS, right_POS')
class Corpus(object):
    """
    Class for representing and working with the raw text we use
    as evidence for making relation predictions.

    Parameters
    ----------
    src_filename_or_examples : str or list
        If str, this is assumed to be the full path to the gzip file
        that contains the examples to use. The method `read_examples`
        is used to open it in that case. If this is a list, then it
        should be a list of `Example` instances.

    Attributes
    ----------
    examples : list
        The `Example` instances, in the order they were read or given.
    examples_by_entities : dict
        A 2d dictionary mapping `ex.entity_1` to a dict mapping entity
        `ex.entity_2` to the list of all `Example` instances `ex` for
        that ordered pair. This is created by the method
        `_index_examples_by_entities`.

    """
    def __init__(self, src_filename_or_examples):
        if isinstance(src_filename_or_examples, str):
            self.examples = self.read_examples(src_filename_or_examples)
        else:
            self.examples = src_filename_or_examples
        self.examples_by_entities = {}
        self._index_examples_by_entities()

    @staticmethod
    def read_examples(src_filename):
        """
        Read `src_filename`, assumed to be a `gzip` file with
        tab-separated lines that can be turned into `Example`
        instances.

        Parameters
        ----------
        src_filename : str
            Assumed to be the full path to the gzip file that contains
            the examples.

        Returns
        -------
        list of Example

        """
        examples = []
        with gzip.open(src_filename, mode='rt', encoding='utf8') as f:
            for line in f:
                # Strip only the line terminator. Slicing off the last
                # character unconditionally would truncate the final
                # field when the file does not end with a newline.
                fields = line.rstrip('\n').split('\t')
                examples.append(Example(*fields))
        return examples

    def _index_examples_by_entities(self):
        """
        Fill `examples_by_entities` as a 2d dictionary mapping
        `ex.entity_1` to a dict mapping entity `ex.entity_2` to the
        list of `Example` instances for that ordered pair.
        """
        for ex in self.examples:
            by_second = self.examples_by_entities.setdefault(ex.entity_1, {})
            by_second.setdefault(ex.entity_2, []).append(ex)

    def get_examples_for_entities(self, e1, e2):
        """
        Given two entities `e1` and `e2` as strings, return
        examples from `self.examples_by_entities`, as a list of
        `Example` instances. The list is empty if the ordered pair
        has no examples; the index itself is never modified."""
        try:
            return self.examples_by_entities[e1][e2]
        except KeyError:
            return []

    def show_examples_for_pair(self, e1, e2):
        """
        Given two entities `e1` and `e2` as strings, print out their
        first `Example`, if there is one, otherwise print out a message
        saying there are no Example instances relating `e1` to `e2`."""
        exs = self.get_examples_for_entities(e1, e2)
        if exs:
            print('The first of {0:,} examples for {1:} and {2:} is:'.format(
                len(exs), e1, e2))
            print(exs[0])
        else:
            print('No examples for {0:} and {1:}'.format(e1, e2))

    def __str__(self):
        return 'Corpus with {0:,} examples'.format(len(self.examples))

    def __repr__(self):
        return str(self)

    def __len__(self):
        return len(self.examples)
# Record type for one knowledge-base fact: a relation name plus its subject
# and object entities. `KB.read_kb_triples` fills the fields positionally from
# one tab-separated line of the KB file.
KBTriple = namedtuple('KBTriple', 'rel, sbj, obj')
class KB(object):
    """
    Class for representing and working with the knowledge base.

    Parameters
    ----------
    src_filename_or_triples : str or list
        If str, this is assumed to be the full path to the gzip file
        that contains the KB. The method `read_kb_triples` is used to
        open it in that case. If this is a list, then it should be a
        list of `KBTriple` instances.

    Attributes
    ----------
    kb_triples : list
        The `KBTriple` instances, in the order they were read or given.
    all_relations : list
        Built by `_index_kb_triples_by_relation` as a sorted list of str.
    all_entity_pairs : list
        Built by `_collect_all_entity_pairs`, as a sorted list of
        (subject, object) tuples.
    kb_triples_by_relation : dict
        Built by `_index_kb_triples_by_relation`, as a dict mapping
        relations (str) to `KBTriple` lists.
    kb_triples_by_entities : dict
        Built by `_index_kb_triples_by_entities`, as a dict mapping
        subject (str) to a dict mapping object (str) to `KBTriple`
        lists.

    """
    def __init__(self, src_filename_or_triples):
        if isinstance(src_filename_or_triples, str):
            self.kb_triples = self.read_kb_triples(src_filename_or_triples)
        else:
            self.kb_triples = src_filename_or_triples
        self.all_relations = []
        self.all_entity_pairs = []
        self.kb_triples_by_relation = {}
        self.kb_triples_by_entities = {}
        self._collect_all_entity_pairs()
        self._index_kb_triples_by_relation()
        self._index_kb_triples_by_entities()

    @staticmethod
    def read_kb_triples(src_filename):
        """
        Read `src_filename`, assumed to be a `gzip` file with
        tab-separated lines that can be turned into `KBTriple`
        instances.

        Parameters
        ----------
        src_filename : str
            Assumed to be the full path to the gzip file that contains
            the triples

        Returns
        -------
        list of KBTriple

        Raises
        ------
        ValueError
            If a line does not contain exactly three tab-separated
            fields.

        """
        kb_triples = []
        with gzip.open(src_filename, mode='rt', encoding='utf8') as f:
            for line in f:
                # Strip only the line terminator. Slicing off the last
                # character unconditionally would truncate the object
                # when the file does not end with a newline.
                rel, sbj, obj = line.rstrip('\n').split('\t')
                kb_triples.append(KBTriple(rel, sbj, obj))
        return kb_triples

    def _collect_all_entity_pairs(self):
        """Set `all_entity_pairs` to the sorted, de-duplicated list of
        (subject, object) tuples occurring in `kb_triples`."""
        pairs = {(kbt.sbj, kbt.obj) for kbt in self.kb_triples}
        self.all_entity_pairs = sorted(pairs)

    def _index_kb_triples_by_relation(self):
        """Fill `kb_triples_by_relation` and set `all_relations` to the
        sorted list of relation names that were seen."""
        for kbt in self.kb_triples:
            self.kb_triples_by_relation.setdefault(kbt.rel, []).append(kbt)
        self.all_relations = sorted(self.kb_triples_by_relation)

    def _index_kb_triples_by_entities(self):
        """Fill `kb_triples_by_entities` as a 2d dictionary from subject
        to object to the list of `KBTriple` instances for that pair."""
        for kbt in self.kb_triples:
            by_obj = self.kb_triples_by_entities.setdefault(kbt.sbj, {})
            by_obj.setdefault(kbt.obj, []).append(kbt)

    def get_triples_for_relation(self, rel):
        """
        Given a relation name (str), return all of the `KBTriple`
        instances that involve it. The list is empty for an unknown
        relation.
        """
        try:
            return self.kb_triples_by_relation[rel]
        except KeyError:
            return []

    def get_triples_for_entities(self, e1, e2):
        """
        Given a pair of entities `e1` and `e2` (both str), return
        all of the `KBTriple` instances that have `e1` as subject and
        `e2` as object. The list is empty if there are none.
        """
        try:
            return self.kb_triples_by_entities[e1][e2]
        except KeyError:
            return []

    def __str__(self):
        return 'KB with {0:,} triples'.format(len(self.kb_triples))

    def __repr__(self):
        return str(self)

    def __len__(self):
        return len(self.kb_triples)
class Dataset(object):
    """
    Class for unifying a `Corpus` and a `KB`.

    Parameters
    ----------
    corpus : `Corpus`
    kb : `KB`

    """
    def __init__(self, corpus, kb):
        self.corpus = corpus
        self.kb = kb

    def find_unrelated_pairs(self):
        """
        Collect the entity pairs that co-occur in a corpus example but
        have no KB triple in either direction.

        Returns
        -------
        set of (str, str)
            Both orderings of every such pair.

        """
        unrelated_pairs = set()
        for ex in self.corpus.examples:
            if self.kb.get_triples_for_entities(ex.entity_1, ex.entity_2):
                continue
            if self.kb.get_triples_for_entities(ex.entity_2, ex.entity_1):
                continue
            unrelated_pairs.add((ex.entity_1, ex.entity_2))
            unrelated_pairs.add((ex.entity_2, ex.entity_1))
        return unrelated_pairs

    def featurize(self, kbts_by_rel, featurizers, vectorizer=None, vectorize=True):
        """
        Featurize by relation.

        Parameters
        ----------
        kbts_by_rel : dict
            A map from relation (str) to lists of `KBTriples`.
        featurizers : list of func
            Each function has to have the signature
            `kbt, corpus, feature_counter`, where `kbt` is a `KBTriple`,
            `corpus` is a `Corpus`, and `feature_counter` is a count
            dictionary.
        vectorizer : DictVectorizer or None:
            If None, a new `DictVectorizer` is created and used via
            `fit`. This is primarily for training. If not None, then
            `transform` is used. This is primarily for testing.
        vectorize: bool
            If True, the feature functions in `featurizers` are presumed
            to create feature dicts, and a `DictVectorizer` is used. If
            False, then `featurizers` is required to have exactly one
            function in it, and that function must return exactly the
            sort of objects that the models in the model factory take
            as inputs.

        Returns
        -------
        feat_matrices_by_rel, vectorizer
            where `feat_matrices_by_rel` is a dict mapping relation names
            to (i) lists of representation if `vectorize=False`, else
            to the matrices returned by `vectorizer.transform`, and (ii)
            `vectorizer` is a `DictVectorizer` if `vectorize=True`,
            else None

        Raises
        ------
        ValueError
            If `vectorize=False` and `featurizers` does not contain
            exactly one function.

        """
        if not vectorize:
            feat_matrices_by_rel = defaultdict(list)
            if len(featurizers) != 1:
                raise ValueError(
                    "If `vectorize=False`, the `featurizers` argument "
                    "must contain exactly one function.")
            featurizer = featurizers[0]
            for rel, kbts in kbts_by_rel.items():
                for kbt in kbts:
                    rep = featurizer(kbt, self.corpus)
                    feat_matrices_by_rel[rel].append(rep)
            return feat_matrices_by_rel, None

        # Create feature counters for all instances (kbts).
        feat_counters_by_rel = defaultdict(list)
        for rel, kbts in kbts_by_rel.items():
            for kbt in kbts:
                feature_counter = Counter()
                for featurizer in featurizers:
                    feature_counter = featurizer(kbt, self.corpus, feature_counter)
                feat_counters_by_rel[rel].append(feature_counter)
        feat_matrices_by_rel = defaultdict(list)

        # If we haven't been given a Vectorizer, create one and fit
        # it to all the feature counters.
        if vectorizer is None:
            vectorizer = DictVectorizer(sparse=True)
            vectorizer.fit(
                d
                for dict_list in feat_counters_by_rel.values()
                for d in dict_list)

        # Now use the Vectorizer to transform feature dictionaries
        # into feature matrices.
        for rel, feat_counters in feat_counters_by_rel.items():
            feat_matrices_by_rel[rel] = vectorizer.transform(feat_counters)
        return feat_matrices_by_rel, vectorizer

    def build_dataset(self, include_positive=True, sampling_rate=0.1, seed=1):
        """
        Build labeled `KBTriple` lists for every relation in the KB.

        Parameters
        ----------
        include_positive : bool
            If True, every KB triple for a relation is included with
            label True.
        sampling_rate : float
            Fraction of the unrelated entity pairs (see
            `find_unrelated_pairs`) to draw as negative instances. The
            same sample is used for every relation.
        seed : int
            Passed to `random.seed` before sampling.

        Returns
        -------
        kbts_by_rel, labels_by_rel
            Two dicts keyed by relation name: lists of `KBTriple` and
            the parallel lists of bool labels.

        """
        unrelated_pairs = self.find_unrelated_pairs()
        random.seed(seed)
        # `random.sample` requires a sequence (sets raise TypeError on
        # Python 3.11+). Sorting also makes the draw independent of
        # string-hash randomization, so a given seed always yields the
        # same negatives.
        unrelated_pairs = random.sample(
            sorted(unrelated_pairs),
            int(sampling_rate * len(unrelated_pairs)))
        kbts_by_rel = defaultdict(list)
        labels_by_rel = defaultdict(list)
        for rel in self.kb.all_relations:
            if include_positive:
                for kbt in self.kb.get_triples_for_relation(rel):
                    kbts_by_rel[rel].append(kbt)
                    labels_by_rel[rel].append(True)
            for sbj, obj in unrelated_pairs:
                kbts_by_rel[rel].append(KBTriple(rel, sbj, obj))
                labels_by_rel[rel].append(False)
        return kbts_by_rel, labels_by_rel

    def build_splits(self,
            split_names=('tiny', 'train', 'dev'),
            split_fracs=(0.01, 0.74, 0.25),
            seed=1):
        """
        Partition the dataset into named splits by KB subject entity.

        Subjects are shuffled and divided according to `split_fracs`;
        each KB triple goes to the split of its subject, and each corpus
        example goes to the split of `entity_1` if that is a KB subject,
        else of `entity_2` if that is one. Examples mentioning no KB
        subject are themselves shuffled and divided by `split_fracs`.

        Parameters
        ----------
        split_names : sequence of str
        split_fracs : sequence of float
            Must have the same length as `split_names` and sum to 1
            (up to floating-point rounding).
        seed : int
            If truthy, passed to `random.seed` before each shuffle.

        Returns
        -------
        dict
            Maps each split name to a `Dataset`, plus the key 'all'
            mapped to `self`.

        Raises
        ------
        ValueError
            If the lengths differ or the fractions do not sum to 1.

        """
        if len(split_names) != len(split_fracs):
            raise ValueError('split_names and split_fracs must be of equal length')
        # Compare with a tolerance: exact float equality rejects valid
        # inputs such as ten fractions of 0.1, whose sum is
        # 0.9999999999999999.
        if abs(sum(split_fracs) - 1.0) > 1e-9:
            raise ValueError('split_fracs must sum to 1')
        n = len(split_fracs)  # for convenience only

        def split_list(xs):
            xs = sorted(xs)  # sorted for reproducibility
            if seed:
                random.seed(seed)
            random.shuffle(xs)
            split_points = [0] + [int(round(frac * len(xs)))
                                  for frac in np.cumsum(split_fracs)]
            return [xs[split_points[i]:split_points[i + 1]] for i in range(n)]

        # first, split the entities that appear as subjects in the KB
        sbjs = list({kbt.sbj for kbt in self.kb.kb_triples})
        sbj_splits = split_list(sbjs)
        sbj_split_dict = {sbj: i for i, split in enumerate(sbj_splits)
                          for sbj in split}

        # next, split the KB triples based on their subjects (one pass,
        # preserving the original triple order within each split)
        kbt_splits = [[] for _ in range(n)]
        for kbt in self.kb.kb_triples:
            kbt_splits[sbj_split_dict[kbt.sbj]].append(kbt)

        # now split examples based on the entities they contain
        ex_splits = [[] for _ in range(n + 1)]  # include an extra split
        for ex in self.corpus.examples:
            if ex.entity_1 in sbj_split_dict:
                # if entity_1 is a sbj in the KB, assign example to split of that sbj
                ex_splits[sbj_split_dict[ex.entity_1]].append(ex)
            elif ex.entity_2 in sbj_split_dict:
                # if entity_2 is a sbj in the KB, assign example to split of that sbj
                ex_splits[sbj_split_dict[ex.entity_2]].append(ex)
            else:
                # otherwise, put in extra split to be redistributed
                ex_splits[-1].append(ex)

        # reallocate the examples that weren't assigned to a split on first pass
        extra_ex_splits = split_list(ex_splits[-1])
        ex_splits = [ex_splits[i] + extra_ex_splits[i] for i in range(n)]

        # create a Corpus and a KB for each split
        data = {}
        for i in range(n):
            data[split_names[i]] = Dataset(Corpus(ex_splits[i]), KB(kbt_splits[i]))
        data['all'] = self
        return data

    def count_examples(self):
        """
        Print, for every relation, the number of corpus examples whose
        entity pair matches one of its KB triples (in either
        direction), the number of triples, and their ratio.
        """
        counter = Counter()
        for rel in self.kb.all_relations:
            for kbt in self.kb.get_triples_for_relation(rel):
                # count examples in both forward and reverse directions
                counter[rel] += len(self.corpus.get_examples_for_entities(kbt.sbj, kbt.obj))
                counter[rel] += len(self.corpus.get_examples_for_entities(kbt.obj, kbt.sbj))
        # report results
        print('{:20s} {:>10s} {:>10s} {:>10s}'.format(
            '', '', '', 'examples'))
        print('{:20s} {:>10s} {:>10s} {:>10s}'.format(
            'relation', 'examples', 'triples', '/triple'))
        print('{:20s} {:>10s} {:>10s} {:>10s}'.format(
            '--------', '--------', '-------', '-------'))
        for rel in self.kb.all_relations:
            nx = counter[rel]
            nt = len(self.kb.get_triples_for_relation(rel))
            print('{:20s} {:10d} {:10d} {:10.2f}'.format(
                rel, nx, nt, 1.0 * nx / nt))

    def count_relation_combinations(self):
        """
        Print how often each combination of two or more relations
        holds for the same (subject, object) pair, most frequent first.
        """
        counter = Counter()
        for sbj, obj in self.kb.all_entity_pairs:
            rels = tuple(sorted({kbt.rel for kbt in self.kb.get_triples_for_entities(sbj, obj)}))
            if len(rels) > 1:
                counter[rels] += 1
        counts = sorted([(count, key) for key, count in counter.items()], reverse=True)
        print('The most common relation combinations are:')
        for count, key in counts:
            print('{:10d} {}'.format(count, key))

    def __str__(self):
        return "{}; {}".format(self.corpus, self.kb)

    def __repr__(self):
        return str(self)
def print_statistics_header():
    """Print the column titles of a results table, followed by a dashed rule."""
    layout = '{:20s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s}'
    titles = ('relation', 'precision', 'recall', 'f-score', 'support', 'size')
    rule = ['-' * 18] + ['-' * 9] * 5
    for cells in (titles, rule):
        print(layout.format(*cells))
def print_statistics_row(rel, result):
    """Print one table row: the relation name, then the five values in
    `result` (three floats shown to 3 decimals, then two integers)."""
    layout = '{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d} {:10d}'
    line = layout.format(rel, *result)
    print(line)
def print_statistics_footer(avg_result):
    """Print a dashed rule and then the 'macro-average' summary row built
    from the five values in `avg_result`."""
    rule_layout = '{:20s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s}'
    dashes = ['-' * 18] + ['-' * 9] * 5
    print(rule_layout.format(*dashes))
    summary_layout = '{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d} {:10d}'
    print(summary_layout.format('macro-average', *avg_result))
def macro_average_results(results):
    """
    Combine per-relation result rows into one summary row.

    `results` maps relation names to 5-item rows. The first three
    columns are averaged across relations; the last two are summed.
    Returns the five values as a list.
    """
    rows = list(results.values())
    averaged = [np.average([row[col] for row in rows]) for col in range(3)]
    totals = [np.sum([row[col] for row in rows]) for col in (3, 4)]
    return averaged + totals
def evaluate(splits, classifier, test_split='dev', sampling_rate=0.1, verbose=True):
    """
    Evaluate a classifier function on one split, relation by relation.

    Parameters
    ----------
    splits : dict
        Maps split names to `Dataset` instances; must contain
        `test_split` and 'all'.
    classifier : func
        Called with the list of `KBTriple` instances for one relation;
        must return the list of predicted bool labels for them.
    test_split : str
    sampling_rate : float
        Passed to `Dataset.build_dataset` for the negative instances.
    verbose : bool
        If True, print the per-relation and macro-averaged results.

    Returns
    -------
    float
        The macro-averaged F_0.5 score for the label True.

    """
    test_kbts_by_rel, true_labels_by_rel = splits[test_split].build_dataset(sampling_rate=sampling_rate)
    results = {}
    if verbose:
        print_statistics_header()
    for rel in splits['all'].kb.all_relations:
        pred_labels = classifier(test_kbts_by_rel[rel])
        # Pin the label order so that index 1 is always the True class.
        # Without `labels`, a relation for which only one label occurs
        # among the gold and predicted values yields length-1 arrays
        # and `stat[1]` raises IndexError.
        stats = precision_recall_fscore_support(
            true_labels_by_rel[rel], pred_labels, beta=0.5, labels=[False, True])
        stats = [stat[1] for stat in stats]  # stats[1] is the stat for label True
        stats.append(len(pred_labels))  # number of examples
        results[rel] = stats
        if verbose:
            print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic
def train_models(
        splits,
        featurizers,
        split_name='train',
        model_factory=(lambda: LogisticRegression(
            fit_intercept=True, solver='liblinear', random_state=42)),
        sampling_rate=0.1,
        vectorize=True,
        verbose=True):
    """
    Fit one binary model per relation on the split named `split_name`.

    The split's `build_dataset` and `featurize` methods supply the
    labeled instances and their features; `model_factory` is called
    once per relation listed in `splits['all'].kb.all_relations`, and
    each new model is fit on that relation's features and labels.
    `verbose` is accepted but not used here.

    Returns
    -------
    dict
        With keys 'featurizers', 'vectorizer', 'models' (relation name
        to fitted model), 'all_relations' and 'vectorize'.

    """
    dataset = splits[split_name]
    kbts_by_rel, labels_by_rel = dataset.build_dataset(sampling_rate=sampling_rate)
    feats_by_rel, vectorizer = dataset.featurize(
        kbts_by_rel, featurizers, vectorize=vectorize)
    models = {}
    for rel in splits['all'].kb.all_relations:
        model = model_factory()
        models[rel] = model
        model.fit(feats_by_rel[rel], labels_by_rel[rel])
    result = {}
    result['featurizers'] = featurizers
    result['vectorizer'] = vectorizer
    result['models'] = models
    result['all_relations'] = splits['all'].kb.all_relations
    result['vectorize'] = vectorize
    return result
def predict(splits, train_result, split_name='dev', sampling_rate=0.1, vectorize=True):
    """
    Apply the models in `train_result` to the split named `split_name`.

    The split's instances are built with `build_dataset` and featurized
    with the featurizers and vectorizer stored in `train_result`.

    Returns
    -------
    predictions, assess_y
        Two dicts keyed by relation name: the output of each model's
        `predict` method, and the gold labels from `build_dataset`.

    """
    dataset = splits[split_name]
    kbts_by_rel, gold_by_rel = dataset.build_dataset(sampling_rate=sampling_rate)
    feats_by_rel, _ = dataset.featurize(
        kbts_by_rel,
        featurizers=train_result['featurizers'],
        vectorizer=train_result['vectorizer'],
        vectorize=vectorize)
    predictions = {}
    for rel in train_result['all_relations']:
        model = train_result['models'][rel]
        predictions[rel] = model.predict(feats_by_rel[rel])
    return predictions, gold_by_rel
def evaluate_predictions(predictions, test_y, verbose=True):
    """
    Score predictions against gold labels, relation by relation.

    Parameters
    ----------
    predictions : dict
        Maps relation names to sequences of predicted bool labels.
    test_y : dict
        Maps relation names to the parallel gold bool labels.
    verbose : bool
        If True, print the per-relation and macro-averaged results.

    Returns
    -------
    float
        The macro-averaged F_0.5 score for the label True.

    """
    results = {}  # one result row for each relation
    if verbose:
        print_statistics_header()
    for rel, preds in predictions.items():
        # Pin the label order so that index 1 is always the True class.
        # Without `labels`, a relation for which only one label occurs
        # among the gold and predicted values yields length-1 arrays
        # and `stat[1]` raises IndexError.
        stats = precision_recall_fscore_support(
            test_y[rel], preds, beta=0.5, labels=[False, True])
        stats = [stat[1] for stat in stats]  # stats[1] is the stat for label True
        stats.append(len(test_y[rel]))
        results[rel] = stats
        if verbose:
            print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic
def experiment(
        splits,
        featurizers,
        train_split='train',
        test_split='dev',
        model_factory=(lambda: LogisticRegression(
            fit_intercept=True, solver='liblinear', random_state=42)),
        train_sampling_rate=0.1,
        test_sampling_rate=0.1,
        vectorize=True,
        verbose=True):
    """
    Train on `train_split`, predict on `test_split`, and report the
    evaluation via `evaluate_predictions`.

    Returns
    -------
    dict
        The result of `train_models`; the evaluation score itself is
        not returned.

    """
    trained = train_models(
        splits,
        featurizers=featurizers,
        split_name=train_split,
        model_factory=model_factory,
        sampling_rate=train_sampling_rate,
        vectorize=vectorize,
        verbose=verbose)
    predicted, gold = predict(
        splits,
        trained,
        split_name=test_split,
        sampling_rate=test_sampling_rate,
        vectorize=vectorize)
    evaluate_predictions(predicted, gold, verbose)
    return trained
def examine_model_weights(train_result, k=3, verbose=True):
    """
    Print, for every trained model, the `k` features with the highest
    weights and the `k` features with the lowest weights.

    Parameters
    ----------
    train_result : dict
        As returned by `train_models`; the keys 'vectorizer' and
        'models' are used. If the vectorizer is None, a message is
        printed and nothing else is done.
    k : int
        Number of features to show at each end of the ranking.
    verbose : bool
        Accepted but not used here.

    """
    vectorizer = train_result['vectorizer']
    if vectorizer is None:
        print("Model weights can be examined only if the featurizers "
              "are based in dicts (i.e., if `vectorize=True`).")
        return
    # `get_feature_names` was removed from scikit-learn in 1.2 in favor
    # of `get_feature_names_out`; fall back to the old name so that
    # older installations keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    for rel, model in train_result['models'].items():
        print('Highest and lowest feature weights for relation {}:\n'.format(rel))
        try:
            # Coefficients may be stored sparsely.
            coefs = model.coef_.toarray()
        except AttributeError:
            coefs = model.coef_
        sorted_weights = sorted(
            [(wgt, idx) for idx, wgt in enumerate(coefs[0])], reverse=True)
        for wgt, idx in sorted_weights[:k]:
            print('{:10.3f} {}'.format(wgt, feature_names[idx]))
        print('{:>10s} {}'.format('.....', '.....'))
        for wgt, idx in sorted_weights[-k:]:
            print('{:10.3f} {}'.format(wgt, feature_names[idx]))
        print()
def find_new_relation_instances(
        dataset,
        featurizers,
        train_split='train',
        test_split='dev',
        model_factory=(lambda: LogisticRegression(
            fit_intercept=True, solver='liblinear', random_state=42)),
        k=10,
        vectorize=True,
        verbose=True):
    """
    Train models and print, for each relation, the `k` entity pairs
    that are absent from the KB but receive the highest predicted
    probability of standing in that relation.

    Parameters
    ----------
    dataset : `Dataset`
        Split with `build_splits` using its default arguments.
    featurizers : list of func
    train_split : str
        Name of the split to train on.
    test_split : str
        Name of the split whose unrelated pairs are scored.
    model_factory : func
        Must build models that provide `predict_proba`.
    k : int
        Number of candidates to print per relation.
    vectorize : bool
    verbose : bool
        Passed through to `train_models`.

    """
    splits = dataset.build_splits()
    # train models
    train_result = train_models(
        splits,
        split_name=train_split,
        featurizers=featurizers,
        model_factory=model_factory,
        vectorize=vectorize,
        verbose=verbose)
    # Score every unrelated pair of the test split (no sampling, no
    # positives). The labels are all False and are not needed.
    test_dataset = splits[test_split]
    neg_o, _ = test_dataset.build_dataset(
        include_positive=False,
        sampling_rate=1.0)
    neg_X, _ = test_dataset.featurize(
        neg_o,
        featurizers=featurizers,
        vectorizer=train_result['vectorizer'],
        vectorize=vectorize)
    # Report highest confidence predictions:
    for rel, model in train_result['models'].items():
        print('Highest probability examples for relation {}:\n'.format(rel))
        probs = model.predict_proba(neg_X[rel])
        probs = [prob[1] for prob in probs]  # probability for class True
        sorted_probs = sorted(
            [(p, idx) for idx, p in enumerate(probs)], reverse=True)
        for p, idx in sorted_probs[:k]:
            print('{:10.3f} {}'.format(p, neg_o[rel][idx]))
        print()
def bake_off_experiment(train_result, rel_ext_data_home, verbose=True):
    """
    Evaluate trained models on the held-out test data.

    Loads "corpus-test.tsv.gz" and "kb-test.tsv.gz" from the directory
    `rel_ext_data_home`, builds and featurizes the test dataset with
    the featurizers, vectorizer and `vectorize` setting stored in
    `train_result`, and reports the scores via `evaluate_predictions`.
    Nothing is returned.
    """
    corpus_path = os.path.join(rel_ext_data_home, "corpus-test.tsv.gz")
    kb_path = os.path.join(rel_ext_data_home, "kb-test.tsv.gz")
    held_out = Dataset(Corpus(corpus_path), KB(kb_path))
    kbts_by_rel, gold_by_rel = held_out.build_dataset()
    feats_by_rel, _ = held_out.featurize(
        kbts_by_rel,
        featurizers=train_result['featurizers'],
        vectorizer=train_result['vectorizer'],
        vectorize=train_result['vectorize'])
    predictions = {}
    for rel in train_result['all_relations']:
        model = train_result['models'][rel]
        predictions[rel] = model.predict(feats_by_rel[rel])
    evaluate_predictions(predictions, gold_by_rel, verbose=verbose)