refactored the joint q2ar evaluation script

rowanz · Feb 14, 2019 · ae532f6 · ae532f6
1 parent 73a2408
commit ae532f6
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 86 deletions.
diff --git a/eval_all.py b/eval_all.py
diff --git a/models/eval_q2ar.py b/models/eval_q2ar.py
@@ -0,0 +1,64 @@
+"""
+You can use this script to evaluate prediction files (valpreds.npy). Essentially this is needed if you want to, say,
+combine answer and rationale predictions.
+"""
+
+import numpy as np
+import json
+import os
+from config import VCR_ANNOTS_DIR
+import argparse
+
+parser = argparse.ArgumentParser(description='Evaluate question -> answer and rationale')
+parser.add_argument(
+    '-answer_preds',
+    dest='answer_preds',
+    default='saves/flagship_answer/valpreds.npy',
+    help='Location of question->answer predictions',
+    type=str,
+)
+parser.add_argument(
+    '-rationale_preds',
+    dest='rationale_preds',
+    default='saves/flagship_rationale/valpreds.npy',
+    help='Location of question+answer->rationale predictions',
+    type=str,
+)
+parser.add_argument(
+    '-split',
+    dest='split',
+    default='val',
+    help='Split you\'re using. Probably you want val.',
+    type=str,
+)
+
+args = parser.parse_args()
+
+answer_preds = np.load(args.answer_preds)
+rationale_preds = np.load(args.rationale_preds)
+
+rationale_labels = []
+answer_labels = []
+
+with open(os.path.join(VCR_ANNOTS_DIR, '{}.jsonl'.format(args.split)), 'r') as f:
+    for l in f:
+        item = json.loads(l)
+        answer_labels.append(item['answer_label'])
+        rationale_labels.append(item['rationale_label'])
+
+answer_labels = np.array(answer_labels)
+rationale_labels = np.array(rationale_labels)
+
+# Sanity checks
+assert answer_preds.shape[0] == answer_labels.size
+assert rationale_preds.shape[0] == answer_labels.size
+assert answer_preds.shape[1] == 4
+assert rationale_preds.shape[1] == 4
+
+answer_hits = answer_preds.argmax(1) == answer_labels
+rationale_hits = rationale_preds.argmax(1) == rationale_labels
+joint_hits = answer_hits & rationale_hits
+
+print("Answer acc:    {:.3f}".format(np.mean(answer_hits)), flush=True)
+print("Rationale acc: {:.3f}".format(np.mean(rationale_hits)), flush=True)
+print("Joint acc:     {:.3f}".format(np.mean(answer_hits & rationale_hits)), flush=True)