-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: drop_reader.py
158 lines (136 loc) · 6.18 KB
/
drop_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import logging
from collections import defaultdict
from typing import Iterable, List, Optional, Tuple

import datasets
from overrides import overrides

from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import (
    MetadataField,
    TextField,
)
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

from attribution.huggingface_readers import HuggingfaceReaderRankClassification
logger = logging.getLogger(__name__)
# Full reader for DROP that passes the query id through to each instance.
@DatasetReader.register("drop_reader")
class DROPReader(DatasetReader):
    """Dataset reader for DROP that preserves each example's query id.

    Loads DROP from the HuggingFace hub and deterministically re-derives
    train/validation/test splits (seed 42) so that a held-out validation set
    of ``val_size`` examples always exists, regardless of how large the
    dataset's original validation split is.
    """

    def __init__(
        self,
        model_name: str = "google/t5-small-lm-adapt",
        max_query_length: int = 512,
        split_name: str = "train",
        val_size: int = 1000,
        return_original_instance: bool = False,
        **kwargs,
    ) -> None:
        """
        # Parameters

        model_name : `str`
            Pretrained transformer used for tokenization and token indexing.
        max_query_length : `int`
            Maximum token length for both inputs and targets; longer
            sequences are truncated (counted in ``self._stats``).
        split_name : `str`
            One of ``"train"``, ``"validation"``, or ``"test"``.
        val_size : `int`
            Threshold below which a validation set counts as "small", and
            also the number of examples carved out for the new validation set.
        return_original_instance : `bool`
            If True, also store the untokenized input text in each instance.
        """
        super().__init__(
            manual_distributed_sharding=True,
            manual_multiprocess_sharding=True,
            **kwargs,
        )
        dataset_name, subset_name = self.get_dataset_name()
        self._dataset_name = dataset_name
        self._subset_name = subset_name
        self.return_original_instance = return_original_instance
        original_val_set = datasets.load_dataset(
            dataset_name, subset_name, split="validation"
        )
        small_val_size = val_size  # validation sets under this size count as small
        val_split_size = val_size  # when splitting out val, carve out this many examples
        seed = 42  # fixed seed so every run reproduces the same carve-out
        if split_name == "train":
            if len(original_val_set) >= small_val_size:
                self._dataset = datasets.load_dataset(
                    dataset_name, subset_name, split="train"
                )
            else:
                # For small val sets, split the new val out of train and keep
                # the old val as test. Some splits (e.g. casehold) are
                # specially designed, so keep them as-is rather than just
                # splitting the val set in half.
                self._dataset = datasets.load_dataset(
                    dataset_name, subset_name, split="train"
                ).train_test_split(test_size=val_split_size, seed=seed)["train"]
        elif split_name == "validation":
            if len(original_val_set) >= small_val_size:
                # For large val sets, just split the new val out of val.
                self._dataset = original_val_set.train_test_split(
                    train_size=val_split_size, seed=seed
                )["train"]
            else:
                # For small val sets, split the new val out of train (old val becomes test).
                self._dataset = datasets.load_dataset(
                    dataset_name, subset_name, split="train"
                ).train_test_split(test_size=val_split_size, seed=seed)["test"]
        elif split_name == "test":
            if len(original_val_set) >= small_val_size:
                # For large val sets, test is the remainder of the val split.
                self._dataset = original_val_set.train_test_split(
                    train_size=val_split_size, seed=seed
                )["test"]
            else:
                # For small val sets, the new val came from train, so the
                # dataset's original validation split becomes the test set.
                self._dataset = datasets.load_dataset(
                    dataset_name, subset_name, split="validation"
                )
        else:
            # Previously an unknown split silently left self._dataset unset,
            # deferring the failure to an opaque AttributeError in _read();
            # fail fast with a clear message instead.
            raise ValueError(f"Unknown split_name: {split_name}")
        self._transformer_model_name = model_name
        self._tokenizer = PretrainedTransformerTokenizer(model_name)
        self._token_indexers = {"tokens": PretrainedTransformerIndexer(model_name)}
        self._max_query_length = max_query_length
        self._stats = defaultdict(int)

    def get_dataset_name(self) -> Tuple[str, Optional[str]]:
        """Return the HuggingFace hub dataset name and (absent) subset name."""
        return "drop", None

    def hf_to_instance(self, instance) -> List[List[str]]:
        # Convert one HF DROP example into [input_text, query_id, answer]
        # triples, using the GPT-3 DROP prompt format. Only the first gold
        # answer span is kept as the target.
        # (Annotation fixed: the original declared `Tuple[str, str]` but the
        # method has always returned a list of triples.)
        prompt = (
            f"Passage: {instance['passage']}\nQuestion: {instance['question']}\nAnswer:"
        )
        answer = instance["answers_spans"]["spans"][0]
        return [[prompt, instance["query_id"], answer]]

    @overrides
    def _read(self, file_path) -> Iterable[Instance]:
        # `file_path` is ignored; the split was selected in __init__.
        for sample in self._dataset:
            for inputs, qid, targets in self.hf_to_instance(sample):
                yield self.text_to_instance(inputs, qid, targets)

    def text_to_instance(
        self,
        input_text: str,
        query_id: str,
        target: str,
    ) -> Instance:
        """Tokenize input and target (truncating both to max_query_length) and build an Instance."""
        fields = {}
        tokenized_input = self._tokenizer.tokenize(input_text)
        if len(tokenized_input) > self._max_query_length:
            self._stats["Truncated inputs"] += 1
            tokenized_input = tokenized_input[: self._max_query_length]
        fields["prompt_and_input"] = TextField(tokenized_input)
        if self.return_original_instance:
            # NOTE(review): stored as a raw string, not a Field — confirm
            # downstream consumers expect this.
            fields["pretokenized_input"] = input_text
        tokenized_target = self._tokenizer.tokenize(target)
        if len(tokenized_target) > self._max_query_length:
            self._stats["Truncated targets"] += 1
            tokenized_target = tokenized_target[: self._max_query_length]
        fields["target"] = TextField(tokenized_target)
        fields["query_id"] = MetadataField(query_id)
        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        # Indexers are attached after construction so instances stay picklable
        # for multiprocess data loading (AllenNLP convention).
        instance.fields["prompt_and_input"].token_indexers = self._token_indexers
        instance.fields["target"].token_indexers = self._token_indexers
# DROP reader that aligns with the other rank-classification formats for
# multitask training. NOTE: don't use this for eval!
@DatasetReader.register("multi_task_drop_reader")
class DropMReader(HuggingfaceReaderRankClassification):
    """DROP reader emitting the shared rank-classification format for multitask use."""

    def get_dataset_name(self) -> Tuple[str, Optional[str]]:
        """Return the HuggingFace hub dataset name and (absent) subset name."""
        return "drop", None

    def hf_to_instance(self, instance) -> List[List]:
        # Convert one HF DROP example into [prompt, answer_options,
        # correct_index]; index 0 marks the single option as correct.
        # (Annotation fixed: the original declared `Tuple[str, str]` but the
        # method returns a list of such triples.)
        prompt = (
            f"Passage: {instance['passage']}\nQuestion: {instance['question']}\nAnswer:"
        )
        # Only the first gold answer span is used.
        answer = [instance["answers_spans"]["spans"][0]]
        return [[prompt, answer, 0]]