forked from facebookresearch/pytext
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SQuAD data source (facebookresearch#382)
Summary: Pull Request resolved: facebookresearch#382 Data source specifically for the SQuAD 2.0 dataset. Reviewed By: kartikayk Differential Revision: D14072144 fbshipit-source-id: b82470bc41d0ae918ae65ffcb3fa97ba653c2732
- Loading branch information
1 parent
931cc84
commit c669f8a
Showing
1 changed file
with
83 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
#!/usr/bin/env python3 | ||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved | ||
import json | ||
from typing import List, Optional | ||
|
||
from pytext.data.sources.data_source import DataSource, generator_property | ||
|
||
|
||
def unflatten(fname, ignore_impossible):
    """Yield one flat example dict per question in a SQuAD-format JSON file.

    The nested ``data -> paragraphs -> qas`` layout is flattened so that each
    question becomes its own row carrying a copy of its paragraph's context.

    Args:
        fname: Path to the JSON dump. A falsy value (``None`` or ``""``)
            produces no examples.
        ignore_impossible: When true, questions flagged ``is_impossible`` are
            skipped. When false they are emitted with their
            ``plausible_answers`` (if any) and ``label`` set to ``False``.

    Yields:
        dict with keys ``context`` (str), ``question`` (str), ``answers``
        (list of answer texts), ``answer_starts`` (list of int offsets) and
        ``label`` (bool, ``True`` when the question is answerable).
    """
    if not fname:
        return
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which breaks on non-ASCII contexts on some platforms.
    with open(fname, encoding="utf-8") as file:
        dump = json.load(file)
    # The dump is fully loaded, so the file handle is released before
    # iteration starts instead of staying open while the generator is alive.
    for article in dump["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for question in paragraph["qas"]:
                # SQuAD 1.1-style entries carry no "is_impossible" flag; every
                # question there is answerable.
                label = not question.get("is_impossible", False)
                if ignore_impossible and not label:
                    continue
                if label:
                    answers = question["answers"]
                else:
                    # Tolerate impossible questions without plausible answers.
                    answers = question.get("plausible_answers", [])
                yield {
                    "context": context,
                    "question": question["question"],
                    "answers": [answer["text"] for answer in answers],
                    "answer_starts": [int(ans["answer_start"]) for ans in answers],
                    "label": label,
                }
|
||
|
||
class SquadDataSource(DataSource):
    """Data source over SQuAD JSON dumps.

    Download data from https://rajpurkar.github.io/SQuAD-explorer/

    The ``train``, ``test`` and ``eval`` properties each hand the matching
    filename, together with ``ignore_impossible``, to ``unflatten`` and return
    what it produces: one dict per question with the keys ``context``,
    ``question``, ``answers``, ``answer_starts`` and ``label``.
    """

    class Config(DataSource.Config):
        # Default paths are the SQuAD 2.0 file names; test and eval both
        # default to the dev file.
        train_filename: Optional[str] = "train-v2.0.json"
        test_filename: Optional[str] = "dev-v2.0.json"
        eval_filename: Optional[str] = "dev-v2.0.json"
        # Passed through to unflatten as its ignore_impossible argument.
        ignore_impossible: bool = True

    @classmethod
    def from_config(cls, config: Config, schema=None):
        """Build an instance from ``config``.

        ``schema`` is accepted but not used; the schema is fixed in
        ``__init__``.
        """
        init_args = (
            config.train_filename,
            config.test_filename,
            config.eval_filename,
            config.ignore_impossible,
        )
        return cls(*init_args)

    def __init__(
        self,
        train_filename=None,
        test_filename=None,
        eval_filename=None,
        ignore_impossible=Config.ignore_impossible,
    ):
        """Store the split filenames and register the column schema.

        Args:
            train_filename: Path handed to ``unflatten`` for ``train``.
            test_filename: Path handed to ``unflatten`` for ``test``.
            eval_filename: Path handed to ``unflatten`` for ``eval``.
            ignore_impossible: Forwarded to ``unflatten`` for every split.
        """
        # NOTE(review): "answer_ends" is declared here but unflatten emits no
        # such key, and "label" is declared str while unflatten yields a bool.
        # Confirm what downstream consumers of the schema expect.
        super().__init__(
            {
                "context": str,
                "question": str,
                "answers": List[str],
                "answer_starts": List[int],
                "answer_ends": List[int],
                "label": str,
            }
        )
        self.train_filename = train_filename
        self.test_filename = test_filename
        self.eval_filename = eval_filename
        self.ignore_impossible = ignore_impossible

    def _examples(self, filename):
        """Return the ``unflatten`` generator for ``filename``."""
        return unflatten(filename, self.ignore_impossible)

    @generator_property
    def train(self):
        """Examples read from ``train_filename``."""
        return self._examples(self.train_filename)

    @generator_property
    def test(self):
        """Examples read from ``test_filename``."""
        return self._examples(self.test_filename)

    @generator_property
    def eval(self):
        """Examples read from ``eval_filename``."""
        return self._examples(self.eval_filename)