
Commit 5f9f078

Merge pull request #7 from wellcometrust/feature/ivyleavedtoadflax/parsing_ref_spans_to_token_spans
Allow ref_to_token_annotations to handle more general cases
2 parents 3a8afa3 + ab6fd20

2 files changed: +167, −162 lines


deep_reference_parser/prodigy/reference_to_token_annotations.py

Lines changed: 136 additions & 55 deletions
@@ -10,31 +10,54 @@
 
 
 class TokenTagger:
-    """
-    Converts data in prodigy format with full reference spans to per-token spans
+    def __init__(self, task="splitting", lowercase=True):
+        """
+        Converts data in prodigy format with full reference spans to per-token
+        spans
 
-    Expects one of four lables for the spans:
+        Args:
+            task (str): One of ["parsing", "splitting"]. See below for further
+                explanation.
+            lowercase (bool): Automatically convert upper case annotations to
+                lowercase under the parsing scenario.
 
-    * BE: A complete reference
-    * BI: A frgament of reference that captures the beginning but not the end
-    * IE: A frgament of reference that captures the end but not the beginning
-    * II: A fragment of a reference that captures neither the beginning nor the
-    end .
-    """
+        Since the parsing, splitting, and classification tasks have quite
+        different labelling requirements, this class behaves differently
+        depending on which task is specified in the task argument.
+
+        For splitting:
 
-    def __init__(self):
+            Expects one of four labels for the spans:
+
+            * BE: A complete reference
+            * BI: A fragment of a reference that captures the beginning but not the end
+            * IE: A fragment of a reference that captures the end but not the beginning
+            * II: A fragment of a reference that captures neither the beginning
+              nor the end.
+
+            Depending on which label is applied, the tokens within the span will
+            be labelled differently as one of ["b-r", "i-r", "e-r", "o"].
+
+        For parsing:
+
+            Expects any arbitrary label for spans. All tokens within that span
+            will be labelled with the same label.
+
+        """
 
         self.out = []
+        self.task = task
+        self.lowercase = lowercase
 
     def tag_doc(self, doc):
         """
-        Tags a document with the appropriate labels
+        Tags a document with appropriate labels for the specified task
 
        Args:
            doc(dict): A single document in prodigy dict format to be labelled.
        """
 
-        bie_spans = self.reference_spans(doc["spans"], doc["tokens"])
+        bie_spans = self.reference_spans(doc["spans"], doc["tokens"], task=self.task)
         o_spans = self.outside_spans(bie_spans, doc["tokens"])
 
         # Flatten into one list.
@@ -43,7 +66,7 @@ def tag_doc(self, doc):
 
         # Sort by token id to ensure it is ordered.
 
-        spans = sorted(spans, key=lambda k: k['token_start'])
+        spans = sorted(spans, key=lambda k: k["token_start"])
 
         doc["spans"] = spans
 
@@ -63,42 +86,54 @@ def run(self, docs):
 
         return self.out
 
-    def reference_spans(self, spans, tokens):
+    def reference_spans(self, spans, tokens, task):
         """
         Given a whole reference span as labelled in prodigy, break this into
         appropriate single token spans depending on the label that was applied to
         the whole reference span.
         """
         split_spans = []
 
-        for span in spans:
-            if span["label"] in ["BE", "be"]:
+        if task == "splitting":
 
-                split_spans.extend(
-                    self.split_long_span(tokens, span, "b-r", "e-r")
-                )
+            for span in spans:
+                if span["label"] in ["BE", "be"]:
 
-            elif span["label"] in ["BI", "bi"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "b-r", "e-r", "i-r")
+                    )
 
-                split_spans.extend(
-                    self.split_long_span(tokens, span, "b-r", "i-r")
-                )
+                elif span["label"] in ["BI", "bi"]:
 
-            elif span["label"] in ["IE", "ie"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "b-r", "i-r", "i-r")
+                    )
 
-                split_spans.extend(
-                    self.split_long_span(tokens, span, "i-r", "e-r")
-                )
+                elif span["label"] in ["IE", "ie"]:
+
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "i-r", "e-r", "i-r")
+                    )
+
+                elif span["label"] in ["II", "ii"]:
 
-            elif span["label"] in ["II", "ii"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "i-r", "i-r", "i-r")
+                    )
 
+        elif task == "parsing":
+
+            for span in spans:
+                if self.lowercase:
+                    label = span["label"].lower()
+                else:
+                    label = span["label"]
                 split_spans.extend(
-                    self.split_long_span(tokens, span, "i-r", "i-r")
+                    self.split_long_span(tokens, span, label, label, label)
                 )
 
         return split_spans
 
-
     def outside_spans(self, spans, tokens):
         """
         Label tokens with `o` if they are outside a reference
@@ -125,7 +160,6 @@ def outside_spans(self, spans, tokens):
 
         return outside_spans
 
-
     def create_span(self, tokens, index, label):
         """
         Given a list of tokens, (in prodigy format) and an index relating to one of
@@ -145,60 +179,107 @@ def create_span(self, tokens, index, label):
 
         return span
 
-
-    def split_long_span(self, tokens, span, start_label, end_label):
+    def split_long_span(self, tokens, span, start_label, end_label, inside_label):
         """
-        Split a milti-token span into `n` spans of lengh `1`, where `n=len(tokens)`
+        Split a multi-token span into `n` spans of length `1`, where `n=len(tokens)`
         """
 
         spans = []
         spans.append(self.create_span(tokens, span["token_start"], start_label))
         spans.append(self.create_span(tokens, span["token_end"], end_label))
 
         for index in range(span["token_start"] + 1, span["token_end"]):
-            spans.append(self.create_span(tokens, index, "i-r"))
+            spans.append(self.create_span(tokens, index, inside_label))
 
-        spans = sorted(spans, key=lambda k: k['token_start'])
+        spans = sorted(spans, key=lambda k: k["token_start"])
 
         return spans
 
+
 @plac.annotations(
     input_file=(
         "Path to jsonl file containing chunks of references in prodigy format.",
         "positional",
         None,
-        str
+        str,
     ),
     output_file=(
         "Path to jsonl file into which fully annotated files will be saved.",
         "positional",
         None,
-        str
-    )
+        str,
+    ),
+    task=(
+        "Which task is being performed. Either splitting or parsing.",
+        "positional",
+        None,
+        str,
+    ),
+    lowercase=(
+        "Convert UPPER case reference labels to lower case token labels?",
+        "flag",
+        "f",
+        bool,
+    ),
 )
-
-def reference_to_token_annotations(input_file, output_file):
-    """ Converts a file output by prodigy (using prodigy db-out) from
-    references level annotations to individual level annotations. The rationale
-    for this is that reference level annotations are much easier for humans to
-    do, but not useful when training a token level model.
-
-    This function is predominantly useful fot tagging reference spans, but may
-    also have a function with other references annotations.
+def reference_to_token_annotations(
+    input_file, output_file, task="splitting", lowercase=False
+):
+    """
+    Creates a span for every token from existing multi-token spans
+
+    Converts a jsonl file output by prodigy (using prodigy db-out) with spans
+    extending over more than a single token to individual token level spans.
+
+    The rationale for this is that reference level annotations are much easier
+    for humans to do, but not useful when training a token level model.
+
+    This command functions in two ways:
+
+    * task=splitting: For the splitting task, where we are interested in
+      labelling the beginning (b-r) and end (e-r) of references, reference
+      spans are labelled with one of BE, BI, IE, II. These are then converted
+      to token level spans b-r, i-r, e-r, and o. Symbolically:
+        * BE: [BE, BE, BE] becomes [b-r][i-r][e-r]
+        * BI: [BI, BI, BI] becomes [b-r][i-r][i-r]
+        * IE: [IE, IE, IE] becomes [i-r][i-r][e-r]
+        * II: [II, II, II] becomes [i-r][i-r][i-r]
+        * All other tokens become [o]
+
+    * task=parsing: For the parsing task, multi-token annotations are much
+      simpler and would tend to be just 'author' or 'title'. These simple
+      labels can be applied directly to the individual tokens contained within
+      these multi-token spans; for each token in the multi-token span, a span
+      is created with the same label. Symbolically:
+        * [author author author] becomes [author][author][author]
    """
 
-    partially_annotated = read_jsonl(input_file)
+    ref_annotated_docs = read_jsonl(input_file)
 
     # Only run the tagger on annotated examples.
 
-    partially_annotated = [doc for doc in partially_annotated if doc.get("spans")]
+    not_annotated_docs = [doc for doc in ref_annotated_docs if not doc.get("spans")]
+    ref_annotated_docs = [doc for doc in ref_annotated_docs if doc.get("spans")]
 
-    logger.info("Loaded %s documents with reference annotations", len(partially_annotated))
+    logger.info(
+        "Loaded %s documents with reference annotations", len(ref_annotated_docs)
+    )
+    logger.info(
+        "Loaded %s documents with no reference annotations", len(not_annotated_docs)
+    )
 
-    annotator = TokenTagger(partially_annotated)
+    annotator = TokenTagger(task=task, lowercase=lowercase)
 
-    fully_annotated = annotator.run()
+    token_annotated_docs = annotator.run(ref_annotated_docs)
+    all_docs = token_annotated_docs + not_annotated_docs
 
-    write_jsonl(fully_annotated, output_file=output_file)
+    write_jsonl(all_docs, output_file=output_file)
 
-    logger.info("Fully annotated references written to %s", output_file)
+    logger.info(
+        "Wrote %s docs with token annotations to %s",
+        len(token_annotated_docs),
+        output_file,
    )
+    logger.info(
+        "Wrote %s docs with no annotations to %s", len(not_annotated_docs), output_file
+    )
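To make the splitting rules concrete, here is a minimal sketch of driving the new TokenTagger directly. The token and span records below are illustrative prodigy-style dicts, not taken from the repo's fixtures, and it assumes run() returns the docs as tag_doc() leaves them:

    from deep_reference_parser.prodigy.reference_to_token_annotations import TokenTagger

    # A single doc whose one multi-token span is labelled BI: the annotator saw
    # the beginning of a reference, but its end was cut off by the chunking.
    doc = {
        "text": "Smith J. 2009",
        "tokens": [
            {"text": "Smith", "start": 0, "end": 5, "id": 0},
            {"text": "J.", "start": 6, "end": 8, "id": 1},
            {"text": "2009", "start": 9, "end": 13, "id": 2},
        ],
        "spans": [
            {"start": 0, "end": 13, "token_start": 0, "token_end": 2, "label": "BI"}
        ],
    }

    tagger = TokenTagger(task="splitting")
    tagged = tagger.run([doc])

    # Per the BI rule, the three tokens come out as b-r, i-r, i-r; any token
    # not covered by a reference span would have been labelled o instead.
    print([span["label"] for span in tagged[0]["spans"]])  # ['b-r', 'i-r', 'i-r']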

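A matching sketch for the parsing task, reusing the doc and import from the previous example. The span label is now arbitrary, and because lowercase defaults to True in the constructor, an upper case annotation label comes out as a lower case token label:

    tagger = TokenTagger(task="parsing")

    doc["spans"] = [
        {"start": 0, "end": 13, "token_start": 0, "token_end": 2, "label": "AUTHOR"}
    ]
    tagged = tagger.run([doc])

    print([span["label"] for span in tagged[0]["spans"]])  # ['author', 'author', 'author']

The plac decorator exposes the same choices on the command line as positional arguments plus the -f flag for lowercase, but the function can equally be called in-process (file names here are placeholders):

    reference_to_token_annotations("refs.jsonl", "tokens.jsonl", task="parsing", lowercase=True)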