Commit dcea5e4

Merge pull request #8 from wellcometrust/feature/ivyleavedtoadflax/prodigy_to_tsv
Fix issues with references to spans
2 parents 5f9f078 + 42f270c commit dcea5e4

19 files changed: +1487 -342 lines changed

deep_reference_parser/prodigy/prodigy_to_tsv.py

Lines changed: 33 additions & 5 deletions
@@ -12,17 +12,20 @@
 import numpy as np
 import plac
 
-from ..io import read_jsonl
+from wasabi import Printer
 
+from ..io import read_jsonl
 from ..logger import logger
 
+msg = Printer()
+
 
 class TokenLabelPairs:
     """
     Convert prodigy format docs or list of lists into tuples of (token, label).
     """
 
-    def __init__(self, line_limit=73, respect_line_endings=True, respect_doc_endings=True):
+    def __init__(self, line_limit=250, respect_line_endings=False, respect_doc_endings=True):
         """
         Args:
             line_limit(int): Maximum number of tokens allowed per training
@@ -191,19 +194,45 @@ def yield_token_label_pair(self, doc, lists=False):
         "positional",
         None,
         str
+    ),
+    respect_lines=(
+        "Respect line endings? Or parse entire document in a single string?",
+        "flag",
+        "r",
+        bool
+    ),
+    respect_docs=(
+        "Respect doc endings or parse corpus in single string?",
+        "flag",
+        "d",
+        bool
+    ),
+    line_limit=(
+        "Number of characters to include on a line",
+        "option",
+        "l",
+        int
     )
 )
-def prodigy_to_tsv(input_file, output_file):
+def prodigy_to_tsv(input_file, output_file, respect_lines, respect_docs, line_limit=250):
     """
     Convert token annotated jsonl to token annotated tsv ready for use in the
     Rodrigues model.
     """
 
+    msg.info(f"Respect line endings: {respect_lines}")
+    msg.info(f"Respect doc endings: {respect_docs}")
+    msg.info(f"Line limit: {line_limit}")
+
     annotated_data = read_jsonl(input_file)
 
     logger.info("Loaded %s prodigy docs", len(annotated_data))
 
-    tlp = TokenLabelPairs()
+    tlp = TokenLabelPairs(
+        respect_doc_endings=respect_docs,
+        respect_line_endings=respect_lines,
+        line_limit=line_limit
+    )
     token_label_pairs = list(tlp.run(annotated_data))
 
     with open(output_file, 'w') as fb:
@@ -214,4 +243,3 @@ def prodigy_to_tsv(input_file, output_file):
 
     logger.info("Wrote %s token/label pairs to %s", len(token_label_pairs),
         output_file)
-
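With these changes, prodigy_to_tsv gains two flags (-r, -d) and one option (-l) on top of the two positional file paths. Below is a minimal sketch of driving the command programmatically through plac, assuming the function is importable under the module path shown above; the input and output file names are hypothetical placeholders.

# Sketch only: invokes the reworked prodigy_to_tsv via plac with the new
# flags. "annotations.jsonl" and "out.tsv" are hypothetical file names.
import plac

from deep_reference_parser.prodigy.prodigy_to_tsv import prodigy_to_tsv

if __name__ == "__main__":
    # Equivalent to passing: annotations.jsonl out.tsv -r -d -l 250
    plac.call(prodigy_to_tsv, ["annotations.jsonl", "out.tsv", "-r", "-d", "-l", "250"])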

deep_reference_parser/prodigy/reference_to_token_annotations.py

Lines changed: 32 additions & 9 deletions
@@ -10,7 +10,7 @@
 
 
 class TokenTagger:
-    def __init__(self, task="splitting", lowercase=True):
+    def __init__(self, task="splitting", lowercase=True, text=True):
         """
         Converts data in prodigy format with full reference spans to per-token
         spans
@@ -20,6 +20,8 @@ def __init__(self, task="splitting", lowercase=True):
             explanation.
             lowercase (bool): Automatically convert upper case annotations to
                 lowercase under the parsing scenario.
+            text (bool): Include the token text in the output span (very useful
+                for debugging).
 
         Since the parsing, splitting, and classification tasks have quite
         different labelling requirements, this class behaves differently
@@ -48,6 +50,7 @@ def __init__(self, task="splitting", lowercase=True):
         self.out = []
         self.task = task
         self.lowercase = lowercase
+        self.text = text
 
     def tag_doc(self, doc):
         """
@@ -177,19 +180,33 @@ def create_span(self, tokens, index, label):
             "label": label,
         }
 
+        if self.text:
+            span["text"] = token["text"]
+
         return span
 
     def split_long_span(self, tokens, span, start_label, end_label, inside_label):
         """
         Split a multi-token span into `n` spans of lengh `1`, where `n=len(tokens)`
         """
-
         spans = []
-        spans.append(self.create_span(tokens, span["token_start"], start_label))
-        spans.append(self.create_span(tokens, span["token_end"], end_label))
+        start = span["token_start"]
+        end = span["token_end"]
+
+        span_size = end - start
+
+        # Case when there is only one token in the span
+        if span_size == 0:
+            spans.append(self.create_span(tokens, start, start_label))
+        # Case when there are two or more tokens in the span
+        else:
+            spans.append(self.create_span(tokens, start, start_label))
+            spans.append(self.create_span(tokens, end, end_label))
 
-        for index in range(span["token_start"] + 1, span["token_end"]):
-            spans.append(self.create_span(tokens, index, inside_label))
+        if span_size > 1:
+
+            for index in range(start + 1, end):
+                spans.append(self.create_span(tokens, index, inside_label))
 
         spans = sorted(spans, key=lambda k: k["token_start"])
 
@@ -221,9 +238,15 @@ def split_long_span(self, tokens, span, start_label, end_label, inside_label):
         "f",
         bool,
     ),
+    text=(
+        "Output the token text in the span (useful for debugging).",
+        "flag",
+        "t",
+        bool,
+    ),
 )
 def reference_to_token_annotations(
-    input_file, output_file, task="splitting", lowercase=False
+    input_file, output_file, task="splitting", lowercase=False, text=False
 ):
     """
     Creates a span for every token from existing multi-token spans
@@ -268,10 +291,10 @@ def reference_to_token_annotations(
         "Loaded %s documents with no reference annotations", len(not_annotated_docs)
     )
 
-    annotator = TokenTagger(task=task, lowercase=lowercase)
+    annotator = TokenTagger(task=task, lowercase=lowercase, text=text)
 
     token_annotated_docs = annotator.run(ref_annotated_docs)
-    all_docs = token_annotated_docs + token_annotated_docs
+    all_docs = token_annotated_docs + not_annotated_docs
 
     write_jsonl(all_docs, output_file=output_file)
 
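The substantive fixes in this file are in split_long_span, which previously emitted both a start and an end span for a single-token reference, and in the final concatenation, which used to add token_annotated_docs to itself rather than to not_annotated_docs. The following standalone sketch mirrors the new branching, with hypothetical label names ("b-r", "e-r", "i-r"), to show what one-token and multi-token spans now produce.

# Illustrative sketch of the branching added to split_long_span above.
# The label names passed in below are hypothetical placeholders.
def split_span(span, start_label, end_label, inside_label):
    spans = []
    start, end = span["token_start"], span["token_end"]
    span_size = end - start

    if span_size == 0:
        # One-token span: a single span carrying the start label.
        spans.append({"token_start": start, "label": start_label})
    else:
        # Two or more tokens: one start span and one end span.
        spans.append({"token_start": start, "label": start_label})
        spans.append({"token_start": end, "label": end_label})

    if span_size > 1:
        # Everything strictly between start and end gets the inside label.
        for index in range(start + 1, end):
            spans.append({"token_start": index, "label": inside_label})

    return sorted(spans, key=lambda k: k["token_start"])


print(split_span({"token_start": 4, "token_end": 4}, "b-r", "e-r", "i-r"))
# [{'token_start': 4, 'label': 'b-r'}]
print(split_span({"token_start": 4, "token_end": 6}, "b-r", "e-r", "i-r"))
# [{'token_start': 4, 'label': 'b-r'}, {'token_start': 5, 'label': 'i-r'},
#  {'token_start': 6, 'label': 'e-r'}]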

tests/common.py

Lines changed: 7 additions & 9 deletions
@@ -5,13 +5,11 @@
 
 
 def get_path(p):
-    return os.path.join(
-        os.path.dirname(__file__),
-        p
-    )
+    return os.path.join(os.path.dirname(__file__), p)
 
-TEST_CFG = get_path('test_data/test_config.ini')
-TEST_JSONL = get_path('test_data/test_jsonl.jsonl')
-TEST_REFERENCES = get_path('test_data/test_references.txt')
-TEST_TSV_PREDICT = get_path('test_data/test_tsv_predict.tsv')
-TEST_TSV_TRAIN = get_path('test_data/test_tsv_train.tsv')
+
+TEST_CFG = get_path("test_data/test_config.ini")
+TEST_JSONL = get_path("test_data/test_jsonl.jsonl")
+TEST_REFERENCES = get_path("test_data/test_references.txt")
+TEST_TSV_PREDICT = get_path("test_data/test_tsv_predict.tsv")
+TEST_TSV_TRAIN = get_path("test_data/test_tsv_train.tsv")

tests/prodigy/common.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+import os
+
+
+def get_path(p):
+    return os.path.join(os.path.dirname(__file__), p)
+
+
+TEST_TOKENS = get_path("test_data/test_tokens_to_tsv_tokens.jsonl")
+TEST_SPANS = get_path("test_data/test_tokens_to_tsv_spans.jsonl")
+TEST_REF_TOKENS = get_path("test_data/test_reference_to_token_tokens.jsonl")
+TEST_REF_SPANS = get_path("test_data/test_reference_to_token_spans.jsonl")
+TEST_REF_EXPECTED_SPANS = get_path("test_data/test_reference_to_token_expected.jsonl")
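These constants point at the new prodigy test fixtures. A hedged sketch of how a test module might consume them, assuming read_jsonl is importable from deep_reference_parser.io (as in the first diff) and that the tests directory is importable as a package; the test itself is illustrative only.

# Sketch only: loads two of the new fixture files and checks they are
# non-empty. Assumes deep_reference_parser.io.read_jsonl as used elsewhere
# in this commit; the test name and assertions are hypothetical.
from deep_reference_parser.io import read_jsonl

from .common import TEST_REF_SPANS, TEST_REF_TOKENS


def test_reference_to_token_fixtures_load():
    tokens = read_jsonl(TEST_REF_TOKENS)
    spans = read_jsonl(TEST_REF_SPANS)

    assert len(tokens) > 0
    assert len(spans) > 0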
