 
 
 class TokenTagger:
-    """
-    Converts data in prodigy format with full reference spans to per-token spans
+    def __init__(self, task="splitting", lowercase=True):
+        """
+        Converts data in prodigy format with full reference spans to per-token
+        spans
 
-    Expects one of four lables for the spans:
+        Args:
+            task (str): One of ["parsing", "splitting"]. See below for
+                further explanation.
+            lowercase (bool): Automatically convert upper case annotations to
+                lowercase under the parsing scenario.
 
-    * BE: A complete reference
-    * BI: A frgament of reference that captures the beginning but not the end
-    * IE: A frgament of reference that captures the end but not the beginning
-    * II: A fragment of a reference that captures neither the beginning nor the
-      end.
-    """
+        Since the parsing, splitting, and classification tasks have quite
+        different labelling requirements, this class behaves differently
+        depending on which task is specified in the task argument.
+
+        For splitting:
 
-    def __init__(self):
+        Expects one of four labels for the spans:
+
+        * BE: A complete reference
+        * BI: A fragment of a reference that captures the beginning but not the end
+        * IE: A fragment of a reference that captures the end but not the beginning
+        * II: A fragment of a reference that captures neither the beginning nor the
+          end.
+
+        Depending on which label is applied, the tokens within the span will be
+        labelled differently as one of ["b-r", "i-r", "e-r", "o"].
+
+        For parsing:
+
+        Expects any arbitrary label for spans. All tokens within that span will
+        be labelled with the same label.
+
+        """
 
         self.out = []
+        self.task = task
+        self.lowercase = lowercase
 
     def tag_doc(self, doc):
         """
-        Tags a document with the appropriate labels
+        Tags a document with appropriate labels for the given task
 
         Args:
             doc(dict): A single document in prodigy dict format to be labelled.
         """
 
-        bie_spans = self.reference_spans(doc["spans"], doc["tokens"])
+        bie_spans = self.reference_spans(doc["spans"], doc["tokens"], task=self.task)
         o_spans = self.outside_spans(bie_spans, doc["tokens"])
 
         # Flatten into one list.
@@ -43,7 +66,7 @@ def tag_doc(self, doc):
 
         # Sort by token id to ensure it is ordered.
 
-        spans = sorted(spans, key=lambda k: k['token_start'])
+        spans = sorted(spans, key=lambda k: k["token_start"])
 
         doc["spans"] = spans
 
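For illustration, a minimal sketch of a document this method operates on, assuming standard prodigy token and span dicts as exported by `prodigy db-out` (the text, offsets, and values here are hypothetical):

# One annotated chunk in prodigy format (sketch).
doc = {
    "text": "Smith 2001 . Jones",
    "tokens": [
        {"text": "Smith", "start": 0, "end": 5, "id": 0},
        {"text": "2001", "start": 6, "end": 10, "id": 1},
        {"text": ".", "start": 11, "end": 12, "id": 2},
        {"text": "Jones", "start": 13, "end": 18, "id": 3},
    ],
    # A single complete reference labelled at the reference level.
    "spans": [
        {"start": 0, "end": 12, "token_start": 0, "token_end": 2, "label": "BE"}
    ],
}

tagger = TokenTagger(task="splitting")
tagger.tag_doc(doc)
# doc["spans"] should now hold one single-token span per token, labelled
# ["b-r", "i-r", "e-r", "o"] in token order.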
@@ -63,42 +86,54 @@ def run(self, docs):
 
         return self.out
 
-    def reference_spans(self, spans, tokens):
+    def reference_spans(self, spans, tokens, task):
         """
         Given a whole reference span as labelled in prodigy, break this into
         appropriate single token spans depending on the label that was applied to
         the whole reference span.
         """
         split_spans = []
 
-        for span in spans:
-            if span["label"] in ["BE", "be"]:
+        if task == "splitting":
 
-                split_spans.extend(
-                    self.split_long_span(tokens, span, "b-r", "e-r")
-                )
+            for span in spans:
+                if span["label"] in ["BE", "be"]:
 
-            elif span["label"] in ["BI", "bi"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "b-r", "e-r", "i-r")
+                    )
 
-                split_spans.extend(
-                    self.split_long_span(tokens, span, "b-r", "i-r")
-                )
+                elif span["label"] in ["BI", "bi"]:
 
-            elif span["label"] in ["IE", "ie"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "b-r", "i-r", "i-r")
+                    )
 
-                split_spans.extend(
-                    self.split_long_span(tokens, span, "i-r", "e-r")
-                )
+                elif span["label"] in ["IE", "ie"]:
+
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "i-r", "e-r", "i-r")
+                    )
+
+                elif span["label"] in ["II", "ii"]:
 
-            elif span["label"] in ["II", "ii"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "i-r", "i-r", "i-r")
+                    )
 
+        elif task == "parsing":
+
+            for span in spans:
+                if self.lowercase:
+                    label = span["label"].lower()
+                else:
+                    label = span["label"]
                 split_spans.extend(
-                    self.split_long_span(tokens, span, "i-r", "i-r")
+                    self.split_long_span(tokens, span, label, label, label)
                 )
 
         return split_spans
 
-
     def outside_spans(self, spans, tokens):
         """
         Label tokens with `o` if they are outside a reference
@@ -125,7 +160,6 @@ def outside_spans(self, spans, tokens):
 
         return outside_spans
 
-
     def create_span(self, tokens, index, label):
         """
         Given a list of tokens (in prodigy format) and an index relating to one of
@@ -145,60 +179,107 @@ def create_span(self, tokens, index, label):
 
         return span
 
-
-    def split_long_span(self, tokens, span, start_label, end_label):
+    def split_long_span(self, tokens, span, start_label, end_label, inside_label):
         """
-        Split a milti-token span into `n` spans of lengh `1`, where `n=len(tokens)`
+        Split a multi-token span into `n` spans of length `1`, where `n` is the
+        number of tokens in the span
         """
 
         spans = []
         spans.append(self.create_span(tokens, span["token_start"], start_label))
         spans.append(self.create_span(tokens, span["token_end"], end_label))
 
         for index in range(span["token_start"] + 1, span["token_end"]):
-            spans.append(self.create_span(tokens, index, "i-r"))
+            spans.append(self.create_span(tokens, index, inside_label))
 
-        spans = sorted(spans, key=lambda k: k['token_start'])
+        spans = sorted(spans, key=lambda k: k["token_start"])
 
         return spans
 
+
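To make the splitting expansion concrete, here is a small standalone sketch of the same mapping (`LABEL_MAP` and `expand` are names invented for this illustration, not part of the module):

# Reference-level label -> (start, inside, end) token labels, mirroring the
# arguments passed to split_long_span for BE/BI/IE/II above.
LABEL_MAP = {
    "be": ("b-r", "i-r", "e-r"),
    "bi": ("b-r", "i-r", "i-r"),
    "ie": ("i-r", "i-r", "e-r"),
    "ii": ("i-r", "i-r", "i-r"),
}

def expand(label, token_start, token_end):
    # Assumes a span of at least two tokens, as the method name implies.
    start, inside, end = LABEL_MAP[label.lower()]
    n = token_end - token_start + 1
    return [start] + [inside] * (n - 2) + [end]

print(expand("BI", 3, 6))  # ['b-r', 'i-r', 'i-r', 'i-r']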
 @plac.annotations(
     input_file=(
         "Path to jsonl file containing chunks of references in prodigy format.",
         "positional",
         None,
-        str
+        str,
     ),
     output_file=(
         "Path to jsonl file into which fully annotated files will be saved.",
         "positional",
         None,
-        str
-    )
+        str,
+    ),
+    task=(
+        "Which task is being performed. Either splitting or parsing.",
+        "positional",
+        None,
+        str,
+    ),
+    lowercase=(
+        "Convert UPPER case reference labels to lower case token labels?",
+        "flag",
+        "f",
+        bool,
+    ),
 )
-
-def reference_to_token_annotations(input_file, output_file):
-    """ Converts a file output by prodigy (using prodigy db-out) from
-    references level annotations to individual level annotations. The rationale
-    for this is that reference level annotations are much easier for humans to
-    do, but not useful when training a token level model.
-
-    This function is predominantly useful fot tagging reference spans, but may
-    also have a function with other references annotations.
+def reference_to_token_annotations(
+    input_file, output_file, task="splitting", lowercase=False
+):
+    """
+    Creates a span for every token from existing multi-token spans
+
+    Converts a jsonl file output by prodigy (using prodigy db-out) with spans
+    extending over more than a single token to individual token level spans.
+
+    The rationale for this is that reference level annotations are much easier
+    for humans to do, but not useful when training a token level model.
+
+    This command functions in two ways:
+
+    * task=splitting: For the splitting task, where we are interested in
+      labelling the beginning (b-r) and end (e-r) of references, reference
+      spans are labelled with one of BI, BE, IE, II. These are then converted
+      to the token level spans b-r, i-r, e-r, and o. Symbolically:
+        * BE: [BE, BE, BE] becomes [b-r][i-r][e-r]
+        * BI: [BI, BI, BI] becomes [b-r][i-r][i-r]
+        * IE: [IE, IE, IE] becomes [i-r][i-r][e-r]
+        * II: [II, II, II] becomes [i-r][i-r][i-r]
+        * All other tokens become [o]
+
+    * task=parsing: For the parsing task, multi-token annotations are much
+      simpler and would tend to be just 'author' or 'title'. These simple
+      labels can be applied directly to the individual tokens contained within
+      these multi-token spans; for each token in the multi-token span, a span
+      is created with the same label. Symbolically:
+        * [author author author] becomes [author][author][author]
     """
 
-    partially_annotated = read_jsonl(input_file)
+    ref_annotated_docs = read_jsonl(input_file)
 
     # Only run the tagger on annotated examples.
 
-    partially_annotated = [doc for doc in partially_annotated if doc.get("spans")]
+    not_annotated_docs = [doc for doc in ref_annotated_docs if not doc.get("spans")]
+    ref_annotated_docs = [doc for doc in ref_annotated_docs if doc.get("spans")]
 
-    logger.info("Loaded %s documents with reference annotations", len(partially_annotated))
+    logger.info(
+        "Loaded %s documents with reference annotations", len(ref_annotated_docs)
+    )
+    logger.info(
+        "Loaded %s documents with no reference annotations", len(not_annotated_docs)
+    )
 
-    annotator = TokenTagger(partially_annotated)
+    annotator = TokenTagger(task=task, lowercase=lowercase)
 
-    fully_annotated = annotator.run()
+    token_annotated_docs = annotator.run(ref_annotated_docs)
+    all_docs = token_annotated_docs + not_annotated_docs
 
-    write_jsonl(fully_annotated, output_file=output_file)
+    write_jsonl(all_docs, output_file=output_file)
 
-    logger.info("Fully annotated references written to %s", output_file)
+    logger.info(
+        "Wrote %s docs with token annotations to %s",
+        len(token_annotated_docs),
+        output_file,
+    )
+    logger.info(
+        "Wrote %s docs with no annotations to %s", len(not_annotated_docs), output_file
+    )
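Assuming the function is exposed through plac in the usual way (the script name below is illustrative, not taken from the repo), the command might be invoked like this:

# Convert reference-level annotations to token-level ones for the parsing
# task; -f lowercases UPPER case span labels.
python reference_to_token_annotations.py annotated.jsonl token_annotated.jsonl parsing -f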