@@ -141,7 +141,7 @@ def __call__(self, string_or_list, return_tokens=False):
141141 else :
142142 return new_strings
143143
144- def _extract_tokens (self , string , pred ):
144+ def _extract_tokens_2class (self , string , pred ):
145145 mask = ''
146146 for p in pred :
147147 mask += self .encodings ._label_list [p ]
@@ -168,3 +168,52 @@ def _extract_tokens(self, string, pred):
168168 tokens .append ((c_tok , start , stop ))
169169 new_str += '<RANDOM_STRING>'
170170 return new_str , tokens
171+
172+ def _extract_tokens (self , string , pred ):
173+ mask = ''
174+ for p in pred :
175+ mask += self .encodings ._label_list [p ]
176+ start = 0
177+ new_str = ''
178+ tokens = []
179+ c_tok = ''
180+ last_label = mask [0 ]
181+ for ii in range (len (string )):
182+ # check if the label-type has changed
183+ if last_label != mask [ii ]:
184+ if c_tok != '' :
185+ if last_label == 'C' :
186+ new_str += c_tok
187+ elif last_label == 'H' :
188+ type = '<RANDOM_STRING>'
189+ elif last_label == 'N' :
190+ type = '<NUMERIC>'
191+ elif last_label == 'I' :
192+ type = '<IP_ADDR>'
193+ elif last_label == 'U' :
194+ type = '<UUID>'
195+
196+ if last_label != 'C' :
197+ tokens .append ((c_tok , start , ii , type ))
198+ new_str += type
199+ c_tok = ''
200+ start = ii
201+
202+ last_label = mask [ii ]
203+ c_tok += string [ii ]
204+
205+ if c_tok != '' :
206+ if last_label == 'C' :
207+ new_str += c_tok
208+ elif last_label == 'H' :
209+ type = '<RANDOM_STRING>'
210+ elif last_label == 'N' :
211+ type = '<NUMERIC>'
212+ elif last_label == 'I' :
213+ type = '<IP_ADDR>'
214+ elif last_label == 'U' :
215+ type = '<UUID>'
216+ if last_label != 'C' :
217+ tokens .append ((c_tok , start , ii , type ))
218+ new_str += type
219+ return new_str , tokens
0 commit comments