@@ -241,6 +241,97 @@ def _annotate_with_smart_cascade(
241
241
return result .spans
242
242
return regex_result
243
243
244
def _annotate_single_chunk(
    self, text: str, structured: bool = False
) -> Union[Dict[str, List[str]], List["Span"]]:
    """Annotate one chunk of text with the engine named by ``self.engine``.

    Args:
        text: The chunk to annotate.
        structured: When true, ask for span objects instead of a
            label -> strings mapping. The "regex" branch acts on the flag
            directly and the "smart" / "auto" branches forward it. The
            "spacy" and "gliner" branches call ``annotate(text)`` without
            it, so their result is returned unchanged either way.

    Returns:
        The selected annotator's result for ``text``.

    Raises:
        ImportError: If the "spacy" or "gliner" engine is selected but the
            corresponding annotator attribute is ``None``.
        ValueError: If ``self.engine`` is not one of "regex", "spacy",
            "gliner", "smart" or "auto".
    """
    engine = self.engine

    if engine == "regex":
        if structured:
            _, result = self.regex_annotator.annotate_with_spans(text)
            return result.spans
        return self.regex_annotator.annotate(text)

    if engine == "spacy":
        if self.spacy_annotator is None:
            raise ImportError(
                "SpaCy engine not available. Install with: pip install datafog[nlp]"
            )
        return self.spacy_annotator.annotate(text)

    if engine == "gliner":
        if self.gliner_annotator is None:
            raise ImportError(
                "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
            )
        return self.gliner_annotator.annotate(text)

    if engine == "smart":
        return self._annotate_with_smart_cascade(text, structured)

    if engine == "auto":
        return self._annotate_with_auto_engine(text, structured)

    # Fail loudly instead of falling off the end and returning None, which
    # would only surface later as an obscure error in the chunk-merging code.
    raise ValueError(f"Unsupported engine: {engine!r}")
269
+
270
def _annotate_with_auto_engine(
    self, text: str, structured: bool = False
) -> Union[Dict[str, List[str]], List["Span"]]:
    """Annotate with the regex annotator first, falling back to spaCy.

    The regex annotator runs exactly once. Its output is returned when it
    found at least one entity, or when no spaCy annotator is configured
    (in which case the output may be empty). Otherwise the text is passed
    to ``self.spacy_annotator.annotate``.

    Args:
        text: The text to annotate.
        structured: When true, the regex result is the span list from
            ``annotate_with_spans``; otherwise it is the mapping returned
            by ``annotate``.

    Returns:
        The regex result in the requested form, or the spaCy annotator's
        result when regex found nothing and spaCy is available.

    Note:
        The spaCy fallback is called as ``annotate(text)`` and its result
        is returned unchanged even when ``structured`` is true.
    """
    if structured:
        _, result = self.regex_annotator.annotate_with_spans(text)
        regex_output = result.spans
        # A non-empty span list already means "regex found something";
        # there is no need to regroup spans by label just to test that.
        regex_found = bool(regex_output)
    else:
        regex_output = self.regex_annotator.annotate(text)
        # Labels may be present with empty lists, so inspect the values.
        regex_found = any(regex_output.values())

    if regex_found or self.spacy_annotator is None:
        return regex_output

    return self.spacy_annotator.annotate(text)
303
+
304
def _annotate_multiple_chunks_structured(self, chunks: List[str]) -> List["Span"]:
    """Annotate each chunk in structured mode and merge the spans.

    Every chunk is passed to ``_annotate_single_chunk`` with
    ``structured=True``. Each returned span is rebuilt with ``start`` and
    ``end`` shifted by the combined length of all preceding chunks; only
    ``start``, ``end``, ``text`` and ``label`` are carried over.

    Args:
        chunks: The pieces of the text, in order.

    Returns:
        One flat list of shifted spans, in chunk order.
    """
    # Resolve the span class a single time; it is reused for every span.
    span_cls = _get_span_class()

    merged = []
    offset = 0
    for piece in chunks:
        merged += [
            span_cls(
                start=found.start + offset,
                end=found.end + offset,
                text=found.text,
                label=found.label,
            )
            for found in self._annotate_single_chunk(piece, structured=True)
        ]
        # Assumes chunks are contiguous slices of the source text, so the
        # next chunk begins where this one ended - TODO confirm against
        # _chunk_text.
        offset += len(piece)

    return merged
326
+
327
def _annotate_multiple_chunks_dict(self, chunks: List[str]) -> Dict[str, List[str]]:
    """Annotate each chunk in dictionary mode and combine the results.

    Args:
        chunks: The pieces of the text, in order.

    Returns:
        The result of ``self._combine_annotations`` applied to the list of
        per-chunk results, which is in chunk order.
    """
    per_chunk = [
        self._annotate_single_chunk(piece, structured=False) for piece in chunks
    ]
    return self._combine_annotations(per_chunk)
334
+
244
335
def annotate_text_sync (
245
336
self , text : str , structured : bool = False
246
337
) -> Union [Dict [str , List [str ]], List ["Span" ]]:
@@ -256,88 +347,15 @@ def annotate_text_sync(
256
347
"""
257
348
if len (text ) <= self .text_chunk_length :
258
349
# Single chunk processing
259
- if self .engine == "regex" :
260
- if structured :
261
- _ , result = self .regex_annotator .annotate_with_spans (text )
262
- return result .spans
263
- return self .regex_annotator .annotate (text )
264
- elif self .engine == "spacy" :
265
- if self .spacy_annotator is None :
266
- raise ImportError (
267
- "SpaCy engine not available. Install with: pip install datafog[nlp]"
268
- )
269
- return self .spacy_annotator .annotate (text )
270
- elif self .engine == "gliner" :
271
- if self .gliner_annotator is None :
272
- raise ImportError (
273
- "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
274
- )
275
- return self .gliner_annotator .annotate (text )
276
- elif self .engine == "smart" :
277
- return self ._annotate_with_smart_cascade (text , structured )
278
- elif self .engine == "auto" :
279
- # Try regex first
280
- if structured :
281
- # For structured output, use annotate_with_spans directly to avoid double processing
282
- _ , result = self .regex_annotator .annotate_with_spans (text )
283
- regex_result = {}
284
- for span in result .spans :
285
- if span .label not in regex_result :
286
- regex_result [span .label ] = []
287
- regex_result [span .label ].append (span .text )
288
-
289
- # Check if regex found any entities
290
- if any (entities for entities in regex_result .values ()):
291
- return result .spans
292
- else :
293
- regex_result = self .regex_annotator .annotate (text )
294
-
295
- # Check if regex found any entities
296
- if any (entities for entities in regex_result .values ()):
297
- return regex_result
298
-
299
- # Fall back to spacy if available
300
- if self .spacy_annotator is not None :
301
- return self .spacy_annotator .annotate (text )
302
-
303
- # Return regex result even if empty
304
- if structured :
305
- # We already have the result from above in structured mode
306
- return result .spans
307
- return regex_result
350
+ return self ._annotate_single_chunk (text , structured )
308
351
else :
309
352
# Multi-chunk processing
310
353
chunks = self ._chunk_text (text )
311
354
312
355
if structured :
313
- # For structured output, we need to handle span positions across chunks
314
- all_spans = []
315
- current_offset = 0
316
-
317
- # Get Span class once outside the loop for efficiency
318
- SpanClass = _get_span_class ()
319
-
320
- for chunk in chunks :
321
- chunk_spans = self .annotate_text_sync (chunk , structured = True )
322
- # Adjust span positions to account for chunk offset
323
- for span in chunk_spans :
324
- adjusted_span = SpanClass (
325
- start = span .start + current_offset ,
326
- end = span .end + current_offset ,
327
- text = span .text ,
328
- label = span .label ,
329
- )
330
- all_spans .append (adjusted_span )
331
- current_offset += len (chunk )
332
-
333
- return all_spans
356
+ return self ._annotate_multiple_chunks_structured (chunks )
334
357
else :
335
- # Dictionary format - combine annotations
336
- chunk_annotations = []
337
- for chunk in chunks :
338
- chunk_result = self .annotate_text_sync (chunk , structured = False )
339
- chunk_annotations .append (chunk_result )
340
- return self ._combine_annotations (chunk_annotations )
358
+ return self ._annotate_multiple_chunks_dict (chunks )
341
359
342
360
async def annotate_text_async (
343
361
self , text : str , structured : bool = False
0 commit comments