19
19
from scrapely .extraction .pageobjects import (AnnotationTag ,
20
20
PageRegion , FragmentedHtmlPageRegion )
21
21
22
def build_extraction_tree(template, type_descriptor, trace=True):
    """Construct the extractor tree for *template* and wrap it in a
    :class:`TemplatePageExtractor`.

    Parameters
    ----------
    template : the annotated template page whose annotations drive extraction.
    type_descriptor : object exposing ``attribute_map``, or ``None`` for no
        type information.
    trace : when true, every stage's extractors are wrapped with
        :class:`TraceExtractor` for debugging output.

    Returns
    -------
    TemplatePageExtractor built from the fully composed extractor pipeline.
    """
    attribute_map = None
    if type_descriptor:
        attribute_map = type_descriptor.attribute_map

    def wrap_trace(current):
        # Apply the tracing decorator only when tracing was requested.
        if trace:
            return TraceExtractor.apply(template, current)
        return current

    extractors = wrap_trace(
        BasicTypeExtractor.create(template.annotations, attribute_map))

    # NOTE: the repeated/variant stages appear several times in this
    # sequence — each pass can expose structure for the next one to
    # regroup (ordering preserved from the original pipeline).
    pipeline = (RepeatedDataExtractor, AdjacentVariantExtractor,
                RepeatedDataExtractor, AdjacentVariantExtractor,
                RepeatedDataExtractor, RecordExtractor)
    for stage in pipeline:
        extractors = wrap_trace(stage.apply(template, extractors))

    return TemplatePageExtractor(template, extractors)
38
22
_EXTRACT_HTML = lambda x : x
39
23
_DEFAULT_DESCRIPTOR = FieldDescriptor ('none' , None )
40
24
41
- def _labelled (obj ):
25
# Explicit public API of this module: only these names are exported by
# ``from ... import *``; everything else is implementation detail.
__all__ = ['BasicTypeExtractor',
           'TraceExtractor',
           'RepeatedDataExtractor',
           'AdjacentVariantExtractor',
           'RecordExtractor',
           'TemplatePageExtractor',
           'TextRegionDataExtractor',
           'attrs2dict',
           'labelled_element']
34
+
35
+ def labelled_element (obj ):
42
36
"""
43
37
Returns labelled element of the object (extractor or labelled region)
44
38
"""
@@ -282,13 +276,13 @@ class TransposedDataExtractor(object):
282
276
283
277
_namef = operator .itemgetter (0 )
284
278
_valuef = operator .itemgetter (1 )
285
- def _attrs2dict (attributes ):
279
+ def attrs2dict (attributes ):
286
280
"""convert a list of attributes (name, value) tuples
287
281
into a dict of lists.
288
282
289
283
For example:
290
284
>>> l = [('name', 'sofa'), ('colour', 'red'), ('colour', 'green')]
291
- >>> _attrs2dict (l) == {'name': ['sofa'], 'colour': ['red', 'green']}
285
+ >>> attrs2dict (l) == {'name': ['sofa'], 'colour': ['red', 'green']}
292
286
True
293
287
"""
294
288
grouped_data = groupby (sorted (attributes , key = _namef ), _namef )
@@ -326,6 +320,7 @@ def __init__(self, extractors, template_tokens):
326
320
start_index = min (e .annotation .start_index for e in extractors )
327
321
end_index = max (e .annotation .end_index for e in extractors )
328
322
self .annotation = AnnotationTag (start_index , end_index )
323
+ self .best_match = longest_unique_subsequence
329
324
330
325
def extract (self , page , start_index = 0 , end_index = None , ignored_regions = None , ** kwargs ):
331
326
"""extract data from an extraction page
@@ -335,7 +330,7 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **k
335
330
"""
336
331
if ignored_regions is None :
337
332
ignored_regions = []
338
- region_elements = sorted (self .extractors + ignored_regions , key = lambda x : _labelled (x ).start_index )
333
+ region_elements = sorted (self .extractors + ignored_regions , key = lambda x : labelled_element (x ).start_index )
339
334
_ , _ , attributes = self ._doextract (page , region_elements , start_index ,
340
335
end_index , ** kwargs )
341
336
# collect variant data, maintaining the order of variants
@@ -350,10 +345,10 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **k
350
345
else :
351
346
items .append ((k , v ))
352
347
353
- variant_records = [('variants' , _attrs2dict (variants [vid ])) \
348
+ variant_records = [('variants' , attrs2dict (variants [vid ])) \
354
349
for vid in variant_ids ]
355
350
items += variant_records
356
- return [_attrs2dict (items )]
351
+ return [attrs2dict (items )]
357
352
358
353
def _doextract (self , page , region_elements , start_index , end_index , nested_regions = None , ignored_regions = None , ** kwargs ):
359
354
"""Carry out extraction of records using the given annotations
@@ -364,30 +359,30 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regio
364
359
nested_regions = nested_regions or []
365
360
ignored_regions = ignored_regions or []
366
361
first_region , following_regions = region_elements [0 ], region_elements [1 :]
367
- while following_regions and _labelled (following_regions [0 ]).start_index \
368
- < _labelled (first_region ).end_index :
362
+ while following_regions and labelled_element (following_regions [0 ]).start_index \
363
+ < labelled_element (first_region ).end_index :
369
364
region = following_regions .pop (0 )
370
- labelled = _labelled (region )
365
+ labelled = labelled_element (region )
371
366
if isinstance (labelled , AnnotationTag ) or (nested_regions and \
372
- _labelled (nested_regions [- 1 ]).start_index < labelled .start_index \
373
- < _labelled (nested_regions [- 1 ]).end_index ):
367
+ labelled_element (nested_regions [- 1 ]).start_index < labelled .start_index \
368
+ < labelled_element (nested_regions [- 1 ]).end_index ):
374
369
nested_regions .append (region )
375
370
else :
376
371
ignored_regions .append (region )
377
372
extracted_data = []
378
373
# end_index is inclusive, but similar_region treats it as exclusive
379
374
end_region = None if end_index is None else end_index + 1
380
- labelled = _labelled (first_region )
375
+ labelled = labelled_element (first_region )
381
376
score , pindex , sindex = \
382
377
similar_region (page .page_tokens , self .template_tokens ,
383
- labelled , start_index , end_region , ** kwargs )
378
+ labelled , start_index , end_region , self . best_match , ** kwargs )
384
379
if score > 0 :
385
380
if isinstance (labelled , AnnotationTag ):
386
381
similar_ignored_regions = []
387
382
start = pindex
388
383
for i in ignored_regions :
389
384
s , p , e = similar_region (page .page_tokens , self .template_tokens , \
390
- i , start , sindex , ** kwargs )
385
+ i , start , sindex , self . best_match , ** kwargs )
391
386
if s > 0 :
392
387
similar_ignored_regions .append (PageRegion (p , e ))
393
388
start = e or start
0 commit comments