Commit 62a46da

Merge pull request #47 from tpeng/patch-1
allow passing different subsequence methods to similar_region
2 parents: 576d3db + a5e8fc9

File tree

4 files changed, +82 -37 lines

scrapely/extraction/__init__.py

Lines changed: 20 additions & 2 deletions

@@ -17,9 +17,11 @@
 suffix.
 """
 from operator import itemgetter
-from .regionextract import build_extraction_tree
 from .pageparsing import parse_template, parse_extraction_page
 from .pageobjects import TokenDict
+from .regionextract import (BasicTypeExtractor, TraceExtractor, RepeatedDataExtractor, \
+    AdjacentVariantExtractor, RecordExtractor, TemplatePageExtractor)
+
 
 class InstanceBasedLearningExtractor(object):
     """Implementation of the instance based learning algorithm to
@@ -66,11 +68,27 @@ def __init__(self, td_pairs, trace=False, apply_extrarequired=True):
         # templates with more attributes are considered first
         sorted_tdpairs = sorted(modified_parsed_tdpairs, \
             key=lambda x: _annotation_count(itemgetter(0)(x)), reverse=True)
-        self.extraction_trees = [build_extraction_tree(p, td[1],
+        self.extraction_trees = [self.build_extraction_tree(p, td[1],
             trace) for p, td in sorted_tdpairs]
         self.validated = dict((td[0].page_id, td[1].validated if td[1] else \
             self._filter_not_none) for _, td in sorted_tdpairs)
 
+    def build_extraction_tree(self, template, type_descriptor, trace=True):
+        """Build a tree of region extractors corresponding to the
+        template
+        """
+        attribute_map = type_descriptor.attribute_map if type_descriptor else None
+        extractors = BasicTypeExtractor.create(template.annotations, attribute_map)
+        if trace:
+            extractors = TraceExtractor.apply(template, extractors)
+        for cls in (RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor,
+                RecordExtractor):
+            extractors = cls.apply(template, extractors)
+            if trace:
+                extractors = TraceExtractor.apply(template, extractors)
+
+        return TemplatePageExtractor(template, extractors)
+
     def extract(self, html, pref_template_id=None):
         """extract data from an html page

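Moving build_extraction_tree from regionextract.py onto InstanceBasedLearningExtractor makes tree construction overridable per extractor instance. A minimal sketch of what this enables (the ForcedTraceExtractor name is illustrative, not part of this commit):

    from scrapely.extraction import InstanceBasedLearningExtractor

    class ForcedTraceExtractor(InstanceBasedLearningExtractor):
        # hypothetical subclass: always build traced extraction trees,
        # regardless of the trace flag passed at construction time
        def build_extraction_tree(self, template, type_descriptor, trace=True):
            return InstanceBasedLearningExtractor.build_extraction_tree(
                self, template, type_descriptor, trace=True)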
scrapely/extraction/regionextract.py

Lines changed: 25 additions & 30 deletions

@@ -19,26 +19,20 @@
 from scrapely.extraction.pageobjects import (AnnotationTag,
     PageRegion, FragmentedHtmlPageRegion)
 
-def build_extraction_tree(template, type_descriptor, trace=True):
-    """Build a tree of region extractors corresponding to the
-    template
-    """
-    attribute_map = type_descriptor.attribute_map if type_descriptor else None
-    extractors = BasicTypeExtractor.create(template.annotations, attribute_map)
-    if trace:
-        extractors = TraceExtractor.apply(template, extractors)
-    for cls in (RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor,
-            RecordExtractor):
-        extractors = cls.apply(template, extractors)
-        if trace:
-            extractors = TraceExtractor.apply(template, extractors)
-
-    return TemplatePageExtractor(template, extractors)
-
 _EXTRACT_HTML = lambda x: x
 _DEFAULT_DESCRIPTOR = FieldDescriptor('none', None)
 
-def _labelled(obj):
+__all__ = ['BasicTypeExtractor',
+    'TraceExtractor',
+    'RepeatedDataExtractor',
+    'AdjacentVariantExtractor',
+    'RecordExtractor',
+    'TemplatePageExtractor',
+    'TextRegionDataExtractor',
+    'attrs2dict',
+    'labelled_element']
+
+def labelled_element(obj):
     """
     Returns labelled element of the object (extractor or labelled region)
     """
@@ -282,13 +276,13 @@ class TransposedDataExtractor(object):
 
 _namef = operator.itemgetter(0)
 _valuef = operator.itemgetter(1)
-def _attrs2dict(attributes):
+def attrs2dict(attributes):
     """convert a list of attributes (name, value) tuples
     into a dict of lists.
 
     For example:
     >>> l = [('name', 'sofa'), ('colour', 'red'), ('colour', 'green')]
-    >>> _attrs2dict(l) == {'name': ['sofa'], 'colour': ['red', 'green']}
+    >>> attrs2dict(l) == {'name': ['sofa'], 'colour': ['red', 'green']}
     True
     """
     grouped_data = groupby(sorted(attributes, key=_namef), _namef)
@@ -326,6 +320,7 @@ def __init__(self, extractors, template_tokens):
         start_index = min(e.annotation.start_index for e in extractors)
         end_index = max(e.annotation.end_index for e in extractors)
         self.annotation = AnnotationTag(start_index, end_index)
+        self.best_match = longest_unique_subsequence
 
     def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
         """extract data from an extraction page
@@ -335,7 +330,7 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
         """
         if ignored_regions is None:
             ignored_regions = []
-        region_elements = sorted(self.extractors + ignored_regions, key=lambda x: _labelled(x).start_index)
+        region_elements = sorted(self.extractors + ignored_regions, key=lambda x: labelled_element(x).start_index)
         _, _, attributes = self._doextract(page, region_elements, start_index,
             end_index, **kwargs)
         # collect variant data, maintaining the order of variants
@@ -350,10 +345,10 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
             else:
                 items.append((k, v))
 
-        variant_records = [('variants', _attrs2dict(variants[vid])) \
+        variant_records = [('variants', attrs2dict(variants[vid])) \
             for vid in variant_ids]
         items += variant_records
-        return [_attrs2dict(items)]
+        return [attrs2dict(items)]
 
     def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs):
         """Carry out extraction of records using the given annotations
@@ -364,30 +359,30 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs):
         nested_regions = nested_regions or []
         ignored_regions = ignored_regions or []
         first_region, following_regions = region_elements[0], region_elements[1:]
-        while following_regions and _labelled(following_regions[0]).start_index \
-                < _labelled(first_region).end_index:
+        while following_regions and labelled_element(following_regions[0]).start_index \
+                < labelled_element(first_region).end_index:
             region = following_regions.pop(0)
-            labelled = _labelled(region)
+            labelled = labelled_element(region)
             if isinstance(labelled, AnnotationTag) or (nested_regions and \
-                    _labelled(nested_regions[-1]).start_index < labelled.start_index \
-                    < _labelled(nested_regions[-1]).end_index):
+                    labelled_element(nested_regions[-1]).start_index < labelled.start_index \
+                    < labelled_element(nested_regions[-1]).end_index):
                 nested_regions.append(region)
             else:
                 ignored_regions.append(region)
         extracted_data = []
         # end_index is inclusive, but similar_region treats it as exclusive
         end_region = None if end_index is None else end_index + 1
-        labelled = _labelled(first_region)
+        labelled = labelled_element(first_region)
         score, pindex, sindex = \
             similar_region(page.page_tokens, self.template_tokens,
-                labelled, start_index, end_region, **kwargs)
+                labelled, start_index, end_region, self.best_match, **kwargs)
         if score > 0:
             if isinstance(labelled, AnnotationTag):
                 similar_ignored_regions = []
                 start = pindex
                 for i in ignored_regions:
                     s, p, e = similar_region(page.page_tokens, self.template_tokens, \
-                        i, start, sindex, **kwargs)
+                        i, start, sindex, self.best_match, **kwargs)
                     if s > 0:
                         similar_ignored_regions.append(PageRegion(p, e))
                         start = e or start

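RecordExtractor now stores its matching strategy as the instance attribute self.best_match and threads it into every similar_region call, so the subsequence method can be swapped without touching the extraction code. A minimal sketch, assuming an already-built RecordExtractor instance (record_ex is a placeholder for one taken from a real extraction tree):

    from scrapely.extraction.regionextract import RecordExtractor
    from scrapely.extraction.similarity import first_longest_subsequence

    def use_first_longest(record_ex):
        # hypothetical helper: replace the default longest_unique_subsequence
        # strategy on one extractor instance with the new matcher
        assert isinstance(record_ex, RecordExtractor)
        record_ex.best_match = first_longest_subsequence
        return record_ex

The new __all__ list makes RecordExtractor and the other extractor classes explicitly part of the module's public interface for exactly this kind of reuse.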
scrapely/extraction/similarity.py

Lines changed: 35 additions & 4 deletions

@@ -78,8 +78,39 @@ def longest_unique_subsequence(to_search, subsequence, range_start=0,
         return best2[0]
     return None, None
 
+def first_longest_subsequence(to_search, subsequence, range_start=0, range_end=None):
+    """Find the first longest subsequence of the items in a list or array.
+
+    range_start and range_end specify a range in which the match must begin.
+
+    For example, the longest match occurs at index 2 and has length 3
+    >>> to_search = [6, 3, 2, 4, 3, 2, 5]
+    >>> first_longest_subsequence(to_search, [2, 4, 3])
+    (2, 3)
+
+    When there are two equally long subsequences, it returns the nearest one
+    >>> first_longest_subsequence(to_search, [3, 2])
+    (1, 2)
+
+    >>> first_longest_subsequence([], [3, 2])
+    (None, None)
+    """
+    startval = subsequence[0]
+    if range_end is None:
+        range_end = len(to_search)
+
+    # the comparison to startval ensures only matches of length >= 1 and
+    # reduces the number of calls to the common_prefix_length function
+    matches = [(i, common_prefix_length(to_search[i:], subsequence)) \
+        for i in xrange(range_start, range_end) if startval == to_search[i]]
+
+    if not matches:
+        return None, None
+    # secondary sort on position, preferring the smaller index (the nearest match)
+    return max(matches, key=lambda x: (x[1], -x[0]))
+
 def similar_region(extracted_tokens, template_tokens, labelled_region,
-        range_start=0, range_end=None, **kwargs):
+        range_start=0, range_end=None, best_match=longest_unique_subsequence, **kwargs):
     """Given a labelled section in a template, identify a similar region
     in the extracted tokens.
 
@@ -100,7 +131,7 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
     # reverse order
     reverse_prefix = template_tokens[labelled_region.start_index::-1]
     reverse_tokens = extracted_tokens[::-1]
-    (rpi, pscore) = longest_unique_subsequence(reverse_tokens, reverse_prefix,
+    (rpi, pscore) = best_match(reverse_tokens, reverse_prefix,
         data_length - range_end, data_length - range_start)
 
     # None means nothing extracted. Index 0 means there cannot be a suffix.
@@ -119,7 +150,7 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
 
     # if it's not a paired tag, use the best match between prefix & suffix
     if labelled_region.start_index == labelled_region.end_index:
-        (match_index, sscore) = longest_unique_subsequence(extracted_tokens,
+        (match_index, sscore) = best_match(extracted_tokens,
             suffix, prefix_index, range_end)
         if match_index == prefix_index:
             return (pscore + sscore, prefix_index, match_index)
@@ -131,7 +162,7 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
 
     # calculate the suffix match on the tokens following the prefix. We could
     # consider the whole page and require a good match.
-    (match_index, sscore) = longest_unique_subsequence(extracted_tokens,
+    (match_index, sscore) = best_match(extracted_tokens,
         suffix, prefix_index + 1, range_end)
     if match_index is None:
         return 0, None, None

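The practical difference between the two strategies: longest_unique_subsequence gives up on ties (its closing lines, visible as context at the top of the hunk, fall through to (None, None) when the two best matches are equally long), while first_longest_subsequence resolves the tie in favour of the nearest match. A small comparison sketch using the token list from the new doctests:

    from scrapely.extraction.similarity import (
        longest_unique_subsequence, first_longest_subsequence)

    to_search = [6, 3, 2, 4, 3, 2, 5]
    # [3, 2] occurs at index 1 and index 4, both with length 2
    print longest_unique_subsequence(to_search, [3, 2])  # (None, None): no unique winner
    print first_longest_subsequence(to_search, [3, 2])   # (1, 2): nearest match wins

Passing best_match=first_longest_subsequence to similar_region therefore trades the safety of the uniqueness check for a match on pages where a region repeats.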
scrapely/template.py

Lines changed: 2 additions & 1 deletion

@@ -81,7 +81,8 @@ def annotate_fragment(self, index, field):
                 a = ' data-scrapy-annotate="%s"' % json.dumps(d).replace('"', '&quot;')
                 p = self.htmlpage
                 p.body = p.body[:f.end-1] + a + p.body[f.end-1:]
-                break
+                return True
+        return False
 
     def get_template(self):
         """Return the generated template as a HtmlPage object"""

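With break replaced by early returns, annotate_fragment now reports whether any fragment was actually annotated instead of failing silently. A minimal sketch of a caller using the new boolean (the template_maker object, index, and field are assumptions standing in for real values, not part of the commit):

    def annotate_or_fail(template_maker, index, field):
        # template_maker is assumed to be the object defining annotate_fragment
        # in scrapely/template.py; index and field as in that method's signature
        if not template_maker.annotate_fragment(index, field):
            raise ValueError("no fragment at index %d could be annotated" % index)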