
Allow passing a different subsequence method to similar_region #47

Merged: 6 commits, Jan 13, 2014
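In short: similar_region() gains a best_match argument (defaulting to the existing longest_unique_subsequence), RecordExtractor stores the matcher as self.best_match, a first_longest_subsequence alternative is added, and build_extraction_tree becomes a method of InstanceBasedLearningExtractor so subclasses can customize how the tree is built. A minimal sketch of how a caller could opt into the new matcher once this is merged; the subclass names here are hypothetical, and it assumes RecordExtractor.apply() instantiates the class it is invoked on:

from scrapely.extraction import InstanceBasedLearningExtractor
from scrapely.extraction.regionextract import (
    BasicTypeExtractor, RepeatedDataExtractor,
    AdjacentVariantExtractor, RecordExtractor, TemplatePageExtractor)
from scrapely.extraction.similarity import first_longest_subsequence


class FirstMatchRecordExtractor(RecordExtractor):
    """Hypothetical subclass: prefer the first longest match over a unique one."""
    def __init__(self, extractors, template_tokens):
        RecordExtractor.__init__(self, extractors, template_tokens)
        # override the default longest_unique_subsequence set by the base class
        self.best_match = first_longest_subsequence


class FirstMatchExtractor(InstanceBasedLearningExtractor):
    """Hypothetical subclass rebuilding the tree with the extractor above."""
    def build_extraction_tree(self, template, type_descriptor, trace=False):
        attribute_map = type_descriptor.attribute_map if type_descriptor else None
        extractors = BasicTypeExtractor.create(template.annotations, attribute_map)
        # same repeated/variant passes as the stock method...
        for cls in (RepeatedDataExtractor, AdjacentVariantExtractor,
                RepeatedDataExtractor, AdjacentVariantExtractor,
                RepeatedDataExtractor):
            extractors = cls.apply(template, extractors)
        # ...but the record stage uses the first-match subclass; this assumes
        # apply() is a classmethod that instantiates the class it is called on
        extractors = FirstMatchRecordExtractor.apply(template, extractors)
        return TemplatePageExtractor(template, extractors)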
22 changes: 20 additions & 2 deletions scrapely/extraction/__init__.py
@@ -17,9 +17,11 @@
 suffix.
 """
 from operator import itemgetter
-from .regionextract import build_extraction_tree
 from .pageparsing import parse_template, parse_extraction_page
 from .pageobjects import TokenDict
+from .regionextract import (BasicTypeExtractor, TraceExtractor, RepeatedDataExtractor, \
+    AdjacentVariantExtractor, RecordExtractor, TemplatePageExtractor)
 
+
 class InstanceBasedLearningExtractor(object):
     """Implementation of the instance based learning algorithm to
@@ -66,11 +68,27 @@ def __init__(self, td_pairs, trace=False, apply_extrarequired=True):
         # templates with more attributes are considered first
         sorted_tdpairs = sorted(modified_parsed_tdpairs, \
             key=lambda x: _annotation_count(itemgetter(0)(x)), reverse=True)
-        self.extraction_trees = [build_extraction_tree(p, td[1],
+        self.extraction_trees = [self.build_extraction_tree(p, td[1],
             trace) for p, td in sorted_tdpairs]
         self.validated = dict((td[0].page_id, td[1].validated if td[1] else \
             self._filter_not_none) for _, td in sorted_tdpairs)
 
+    def build_extraction_tree(self, template, type_descriptor, trace=True):
+        """Build a tree of region extractors corresponding to the
+        template
+        """
+        attribute_map = type_descriptor.attribute_map if type_descriptor else None
+        extractors = BasicTypeExtractor.create(template.annotations, attribute_map)
+        if trace:
+            extractors = TraceExtractor.apply(template, extractors)
+        for cls in (RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor,
+                RecordExtractor):
+            extractors = cls.apply(template, extractors)
+            if trace:
+                extractors = TraceExtractor.apply(template, extractors)
+
+        return TemplatePageExtractor(template, extractors)
+
     def extract(self, html, pref_template_id=None):
         """extract data from an html page
 
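Note that build_extraction_tree used to be a module-level helper in scrapely.extraction.regionextract (it is deleted from there in the next file), so any external code importing it directly would now go through the extractor instead. A hypothetical migration sketch, assuming ibl is an InstanceBasedLearningExtractor instance:

def make_tree(ibl, parsed_template, descriptor):
    # before this PR:
    #     from scrapely.extraction.regionextract import build_extraction_tree
    #     return build_extraction_tree(parsed_template, descriptor)
    # after: the helper is a method, which is what lets subclasses override it
    return ibl.build_extraction_tree(parsed_template, descriptor, trace=False)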
55 changes: 25 additions & 30 deletions scrapely/extraction/regionextract.py
@@ -19,26 +19,20 @@
 from scrapely.extraction.pageobjects import (AnnotationTag,
     PageRegion, FragmentedHtmlPageRegion)
 
-def build_extraction_tree(template, type_descriptor, trace=True):
-    """Build a tree of region extractors corresponding to the
-    template
-    """
-    attribute_map = type_descriptor.attribute_map if type_descriptor else None
-    extractors = BasicTypeExtractor.create(template.annotations, attribute_map)
-    if trace:
-        extractors = TraceExtractor.apply(template, extractors)
-    for cls in (RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor,
-            RecordExtractor):
-        extractors = cls.apply(template, extractors)
-        if trace:
-            extractors = TraceExtractor.apply(template, extractors)
-
-    return TemplatePageExtractor(template, extractors)
-
 _EXTRACT_HTML = lambda x: x
 _DEFAULT_DESCRIPTOR = FieldDescriptor('none', None)
 
-def _labelled(obj):
+__all__ = ['BasicTypeExtractor',
+           'TraceExtractor',
+           'RepeatedDataExtractor',
+           'AdjacentVariantExtractor',
+           'RecordExtractor',
+           'TemplatePageExtractor',
+           'TextRegionDataExtractor',
+           'attrs2dict',
+           'labelled_element']
+
+def labelled_element(obj):
     """
     Returns labelled element of the object (extractor or labelled region)
     """
@@ -282,13 +276,13 @@ class TransposedDataExtractor(object):
 
 _namef = operator.itemgetter(0)
 _valuef = operator.itemgetter(1)
-def _attrs2dict(attributes):
+def attrs2dict(attributes):
     """convert a list of attributes (name, value) tuples
     into a dict of lists.
 
     For example:
     >>> l = [('name', 'sofa'), ('colour', 'red'), ('colour', 'green')]
-    >>> _attrs2dict(l) == {'name': ['sofa'], 'colour': ['red', 'green']}
+    >>> attrs2dict(l) == {'name': ['sofa'], 'colour': ['red', 'green']}
     True
     """
     grouped_data = groupby(sorted(attributes, key=_namef), _namef)
@@ -326,6 +320,7 @@ def __init__(self, extractors, template_tokens):
         start_index = min(e.annotation.start_index for e in extractors)
         end_index = max(e.annotation.end_index for e in extractors)
         self.annotation = AnnotationTag(start_index, end_index)
+        self.best_match = longest_unique_subsequence
 
     def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
         """extract data from an extraction page
@@ -335,7 +330,7 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
         """
         if ignored_regions is None:
             ignored_regions = []
-        region_elements = sorted(self.extractors + ignored_regions, key=lambda x: _labelled(x).start_index)
+        region_elements = sorted(self.extractors + ignored_regions, key=lambda x: labelled_element(x).start_index)
         _, _, attributes = self._doextract(page, region_elements, start_index,
             end_index, **kwargs)
         # collect variant data, maintaining the order of variants
@@ -350,10 +345,10 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
             else:
                 items.append((k, v))
 
-        variant_records = [('variants', _attrs2dict(variants[vid])) \
+        variant_records = [('variants', attrs2dict(variants[vid])) \
             for vid in variant_ids]
         items += variant_records
-        return [_attrs2dict(items)]
+        return [attrs2dict(items)]
 
     def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs):
         """Carry out extraction of records using the given annotations
@@ -364,30 +359,30 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs):
         nested_regions = nested_regions or []
         ignored_regions = ignored_regions or []
         first_region, following_regions = region_elements[0], region_elements[1:]
-        while following_regions and _labelled(following_regions[0]).start_index \
-                < _labelled(first_region).end_index:
+        while following_regions and labelled_element(following_regions[0]).start_index \
+                < labelled_element(first_region).end_index:
             region = following_regions.pop(0)
-            labelled = _labelled(region)
+            labelled = labelled_element(region)
             if isinstance(labelled, AnnotationTag) or (nested_regions and \
-                    _labelled(nested_regions[-1]).start_index < labelled.start_index \
-                    < _labelled(nested_regions[-1]).end_index):
+                    labelled_element(nested_regions[-1]).start_index < labelled.start_index \
+                    < labelled_element(nested_regions[-1]).end_index):
                 nested_regions.append(region)
             else:
                 ignored_regions.append(region)
         extracted_data = []
         # end_index is inclusive, but similar_region treats it as exclusive
         end_region = None if end_index is None else end_index + 1
-        labelled = _labelled(first_region)
+        labelled = labelled_element(first_region)
         score, pindex, sindex = \
             similar_region(page.page_tokens, self.template_tokens,
-                labelled, start_index, end_region, **kwargs)
+                labelled, start_index, end_region, self.best_match, **kwargs)
         if score > 0:
             if isinstance(labelled, AnnotationTag):
                 similar_ignored_regions = []
                 start = pindex
                 for i in ignored_regions:
                     s, p, e = similar_region(page.page_tokens, self.template_tokens, \
-                        i, start, sindex, **kwargs)
+                        i, start, sindex, self.best_match, **kwargs)
                     if s > 0:
                         similar_ignored_regions.append(PageRegion(p, e))
                     start = e or start
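A side effect of the renames in this file: _labelled and _attrs2dict lose their underscore prefixes and are exported via __all__, so downstream code can rely on them directly. For example, per the doctest above:

from scrapely.extraction.regionextract import attrs2dict

# repeated attribute names are grouped into lists
print(attrs2dict([('name', 'sofa'), ('colour', 'red'), ('colour', 'green')]))
# {'name': ['sofa'], 'colour': ['red', 'green']}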
39 changes: 35 additions & 4 deletions scrapely/extraction/similarity.py
@@ -78,8 +78,39 @@ def longest_unique_subsequence(to_search, subsequence, range_start=0,
         return best2[0]
     return None, None
 
+def first_longest_subsequence(to_search, subsequence, range_start=0, range_end=None):
+    """Find the first longest subsequence of the items in a list or array.
+
+    range_start and range_end specify a range in which the match must begin.
+
+    For example, the longest match occurs at index 2 and has length 3:
+    >>> to_search = [6, 3, 2, 4, 3, 2, 5]
+    >>> first_longest_subsequence(to_search, [2, 4, 3])
+    (2, 3)
+
+    When there are two equally long subsequences, it returns the nearest one:
+    >>> first_longest_subsequence(to_search, [3, 2])
+    (1, 2)
+
+    >>> first_longest_subsequence([], [3, 2])
+    (None, None)
+    """
+    startval = subsequence[0]
+    if range_end is None:
+        range_end = len(to_search)
+
+    # the comparison to startval ensures only matches of length >= 1 and
+    # reduces the number of calls to the common_prefix_length function
+    matches = [(i, common_prefix_length(to_search[i:], subsequence)) \
+        for i in xrange(range_start, range_end) if startval == to_search[i]]
+
+    if not matches:
+        return None, None
+    # break ties on position, preferring the smaller index (the nearest match)
+    return max(matches, key=lambda x: (x[1], -x[0]))
+
 def similar_region(extracted_tokens, template_tokens, labelled_region,
-        range_start=0, range_end=None, **kwargs):
+        range_start=0, range_end=None, best_match=longest_unique_subsequence, **kwargs):
     """Given a labelled section in a template, identify a similar region
     in the extracted tokens.

@@ -100,7 +131,7 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
     # reverse order
     reverse_prefix = template_tokens[labelled_region.start_index::-1]
     reverse_tokens = extracted_tokens[::-1]
-    (rpi, pscore) = longest_unique_subsequence(reverse_tokens, reverse_prefix,
+    (rpi, pscore) = best_match(reverse_tokens, reverse_prefix,
         data_length - range_end, data_length - range_start)
 
     # None means nothing extracted. Index 0 means there cannot be a suffix.
@@ -119,7 +150,7 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
 
     # if it's not a paired tag, use the best match between prefix & suffix
     if labelled_region.start_index == labelled_region.end_index:
-        (match_index, sscore) = longest_unique_subsequence(extracted_tokens,
+        (match_index, sscore) = best_match(extracted_tokens,
             suffix, prefix_index, range_end)
         if match_index == prefix_index:
             return (pscore + sscore, prefix_index, match_index)
@@ -131,7 +162,7 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
 
     # calculate the suffix match on the tokens following the prefix. We could
     # consider the whole page and require a good match.
-    (match_index, sscore) = longest_unique_subsequence(extracted_tokens,
+    (match_index, sscore) = best_match(extracted_tokens,
         suffix, prefix_index + 1, range_end)
     if match_index is None:
         return 0, None, None
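The practical difference between the two matchers shows up on ties. Using the doctest data, first_longest_subsequence commits to the nearest of two equally long candidates, whereas longest_unique_subsequence (per its visible tail above) falls back to (None, None) when the top two matches tie; that tie-handling expectation is my reading of the existing code, not a doctest from this diff:

from scrapely.extraction.similarity import (
    longest_unique_subsequence, first_longest_subsequence)

to_search = [6, 3, 2, 4, 3, 2, 5]
# [3, 2] occurs at index 1 and again at index 4, equally long
print(longest_unique_subsequence(to_search, [3, 2]))  # assumed: (None, None)
print(first_longest_subsequence(to_search, [3, 2]))   # (1, 2), per the doctest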
3 changes: 2 additions & 1 deletion scrapely/template.py
@@ -81,7 +81,8 @@ def annotate_fragment(self, index, field):
                 a = ' data-scrapy-annotate="%s"' % json.dumps(d).replace('"', '&quot;')
                 p = self.htmlpage
                 p.body = p.body[:f.end-1] + a + p.body[f.end-1:]
-                break
+                return True
+        return False
 
     def get_template(self):
         """Return the generated template as a HtmlPage object"""
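With annotate_fragment now reporting success instead of silently breaking out of its loop, callers can detect when no fragment was annotated. A hypothetical check, where template_maker stands in for an instance of the class in scrapely/template.py:

def annotate_or_fail(template_maker, index, field):
    # annotate_fragment returns True on success, False if no matching
    # open tag was found at or after `index`
    if not template_maker.annotate_fragment(index, field):
        raise ValueError('no annotatable fragment at index %d' % index)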