
Commit 46b96ee

change the MDR output to a list of dicts
1 parent 87c1677 commit 46b96ee
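
For orientation, here is a minimal sketch of the output-shape change this commit makes. The field names ('date', 'text') and the fallback group key 'default_group' come from the diff below; the literal values are purely illustrative.

    # Before: extract() returned a single dict mapping each annotated field to a
    # parallel list of values gathered across all records.
    old_output = [{
        'date': ['2014-07-02', '2014-05-18'],
        'text': ['newest review', 'oldest review'],
    }]

    # After: each record keeps its own dict of fields, and the list of records is
    # grouped under the annotation's group name ('default_group' when the
    # annotation carries no 'listingDateGroupName').
    new_output = [{
        'default_group': [
            {'date': ['2014-07-02'], 'text': ['newest review']},
            {'date': ['2014-05-18'], 'text': ['oldest review']},
        ],
    }]

    assert isinstance(new_output[0]['default_group'], list)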

File tree: 2 files changed, +44 −44 lines

    scrapely/extraction/regionextract.py
    tests/test_mdr_extractor.py

scrapely/extraction/regionextract.py

Lines changed: 30 additions & 31 deletions
@@ -118,11 +118,8 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
             region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions))
         else:
             region = extraction_page.htmlpage_region_inside(start_index, end_index)
-        if kwargs.get('no_content_validate'):
-            validated = True
-        else:
-            validated = self.content_validate(region)
-        return [(self.annotation.surrounds_attribute, self.content_validate(region))] if validated else []
+        validated = self.content_validate(region)
+        return [(self.annotation.surrounds_attribute, validated)] if validated else []
 
     def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
         data = []
@@ -497,25 +494,27 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **k
             warnings.warn("MDRExtractor can't find element with xpath: %s" % self.xpath)
             return [{}]
 
-        items = {}
-
-        _, mapping = mdr.extract(element[0], record=self.record)
-        for seed_elem, elements in mapping.iteritems():
-            annotation_elem = [elem for elem in ([seed_elem] + elements) if elem.attrib.get('data-scrapy-annotate')]
-            if annotation_elem:
-                annotation = self._read_template_annotation(annotation_elem[0])
-                name = annotation.get('annotations', {}).get('content')
-                ex = self.extractors[name]
-                for elem in elements:
+        items = []
+        _, mappings = mdr.extract(element[0], record=self.record)
+
+        for record, mapping in mappings.iteritems():
+            item = {}
+            for seed_elem, element in mapping.iteritems():
+                annotation_elem = [elem for elem in [seed_elem, element] if elem.attrib.get('data-scrapy-annotate')]
+                if annotation_elem:
+                    annotation = self._read_template_annotation(annotation_elem[0])
+                    group_name = annotation.get('listingDateGroupName', 'default_group')
+                    name = annotation.get('annotations', {}).get('content')
+                    ex = self.extractors[name]
                     elem_page = HtmlPage(None, {}, tostring(elem, encoding='unicode'))
                     parsed_elem_page = parse_extraction_page(self.token_dict, elem_page)
-                    items.setdefault(name, []).extend([v for _, v in ex.extract(parsed_elem_page, 0,
-                        len(parsed_elem_page.page_tokens) - 1, no_content_validate=True)])
+                    item.setdefault(name, []).extend([v for _, v in ex.extract(parsed_elem_page, 0,
+                        len(parsed_elem_page.page_tokens) - 1)])
+            items.append(item)
 
         if items:
-            lengths = [len(values) for values in items.values()]
-            assert len(set(lengths)) == 1, 'extract items %r should be have same count' % items
-            return [items]
+            return [{group_name: items}]
+        return []
 
     @classmethod
     def apply(cls, template, extractors):
@@ -558,8 +557,8 @@ def apply(cls, template, extractors):
                 if name == extractor.annotation.surrounds_attribute:
                     listing_data_extractors.append(extractor)
                     extractors.remove(extractor)
-        record, mapping = mdr.extract(candidate)
-        cls._propagate_annotations(mapping)
+        record, mappings = mdr.extract(candidate)
+        cls._propagate_annotations(mappings)
         return cls(template.token_dict, cls._get_candidate_xpath(doc, candidate), record, listing_data_extractors), extractors
 
     return None, extractors
@@ -605,16 +604,16 @@ def _get_common_ancestor_xpath(doc, elements):
         return "/".join(common_prefix(*[doc.getpath(elem).split('/') for elem in elements]))
 
     @staticmethod
-    def _propagate_annotations(mapping):
-        for elem, targ_elements in mapping.iteritems():
-            elements = [elem] + targ_elements
-            for _elem in elements:
-                annotation = _elem.attrib.get('data-scrapy-annotate')
+    def _propagate_annotations(mappings):
+        for record, mapping in mappings.iteritems():
+            for elem, targ_elem in mapping.iteritems():
+                for _elem in [elem, targ_elem]:
+                    annotation = _elem.attrib.get('data-scrapy-annotate')
+                    if annotation:
+                        break
                 if annotation:
-                    break
-            if annotation:
-                for _elem in elements:
-                    _elem.attrib['data-scrapy-annotate'] = annotation
+                    for _elem in [elem, targ_elem]:
+                        _elem.attrib['data-scrapy-annotate'] = annotation
 
     def __repr__(self):
         return "MdrExtractor(%s %r)" % (self.xpath, self.extractors)

tests/test_mdr_extractor.py

Lines changed: 14 additions & 13 deletions
@@ -69,14 +69,13 @@ def test_extract(self):
        ex2 = BasicTypeExtractor(template.annotations[-2], {'date': d2})
 
        extractor = MdrExtractor.apply(template, [ex1, ex2])[0]
-       items = extractor.extract(page)[0]
+       items = extractor.extract(page)[0].values()[0]
 
-       self.assertEqual(len(items['date']), 40)
-       self.assertEqual(len(items['text']), 40)
+       self.assertEqual(len(items), 40)
 
        # extracted items are orderred
-       self.assertEquals(_get_value_with_xpath(items['date'][0], '//meta/@content'), '2014-07-02')
-       self.assertEquals(_get_value_with_xpath(items['date'][-1], '//meta/@content'), '2014-05-18')
+       self.assertEquals(_get_value_with_xpath(items[0]['date'][0], '//meta/@content'), '2014-07-02')
+       self.assertEquals(_get_value_with_xpath(items[-1]['date'][0], '//meta/@content'), '2014-05-18')
 
    def test_extract2(self):
        try:
@@ -90,16 +89,16 @@ def test_extract2(self):
        ex1 = BasicTypeExtractor(template.annotations[-1], {'review': d1})
 
        extractor = MdrExtractor.apply(template, [ex1])[0]
-       items = extractor.extract(page)[0]
-       self.assertEqual(len(items['review']), 6)
+       items = extractor.extract(page)[0].values()[0]
+       self.assertEqual(len(items), 6)
 
        # extracted items are orderred
-       self.assertEquals(items['review'][0], "Although it's expensive book I think it "
+       self.assertEquals(items[0]['review'][0], "Although it's expensive book I think it "
            "worth the money as it is the \"Bible\" of Machine Learning and Pattern recognition. However, "
            "has a lot of mathematics meaning that a strong mathematical background is necessary. "
            "I suggest it especially for PhD candidates in this field.")
 
-       self.assertEquals(items['review'][-1], "As a newbie to pattern recognition I found this book very helpful. "
+       self.assertEquals(items[-1]['review'][0], "As a newbie to pattern recognition I found this book very helpful. "
            "It is the clearest book I ever read! Accompanying examples and material are very illuminating. "
            "I particularly appreciated the gradual introduction of key concepts, often accompanied with practical "
            "examples and stimulating exercises.")
@@ -123,9 +122,11 @@ def test_ibl_extraction(self):
 
        self.assertEqual(actual_output[0].get('name')[0].strip(), 'Gary Danko')
        self.assertEqual(actual_output[0].get('phone')[0].strip(), '(415) 749-2060')
-       self.assertEqual(len(actual_output[0].get('date')), 40)
-       self.assertEqual(len(actual_output[0].get('text')), 40)
+
+       self.assertEqual(len(actual_output[0].get('default_group')), 40)
 
        # extracted items are orderred
-       self.assertEquals(_get_value_with_xpath(actual_output[0].get('date')[0], '//meta/@content'), '2014-07-02')
-       self.assertEquals(_get_value_with_xpath(actual_output[0].get('date')[-1], '//meta/@content'), '2014-05-18')
+       print actual_output[0].get('default_group')
+       print actual_output[0].get('default_group')[0]
+       self.assertEquals(_get_value_with_xpath(actual_output[0].get('default_group')[0]['date'][0], '//meta/@content'), '2014-07-02')
+       self.assertEquals(_get_value_with_xpath(actual_output[0].get('default_group')[-1]['date'][0], '//meta/@content'), '2014-05-18')
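
As the updated assertions show, callers now take the single group value and index into per-record dicts. A tiny sketch of that access pattern, with a literal standing in for extractor.extract(page)[0] (in the real tests the 'date' values are HTML fragments queried via _get_value_with_xpath):

    result = {'default_group': [
        {'date': ['2014-07-02']},
        {'date': ['2014-05-18']},
    ]}

    records = list(result.values())[0]   # the tests use result.values()[0] under Python 2
    assert records[0]['date'][0] == '2014-07-02'
    assert records[-1]['date'][0] == '2014-05-18'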
