
Commit 46b96ee

change the MDR output to a list of dicts
1 parent 87c1677 commit 46b96ee
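
For orientation, here is a minimal sketch of the output-shape change this commit makes. The field names ('date', 'text') and the fallback group key 'default_group' come from the diff below; the literal values are purely illustrative.

    # Before: extract() returned a single dict mapping each annotated field to a
    # parallel list of values gathered across all records.
    old_output = [{
        'date': ['2014-07-02', '2014-05-18'],
        'text': ['newest review', 'oldest review'],
    }]

    # After: each record keeps its own dict of fields, and the list of records is
    # grouped under the annotation's group name ('default_group' when the
    # annotation carries no 'listingDateGroupName').
    new_output = [{
        'default_group': [
            {'date': ['2014-07-02'], 'text': ['newest review']},
            {'date': ['2014-05-18'], 'text': ['oldest review']},
        ],
    }]

    assert isinstance(new_output[0]['default_group'], list)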

File tree: 2 files changed, +44 −44 lines

    scrapely/extraction/regionextract.py
    tests/test_mdr_extractor.py

scrapely/extraction/regionextract.py

Lines changed: 30 additions & 31 deletions
@@ -118,11 +118,8 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
             region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions))
         else:
             region = extraction_page.htmlpage_region_inside(start_index, end_index)
-        if kwargs.get('no_content_validate'):
-            validated = True
-        else:
-            validated = self.content_validate(region)
-        return [(self.annotation.surrounds_attribute, self.content_validate(region))] if validated else []
+        validated = self.content_validate(region)
+        return [(self.annotation.surrounds_attribute, validated)] if validated else []
 
     def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
         data = []
@@ -497,25 +494,27 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **k
             warnings.warn("MDRExtractor can't find element with xpath: %s" % self.xpath)
             return [{}]
 
-        items = {}
-
-        _, mapping = mdr.extract(element[0], record=self.record)
-        for seed_elem, elements in mapping.iteritems():
-            annotation_elem = [elem for elem in ([seed_elem] + elements) if elem.attrib.get('data-scrapy-annotate')]
-            if annotation_elem:
-                annotation = self._read_template_annotation(annotation_elem[0])
-                name = annotation.get('annotations', {}).get('content')
-                ex = self.extractors[name]
-                for elem in elements:
+        items = []
+        _, mappings = mdr.extract(element[0], record=self.record)
+
+        for record, mapping in mappings.iteritems():
+            item = {}
+            for seed_elem, element in mapping.iteritems():
+                annotation_elem = [elem for elem in [seed_elem, element] if elem.attrib.get('data-scrapy-annotate')]
+                if annotation_elem:
+                    annotation = self._read_template_annotation(annotation_elem[0])
+                    group_name = annotation.get('listingDateGroupName', 'default_group')
+                    name = annotation.get('annotations', {}).get('content')
+                    ex = self.extractors[name]
                     elem_page = HtmlPage(None, {}, tostring(elem, encoding='unicode'))
                     parsed_elem_page = parse_extraction_page(self.token_dict, elem_page)
-                    items.setdefault(name, []).extend([v for _, v in ex.extract(parsed_elem_page, 0,
-                        len(parsed_elem_page.page_tokens) - 1, no_content_validate=True)])
+                    item.setdefault(name, []).extend([v for _, v in ex.extract(parsed_elem_page, 0,
+                        len(parsed_elem_page.page_tokens) - 1)])
+            items.append(item)
 
         if items:
-            lengths = [len(values) for values in items.values()]
-            assert len(set(lengths)) == 1, 'extract items %r should be have same count' % items
-            return [items]
+            return [{group_name: items}]
+        return []
 
     @classmethod
     def apply(cls, template, extractors):
@@ -558,8 +557,8 @@ def apply(cls, template, extractors):
                 if name == extractor.annotation.surrounds_attribute:
                     listing_data_extractors.append(extractor)
                     extractors.remove(extractor)
-        record, mapping = mdr.extract(candidate)
-        cls._propagate_annotations(mapping)
+        record, mappings = mdr.extract(candidate)
+        cls._propagate_annotations(mappings)
         return cls(template.token_dict, cls._get_candidate_xpath(doc, candidate), record, listing_data_extractors), extractors
 
     return None, extractors
@@ -605,16 +604,16 @@ def _get_common_ancestor_xpath(doc, elements):
         return "/".join(common_prefix(*[doc.getpath(elem).split('/') for elem in elements]))
 
     @staticmethod
-    def _propagate_annotations(mapping):
-        for elem, targ_elements in mapping.iteritems():
-            elements = [elem] + targ_elements
-            for _elem in elements:
-                annotation = _elem.attrib.get('data-scrapy-annotate')
+    def _propagate_annotations(mappings):
+        for record, mapping in mappings.iteritems():
+            for elem, targ_elem in mapping.iteritems():
+                for _elem in [elem, targ_elem]:
+                    annotation = _elem.attrib.get('data-scrapy-annotate')
+                    if annotation:
+                        break
                 if annotation:
-                    break
-            if annotation:
-                for _elem in elements:
-                    _elem.attrib['data-scrapy-annotate'] = annotation
+                    for _elem in [elem, targ_elem]:
+                        _elem.attrib['data-scrapy-annotate'] = annotation
 
     def __repr__(self):
         return "MdrExtractor(%s %r)" % (self.xpath, self.extractors)

tests/test_mdr_extractor.py

Lines changed: 14 additions & 13 deletions
@@ -69,14 +69,13 @@ def test_extract(self):
        ex2 = BasicTypeExtractor(template.annotations[-2], {'date': d2})
 
        extractor = MdrExtractor.apply(template, [ex1, ex2])[0]
-       items = extractor.extract(page)[0]
+       items = extractor.extract(page)[0].values()[0]
 
-       self.assertEqual(len(items['date']), 40)
-       self.assertEqual(len(items['text']), 40)
+       self.assertEqual(len(items), 40)
 
        # extracted items are orderred
-       self.assertEquals(_get_value_with_xpath(items['date'][0], '//meta/@content'), '2014-07-02')
-       self.assertEquals(_get_value_with_xpath(items['date'][-1], '//meta/@content'), '2014-05-18')
+       self.assertEquals(_get_value_with_xpath(items[0]['date'][0], '//meta/@content'), '2014-07-02')
+       self.assertEquals(_get_value_with_xpath(items[-1]['date'][0], '//meta/@content'), '2014-05-18')
 
    def test_extract2(self):
        try:
@@ -90,16 +89,16 @@ def test_extract2(self):
        ex1 = BasicTypeExtractor(template.annotations[-1], {'review': d1})
 
        extractor = MdrExtractor.apply(template, [ex1])[0]
-       items = extractor.extract(page)[0]
-       self.assertEqual(len(items['review']), 6)
+       items = extractor.extract(page)[0].values()[0]
+       self.assertEqual(len(items), 6)
 
        # extracted items are orderred
-       self.assertEquals(items['review'][0], "Although it's expensive book I think it "
+       self.assertEquals(items[0]['review'][0], "Although it's expensive book I think it "
            "worth the money as it is the \"Bible\" of Machine Learning and Pattern recognition. However, "
            "has a lot of mathematics meaning that a strong mathematical background is necessary. "
            "I suggest it especially for PhD candidates in this field.")
 
-       self.assertEquals(items['review'][-1], "As a newbie to pattern recognition I found this book very helpful. "
+       self.assertEquals(items[-1]['review'][0], "As a newbie to pattern recognition I found this book very helpful. "
            "It is the clearest book I ever read! Accompanying examples and material are very illuminating. "
            "I particularly appreciated the gradual introduction of key concepts, often accompanied with practical "
            "examples and stimulating exercises.")
@@ -123,9 +122,11 @@ def test_ibl_extraction(self):
 
        self.assertEqual(actual_output[0].get('name')[0].strip(), 'Gary Danko')
        self.assertEqual(actual_output[0].get('phone')[0].strip(), '(415) 749-2060')
-       self.assertEqual(len(actual_output[0].get('date')), 40)
-       self.assertEqual(len(actual_output[0].get('text')), 40)
+
+       self.assertEqual(len(actual_output[0].get('default_group')), 40)
 
        # extracted items are orderred
-       self.assertEquals(_get_value_with_xpath(actual_output[0].get('date')[0], '//meta/@content'), '2014-07-02')
-       self.assertEquals(_get_value_with_xpath(actual_output[0].get('date')[-1], '//meta/@content'), '2014-05-18')
+       print actual_output[0].get('default_group')
+       print actual_output[0].get('default_group')[0]
+       self.assertEquals(_get_value_with_xpath(actual_output[0].get('default_group')[0]['date'][0], '//meta/@content'), '2014-07-02')
+       self.assertEquals(_get_value_with_xpath(actual_output[0].get('default_group')[-1]['date'][0], '//meta/@content'), '2014-05-18')
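
As the updated assertions show, callers now take the single group value and index into per-record dicts. A tiny sketch of that access pattern, with a literal standing in for extractor.extract(page)[0] (in the real tests the 'date' values are HTML fragments queried via _get_value_with_xpath):

    result = {'default_group': [
        {'date': ['2014-07-02']},
        {'date': ['2014-05-18']},
    ]}

    records = list(result.values())[0]   # the tests use result.values()[0] under Python 2
    assert records[0]['date'][0] == '2014-07-02'
    assert records[-1]['date'][0] == '2014-05-18'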
