@@ -118,11 +118,8 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
118
118
region = FragmentedHtmlPageRegion (extraction_page .htmlpage , list (regions ))
119
119
else :
120
120
region = extraction_page .htmlpage_region_inside (start_index , end_index )
121
- if kwargs .get ('no_content_validate' ):
122
- validated = True
123
- else :
124
- validated = self .content_validate (region )
125
- return [(self .annotation .surrounds_attribute , self .content_validate (region ))] if validated else []
121
+ validated = self .content_validate (region )
122
+ return [(self .annotation .surrounds_attribute , validated )] if validated else []
126
123
127
124
def _extract_attribute (self , extraction_page , start_index , end_index , ignored_regions = None , ** kwargs ):
128
125
data = []
@@ -497,25 +494,27 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **k
497
494
warnings .warn ("MDRExtractor can't find element with xpath: %s" % self .xpath )
498
495
return [{}]
499
496
500
- items = {}
501
-
502
- _ , mapping = mdr .extract (element [0 ], record = self .record )
503
- for seed_elem , elements in mapping .iteritems ():
504
- annotation_elem = [elem for elem in ([seed_elem ] + elements ) if elem .attrib .get ('data-scrapy-annotate' )]
505
- if annotation_elem :
506
- annotation = self ._read_template_annotation (annotation_elem [0 ])
507
- name = annotation .get ('annotations' , {}).get ('content' )
508
- ex = self .extractors [name ]
509
- for elem in elements :
497
+ items = []
498
+ _ , mappings = mdr .extract (element [0 ], record = self .record )
499
+
500
+ for record , mapping in mappings .iteritems ():
501
+ item = {}
502
+ for seed_elem , element in mapping .iteritems ():
503
+ annotation_elem = [elem for elem in [seed_elem , element ] if elem .attrib .get ('data-scrapy-annotate' )]
504
+ if annotation_elem :
505
+ annotation = self ._read_template_annotation (annotation_elem [0 ])
506
+ group_name = annotation .get ('listingDateGroupName' , 'default_group' )
507
+ name = annotation .get ('annotations' , {}).get ('content' )
508
+ ex = self .extractors [name ]
510
509
elem_page = HtmlPage (None , {}, tostring (elem , encoding = 'unicode' ))
511
510
parsed_elem_page = parse_extraction_page (self .token_dict , elem_page )
512
- items .setdefault (name , []).extend ([v for _ , v in ex .extract (parsed_elem_page , 0 ,
513
- len (parsed_elem_page .page_tokens ) - 1 , no_content_validate = True )])
511
+ item .setdefault (name , []).extend ([v for _ , v in ex .extract (parsed_elem_page , 0 ,
512
+ len (parsed_elem_page .page_tokens ) - 1 )])
513
+ items .append (item )
514
514
515
515
if items :
516
- lengths = [len (values ) for values in items .values ()]
517
- assert len (set (lengths )) == 1 , 'extract items %r should be have same count' % items
518
- return [items ]
516
+ return [{group_name : items }]
517
+ return []
519
518
520
519
@classmethod
521
520
def apply (cls , template , extractors ):
@@ -558,8 +557,8 @@ def apply(cls, template, extractors):
558
557
if name == extractor .annotation .surrounds_attribute :
559
558
listing_data_extractors .append (extractor )
560
559
extractors .remove (extractor )
561
- record , mapping = mdr .extract (candidate )
562
- cls ._propagate_annotations (mapping )
560
+ record , mappings = mdr .extract (candidate )
561
+ cls ._propagate_annotations (mappings )
563
562
return cls (template .token_dict , cls ._get_candidate_xpath (doc , candidate ), record , listing_data_extractors ), extractors
564
563
565
564
return None , extractors
@@ -605,16 +604,16 @@ def _get_common_ancestor_xpath(doc, elements):
605
604
return "/" .join (common_prefix (* [doc .getpath (elem ).split ('/' ) for elem in elements ]))
606
605
607
606
@staticmethod
def _propagate_annotations(mappings):
    """Share the ``data-scrapy-annotate`` attribute within each element pair.

    ``mappings`` is walked as a dict of dicts: every outer value is a mapping
    whose items are ``(seed element, target element)`` pairs. Each element
    must expose a dict-like ``attrib``. For every pair, the first truthy
    ``data-scrapy-annotate`` value found (the seed element is checked first)
    is written onto both elements. Pairs where neither element carries the
    attribute are left untouched.

    Returns None; the elements are modified in place.
    """
    # The outer keys are not used here, so only the inner mappings are
    # visited. ``values()``/``items()`` exist on both Python 2 and Python 3
    # dicts, unlike ``iteritems()``.
    for mapping in mappings.values():
        for seed_elem, targ_elem in mapping.items():
            pair = (seed_elem, targ_elem)
            annotation = None
            for member in pair:
                annotation = member.attrib.get('data-scrapy-annotate')
                if annotation:
                    break
            if not annotation:
                # Nothing to propagate for this pair.
                continue
            for member in pair:
                member.attrib['data-scrapy-annotate'] = annotation
618
617
619
618
def __repr__(self):
    """Return a debug string showing the xpath and the extractors."""
    template = "MdrExtractor(%s %r)"
    fields = (self.xpath, self.extractors)
    return template % fields
0 commit comments