global: use relative xpath expressions
* Changes to use relative xpath expressions where possible to
  support multiple records per file/crawl correctly.

Signed-off-by: Jan Aage Lavik <jan.age.lavik@cern.ch>
jalavik committed Jun 23, 2016
1 parent 5b094cc commit dcf4857
Showing 10 changed files with 74 additions and 74 deletions.
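
Not part of the commit itself: a minimal sketch of the failure mode the commit message describes, assuming Scrapy's Selector API as used throughout these spiders (the XML snippet and values below are made up for illustration):

    from scrapy.selector import Selector

    # Hypothetical feed file carrying two records in one document.
    xml = """
    <collection>
      <record><fpage>1</fpage></record>
      <record><fpage>100</fpage></record>
    </collection>
    """

    for record in Selector(text=xml, type="xml").xpath("//record"):
        # An absolute '//' expression is evaluated from the document root,
        # so every record sees the values of every record in the file:
        print(record.xpath("//fpage/text()").extract())   # ['1', '100'] both times
        # A relative './/' expression is evaluated from the current node:
        print(record.xpath(".//fpage/text()").extract())  # ['1'], then ['100']

With a single record per file the two forms happen to return the same values; absolute expressions only misbehave once a file or crawl carries several records, hence the blanket switch in the hunks below.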
4 changes: 2 additions & 2 deletions docs/contributing.rst
@@ -131,8 +131,8 @@ the ``add_xpath`` function, but you are not forced to do so:
 
 .. code-block:: python
 
-    fpage = node.xpath('//fpage/text()').extract()
-    lpage = node.xpath('//lpage/text()').extract()
+    fpage = node.xpath('.//fpage/text()').extract()
+    lpage = node.xpath('.//lpage/text()').extract()
     if fpage:
         record.add_value('journal_fpage', fpage)
     if lpage:
2 changes: 1 addition & 1 deletion docs/guide.rst
@@ -52,5 +52,5 @@ You can then run xpath expressions in the shell:
 
 .. code-block:: python
 
-    response.selector.xpath("//abstract").extract()
+    response.selector.xpath(".//abstract").extract()
     ["...some abstract ..."]
38 changes: 19 additions & 19 deletions hepcrawl/extractors/jats.py
@@ -27,29 +27,29 @@ def format_date(day, month, year):
             year = int(get_first(year, 1))
             return datetime.date(day=day, month=month, year=year).isoformat()
 
-        if node.xpath("//date[@date-type='published']"):
+        if node.xpath(".//date[@date-type='published']"):
             return format_date(
-                day=node.xpath("//date[@date-type='published']/day/text()").extract(),
-                month=node.xpath("//date[@date-type='published']/month/text()").extract(),
-                year=node.xpath("//date[@date-type='published']/year/text()").extract(),
+                day=node.xpath(".//date[@date-type='published']/day/text()").extract(),
+                month=node.xpath(".//date[@date-type='published']/month/text()").extract(),
+                year=node.xpath(".//date[@date-type='published']/year/text()").extract(),
             )
-        elif node.xpath("//pub-date[@pub-type='ppub']"):
+        elif node.xpath(".//pub-date[@pub-type='ppub']"):
             return format_date(
-                day=node.xpath("//pub-date[@pub-type='ppub']/day/text()").extract(),
-                month=node.xpath("//pub-date[@pub-type='ppub']/month/text()").extract(),
-                year=node.xpath("//pub-date[@pub-type='ppub']/year/text()").extract(),
+                day=node.xpath(".//pub-date[@pub-type='ppub']/day/text()").extract(),
+                month=node.xpath(".//pub-date[@pub-type='ppub']/month/text()").extract(),
+                year=node.xpath(".//pub-date[@pub-type='ppub']/year/text()").extract(),
             )
-        elif node.xpath("//pub-date[@pub-type='epub']"):
+        elif node.xpath(".//pub-date[@pub-type='epub']"):
             return format_date(
-                day=node.xpath("//pub-date[@pub-type='epub']/day/text()").extract(),
-                month=node.xpath("//pub-date[@pub-type='epub']/month/text()").extract(),
-                year=node.xpath("//pub-date[@pub-type='epub']/year/text()").extract(),
+                day=node.xpath(".//pub-date[@pub-type='epub']/day/text()").extract(),
+                month=node.xpath(".//pub-date[@pub-type='epub']/month/text()").extract(),
+                year=node.xpath(".//pub-date[@pub-type='epub']/year/text()").extract(),
             )
-        elif node.xpath("//pub-date"):
+        elif node.xpath(".//pub-date"):
             return format_date(
-                day=node.xpath("//pub-date/day/text()").extract(),
-                month=node.xpath("//pub-date/month/text()").extract(),
-                year=node.xpath("//pub-date/year/text()").extract(),
+                day=node.xpath(".//pub-date/day/text()").extract(),
+                month=node.xpath(".//pub-date/month/text()").extract(),
+                year=node.xpath(".//pub-date/year/text()").extract(),
             )
         else:
             # In the worst case we return today
@@ -59,7 +59,7 @@ def _get_keywords(self, node):
         """Return tuple of keywords, PACS from node."""
         free_keywords = []
         classification_numbers = []
-        for group in node.xpath('//kwd-group'):
+        for group in node.xpath('.//kwd-group'):
             if "pacs" in group.xpath('@kwd-group-type').extract():
                 for keyword in group.xpath('kwd/text()').extract():
                     classification_numbers.append(keyword)
@@ -70,14 +70,14 @@ def _get_keywords(self, node):
 
     def _get_authors(self, node):
         authors = []
-        for contrib in node.xpath("//contrib[@contrib-type='author']"):
+        for contrib in node.xpath(".//contrib[@contrib-type='author']"):
             surname = contrib.xpath("string-name/surname/text()").extract()
             given_names = contrib.xpath("string-name/given-names/text()").extract()
             email = contrib.xpath("email/text()").extract()
             affiliations = contrib.xpath('aff')
             reffered_id = contrib.xpath("xref[@ref-type='aff']/@rid").extract()
             if reffered_id:
-                affiliations += node.xpath("//aff[@id='{0}']".format(
+                affiliations += node.xpath(".//aff[@id='{0}']".format(
                     get_first(reffered_id))
                 )
             affiliations = [
4 changes: 2 additions & 2 deletions hepcrawl/spiders/dnb_spider.py
@@ -181,13 +181,13 @@ def scrape_for_abstract(self, response):
             abstract_raw = node.xpath(
                 "//div[@class='simple-item-view-abstract']/span/text()").extract()
         elif "hss.ulb.uni-bonn.de" in domain:
-            abstract_raw = node.xpath("//text()[contains(.,'Zusammenfassung')"
+            abstract_raw = node.xpath(".//text()[contains(.,'Zusammenfassung')"
                 "or contains(., 'Abstract')]/ancestor::*[self::tr]/descendant::*[position() > 1]/text()").extract()
         elif "kups.ub.uni-koeln.de" in domain:
             abstract_raw = node.xpath(
                 "//div[@class='ep_summary_content_main']/h2/following-sibling::p/text()").extract()
         # if "something else" in domain:
-        #     abstracts = node.xpath("//somewhere[@else]")
+        #     abstracts = node.xpath(".//somewhere[@else]")
 
         if abstract_raw:
             response.meta["abstract"] = [
62 changes: 31 additions & 31 deletions hepcrawl/spiders/elsevier_spider.py
@@ -154,7 +154,7 @@ def handle_feed(self, response):
         """Handle the feed and yield a request for every zip package found."""
         node = response.selector
         node.remove_namespaces()
-        entry = node.xpath("//entry")
+        entry = node.xpath(".//entry")
         for ent in entry:
             self.zip_file = ent.xpath("./link/@href").extract()[0]
             yield Request(self.zip_file, callback=self.handle_package)
@@ -181,36 +181,36 @@ def handle_package(self, response):
     @staticmethod
     def get_dois(node):
         """Get the dois."""
-        dois = node.xpath("//ja:item-info/ce:doi/text()")
+        dois = node.xpath(".//ja:item-info/ce:doi/text()")
         if not dois:
-            dois = node.xpath("//prism:doi/text()")
+            dois = node.xpath(".//prism:doi/text()")
         if dois:
             return dois.extract()
 
     def get_title(self, node):
         """Get article title."""
-        title = node.xpath("//ce:title/text()")
+        title = node.xpath(".//ce:title/text()")
         if not title:
-            title = node.xpath("//dct:title/text()")
+            title = node.xpath(".//dct:title/text()")
         if title:
             return self._fix_node_text(title.extract())
 
     @staticmethod
     def get_keywords(node):
         """Get article keywords."""
-        keywords = node.xpath("//ce:keyword/ce:text/text()")
+        keywords = node.xpath(".//ce:keyword/ce:text/text()")
         if not keywords:
-            keywords = node.xpath("//dct:subject//rdf:li/text()")
+            keywords = node.xpath(".//dct:subject//rdf:li/text()")
         if keywords:
             return keywords.extract()
 
     def get_copyright(self, node):
         """Get copyright information."""
-        cr_holder = node.xpath("//ce:copyright/text()")
-        cr_year = node.xpath("//ce:copyright/@year")
-        cr_statement = node.xpath("//ce:copyright/@type").extract()
+        cr_holder = node.xpath(".//ce:copyright/text()")
+        cr_year = node.xpath(".//ce:copyright/@year")
+        cr_statement = node.xpath(".//ce:copyright/@type").extract()
         if not (cr_statement or cr_holder) or "unknown" in " ".join(cr_statement).lower():
-            cr_statement = node.xpath("//prism:copyright/text()").extract()
+            cr_statement = node.xpath(".//prism:copyright/text()").extract()
         if len(cr_statement) > 1:
             cr_statement = [
                 st for st in cr_statement if "unknown" not in st.lower()]
@@ -277,7 +277,7 @@ def get_authors(self, node):
         """Get the authors."""
         authors = []
 
-        if node.xpath("//ce:author"):
+        if node.xpath(".//ce:author"):
             for author_group in node.xpath(".//ce:author-group"):
                 collaborations = author_group.xpath(
                     ".//ce:collaboration/ce:text/text()").extract()
@@ -303,8 +303,8 @@ def get_authors(self, node):
                 if collaborations:
                     auth_dict['collaborations'] = collaborations
                 authors.append(auth_dict)
-        elif node.xpath('//dct:creator'):
-            for author in node.xpath('//dct:creator/text()'):
+        elif node.xpath('.//dct:creator'):
+            for author in node.xpath('.//dct:creator/text()'):
                 authors.append({'raw_name': author.extract()})
 
         return authors
@@ -327,7 +327,7 @@ def get_date(self, node):
         """Get the year, month, and day."""
         # NOTE: this uses dateutils.py
 
-        cover_date = node.xpath("//prism:coverDate/text()").extract_first()
+        cover_date = node.xpath(".//prism:coverDate/text()").extract_first()
         cover_display_date = node.xpath(
             "//prism:coverDisplayDate/text()").extract_first()
         oa_effective = node.xpath(
@@ -350,20 +350,20 @@
 
     def get_doctype(self, node):
         """Return a doctype mapped from abbreviation."""
-        abbrv_doctype = node.xpath("//@docsubtype").extract()
+        abbrv_doctype = node.xpath(".//@docsubtype").extract()
         doctype = ''
         if abbrv_doctype:
             doctype = self.DOCTYPE_MAPPING[get_first(abbrv_doctype)]
-        elif node.xpath("//ja:article"):
+        elif node.xpath(".//ja:article"):
             doctype = "article"
-        elif node.xpath("//ja:simple-article"):
+        elif node.xpath(".//ja:simple-article"):
             doctype = "article"
-        elif node.xpath("//ja:book-review"):
+        elif node.xpath(".//ja:book-review"):
             doctype = "book-review"
-        elif node.xpath("//ja:exam"):
+        elif node.xpath(".//ja:exam"):
             doctype = "exam"
         # A scientific article in a conference proceedings is not cnf.
-        if node.xpath("//conference-info"):
+        if node.xpath(".//conference-info"):
             doctype = "conference_paper"
         if doctype:
             return doctype
@@ -665,7 +665,7 @@ def get_references(self, node):
         # ce:other-ref elements. In the original fulltext they can be weirdly
         # grouped/nested. See test record.
 
-        reference_groups = node.xpath("//ce:bib-reference")
+        reference_groups = node.xpath(".//ce:bib-reference")
        refs_out = []
        label = ""
        for ref_group in reference_groups:
@@ -713,7 +713,7 @@ def _get_publication(node):
         """Get publication (journal) title data."""
         publication = node.xpath(
             '//prism:publicationName/text()').extract_first()
-        jid = node.xpath('//ja:jid/text()').extract_first()
+        jid = node.xpath('.//ja:jid/text()').extract_first()
         if not publication and jid:
             # NOTE: JIDs should be mapped to standard journal names later
             publication = jid
@@ -745,15 +745,15 @@ def parse_node(self, response, node):
         info = {}
         xml_file = response.meta.get("xml_url")
         dois = self.get_dois(node)
-        fpage = node.xpath('//prism:startingPage/text()').extract_first()
-        lpage = node.xpath('//prism:endingPage/text()').extract_first()
-        issn = node.xpath('//prism:issn/text()').extract_first()
-        volume = node.xpath('//prism:volume/text()').extract_first()
-        issue = node.xpath('//prism:number/text()').extract_first()
+        fpage = node.xpath('.//prism:startingPage/text()').extract_first()
+        lpage = node.xpath('.//prism:endingPage/text()').extract_first()
+        issn = node.xpath('.//prism:issn/text()').extract_first()
+        volume = node.xpath('.//prism:volume/text()').extract_first()
+        issue = node.xpath('.//prism:number/text()').extract_first()
         journal_title, section = self.get_journal_and_section(
             self._get_publication(node))
         year, date_published = self.get_date(node)
-        conference = node.xpath("//conference-info").extract_first()
+        conference = node.xpath(".//conference-info").extract_first()
 
         if section and volume:
             volume = section + volume
@@ -859,7 +859,7 @@ def _get_date_from_web(self, node):
 
     def _get_dois_from_web(self, node):
         """Get DOIs from sciencedirect web page."""
-        dois = node.xpath("//meta[@name='citation_doi']/@content").extract()
+        dois = node.xpath(".//meta[@name='citation_doi']/@content").extract()
         if not dois:
             _, _, dois = self._parse_script(node)

@@ -939,7 +939,7 @@ def scrape_sciencedirect(self, response):
         if issue:
             info["issue"] = issue
         if "journal_title" in keys_missing:
-            journal_title = node.xpath("//h1[@class='svTitle']").extract()
+            journal_title = node.xpath(".//h1[@class='svTitle']").extract()
             if not journal_title:
                 journal_title = node.xpath(
                     "//meta[@name='citation_journal_title']/@content"
6 changes: 3 additions & 3 deletions hepcrawl/spiders/magic_spider.py
@@ -147,10 +147,10 @@ def scrape_for_pdf(self, response):
 
         node = response.selector
         if "title" not in response.meta:
-            response.meta["title"] = node.xpath("//div[@id='content']/h3/text()").extract()
+            response.meta["title"] = node.xpath(".//div[@id='content']/h3/text()").extract()
 
-        abstract = node.xpath("//div[@id='content']/p[@class='abstract']/text()").extract()
-        file_paths = node.xpath("//div[@id='content']/p[@class='url']/a/@href").extract()
+        abstract = node.xpath(".//div[@id='content']/p[@class='abstract']/text()").extract()
+        file_paths = node.xpath(".//div[@id='content']/p[@class='url']/a/@href").extract()
         file_paths = list(set(file_paths))
 
         response.meta["abstract"] = abstract
2 changes: 1 addition & 1 deletion hepcrawl/spiders/mit_spider.py
@@ -197,7 +197,7 @@ def build_item(self, response):
         if doc_type and "ph" not in doc_type.lower():
             return None
 
-        pdf_files = node.xpath("//table[@id='file-table']//td/a/@href").extract()
+        pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
         if pdf_files:
             record.add_value('additional_files', self.add_fft_file(
                 pdf_files, "HIDDEN", "Fulltext"))
22 changes: 11 additions & 11 deletions hepcrawl/spiders/pos_spider.py
@@ -47,7 +47,7 @@ def parse(self, response):
         """Get PDF information."""
         node = response.selector
         node.remove_namespaces()
-        for record in node.xpath('//record'):
+        for record in node.xpath('.//record'):
             identifier = record.xpath('.//metadata/pex-dc/identifier/text()').extract_first()
             if identifier:
                 # Probably all links lead to same place, so take first
@@ -89,7 +89,7 @@ def build_item(self, response):
         if year:
             record.add_value('journal_year', year)
 
-        identifier = node.xpath("//metadata/pex-dc/identifier/text()").extract_first()
+        identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first()
         record.add_value('urls', response.meta['pos_url'])
         if response.meta['pos_pdf_url']:
             record.add_value('additional_files', {'type': "Fulltext", "url": response.meta['pos_pdf_url']})
@@ -104,7 +104,7 @@ def build_item(self, response):
         else:
             record.add_value('pubinfo_freetext', identifier)
 
-        language = node.xpath("//metadata/pex-dc/language/text()").extract_first()
+        language = node.xpath(".//metadata/pex-dc/language/text()").extract_first()
         if language:
             record.add_value('language', language)

@@ -123,11 +123,11 @@ def _get_ext_systems_number(self, node):
         return [
             {
                 'institute': 'PoS',
-                'value': node.xpath('//metadata/pex-dc/identifier/text()').extract_first()
+                'value': node.xpath('.//metadata/pex-dc/identifier/text()').extract_first()
             },
             {
                 'institute': 'PoS',
-                'value': node.xpath('//identifier/text()').extract_first()
+                'value': node.xpath('.//identifier/text()').extract_first()
             },
         ]

@@ -136,7 +136,7 @@ def _get_license(self, node):
         licenses = \
             {'Creative Commons Attribution-NonCommercial-ShareAlike':
                 ['CC-BY-NC-SA-3.0', 'https://creativecommons.org/licenses/by-nc-sa/3.0']}
-        license_text = node.xpath("//metadata/pex-dc/rights/text()").extract_first()
+        license_text = node.xpath(".//metadata/pex-dc/rights/text()").extract_first()
         license_str = ''
         license_url = ''
         for key in licenses.keys():
@@ -155,22 +155,22 @@ def _get_date(self, node):
         """Get article date."""
         date = ''
         year = ''
-        full_date = node.xpath("//metadata/pex-dc/date/text()").extract_first()
+        full_date = node.xpath(".//metadata/pex-dc/date/text()").extract_first()
         date = create_valid_date(full_date)
         if date:
             year = date[0:4]
         return date, year
 
     def _get_authors(self, node):
         """Get article authors."""
-        author_selectors = node.xpath('//metadata/pex-dc/creator')
+        author_selectors = node.xpath('.//metadata/pex-dc/creator')
         authors = []
         for selector in author_selectors:
             auth_dict = {}
             author = Selector(text=selector.extract())
             auth_dict['raw_name'] = \
-                get_first(author.xpath('//name//text()').extract(), default='')
-            for affiliation in author.xpath('//affiliation//text()').extract():
+                get_first(author.xpath('.//name//text()').extract(), default='')
+            for affiliation in author.xpath('.//affiliation//text()').extract():
                 if 'affiliations' in auth_dict:
                     auth_dict['affiliations'].append({'value': affiliation})
                 else:
@@ -183,6 +183,6 @@ def _get_extra_data(self, node):
         """Get info to help selection - not for INSPIRE record"""
         extra_data = {}
 
-        section = node.xpath("//metadata/pex-dc/description/text()").extract_first()
+        section = node.xpath(".//metadata/pex-dc/description/text()").extract_first()
         extra_data['section'] = section.split(';', 1)[-1].strip()
         return extra_data
2 changes: 1 addition & 1 deletion hepcrawl/spiders/wsp_spider.py
@@ -194,7 +194,7 @@ def parse_node(self, response, node):
 
     def _get_collections(self, node, article_type, current_journal_title):
         """Return this articles' collection."""
-        conference = node.xpath('//conference').extract()
+        conference = node.xpath('.//conference').extract()
         if conference or current_journal_title == "International Journal of Modern Physics: Conference Series":
             return ['HEP', 'ConferencePaper']
         elif article_type == "review-article":
