diff --git a/docs/contributing.rst b/docs/contributing.rst
index fa054f84..9818d758 100644
--- a/docs/contributing.rst
+++ b/docs/contributing.rst
@@ -131,8 +131,8 @@ the ``add_xpath`` function, but you are not forced to do so:
 
 .. code-block:: python
 
-    fpage = node.xpath('//fpage/text()').extract()
-    lpage = node.xpath('//lpage/text()').extract()
+    fpage = node.xpath('.//fpage/text()').extract()
+    lpage = node.xpath('.//lpage/text()').extract()
     if fpage:
         record.add_value('journal_fpage', fpage)
     if lpage:
diff --git a/docs/guide.rst b/docs/guide.rst
index e08ea7d5..71a16b44 100644
--- a/docs/guide.rst
+++ b/docs/guide.rst
@@ -52,5 +52,5 @@ You can then run xpath expressions in the shell:
 
 .. code-block:: python
 
-    response.selector.xpath("//abstract").extract()
+    response.selector.xpath(".//abstract").extract()
     ["...some abstract ..."]
diff --git a/hepcrawl/extractors/jats.py b/hepcrawl/extractors/jats.py
index 7e81b2a4..dfe3a1d6 100644
--- a/hepcrawl/extractors/jats.py
+++ b/hepcrawl/extractors/jats.py
@@ -27,29 +27,29 @@ def format_date(day, month, year):
             year = int(get_first(year, 1))
             return datetime.date(day=day, month=month, year=year).isoformat()
 
-        if node.xpath("//date[@date-type='published']"):
+        if node.xpath(".//date[@date-type='published']"):
             return format_date(
-                day=node.xpath("//date[@date-type='published']/day/text()").extract(),
-                month=node.xpath("//date[@date-type='published']/month/text()").extract(),
-                year=node.xpath("//date[@date-type='published']/year/text()").extract(),
+                day=node.xpath(".//date[@date-type='published']/day/text()").extract(),
+                month=node.xpath(".//date[@date-type='published']/month/text()").extract(),
+                year=node.xpath(".//date[@date-type='published']/year/text()").extract(),
             )
-        elif node.xpath("//pub-date[@pub-type='ppub']"):
+        elif node.xpath(".//pub-date[@pub-type='ppub']"):
             return format_date(
-                day=node.xpath("//pub-date[@pub-type='ppub']/day/text()").extract(),
-                month=node.xpath("//pub-date[@pub-type='ppub']/month/text()").extract(),
-                year=node.xpath("//pub-date[@pub-type='ppub']/year/text()").extract(),
+                day=node.xpath(".//pub-date[@pub-type='ppub']/day/text()").extract(),
+                month=node.xpath(".//pub-date[@pub-type='ppub']/month/text()").extract(),
+                year=node.xpath(".//pub-date[@pub-type='ppub']/year/text()").extract(),
             )
-        elif node.xpath("//pub-date[@pub-type='epub']"):
+        elif node.xpath(".//pub-date[@pub-type='epub']"):
             return format_date(
-                day=node.xpath("//pub-date[@pub-type='epub']/day/text()").extract(),
-                month=node.xpath("//pub-date[@pub-type='epub']/month/text()").extract(),
-                year=node.xpath("//pub-date[@pub-type='epub']/year/text()").extract(),
+                day=node.xpath(".//pub-date[@pub-type='epub']/day/text()").extract(),
+                month=node.xpath(".//pub-date[@pub-type='epub']/month/text()").extract(),
+                year=node.xpath(".//pub-date[@pub-type='epub']/year/text()").extract(),
             )
-        elif node.xpath("//pub-date"):
+        elif node.xpath(".//pub-date"):
             return format_date(
-                day=node.xpath("//pub-date/day/text()").extract(),
-                month=node.xpath("//pub-date/month/text()").extract(),
-                year=node.xpath("//pub-date/year/text()").extract(),
+                day=node.xpath(".//pub-date/day/text()").extract(),
+                month=node.xpath(".//pub-date/month/text()").extract(),
+                year=node.xpath(".//pub-date/year/text()").extract(),
             )
         else:
             # In the worst case we return today
@@ -59,7 +59,7 @@ def _get_keywords(self, node):
         """Return tuple of keywords, PACS from node."""
         free_keywords = []
        classification_numbers = []
-        for group in node.xpath('//kwd-group'):
+        for group in node.xpath('.//kwd-group'):
             if "pacs" in group.xpath('@kwd-group-type').extract():
                 for keyword in group.xpath('kwd/text()').extract():
                     classification_numbers.append(keyword)
@@ -70,14 +70,14 @@ def _get_authors(self, node):
         authors = []
-        for contrib in node.xpath("//contrib[@contrib-type='author']"):
+        for contrib in node.xpath(".//contrib[@contrib-type='author']"):
             surname = contrib.xpath("string-name/surname/text()").extract()
             given_names = contrib.xpath("string-name/given-names/text()").extract()
             email = contrib.xpath("email/text()").extract()
             affiliations = contrib.xpath('aff')
             reffered_id = contrib.xpath("xref[@ref-type='aff']/@rid").extract()
             if reffered_id:
-                affiliations += node.xpath("//aff[@id='{0}']".format(
+                affiliations += node.xpath(".//aff[@id='{0}']".format(
                     get_first(reffered_id))
                 )
             affiliations = [
diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py
index 6a86af12..4a9db042 100644
--- a/hepcrawl/spiders/dnb_spider.py
+++ b/hepcrawl/spiders/dnb_spider.py
@@ -181,13 +181,13 @@ def scrape_for_abstract(self, response):
             abstract_raw = node.xpath(
                 "//div[@class='simple-item-view-abstract']/span/text()").extract()
         elif "hss.ulb.uni-bonn.de" in domain:
-            abstract_raw = node.xpath("//text()[contains(.,'Zusammenfassung')"
+            abstract_raw = node.xpath(".//text()[contains(.,'Zusammenfassung')"
                 "or contains(., 'Abstract')]/ancestor::*[self::tr]/descendant::*[position() > 1]/text()").extract()
         elif "kups.ub.uni-koeln.de" in domain:
             abstract_raw = node.xpath(
                 "//div[@class='ep_summary_content_main']/h2/following-sibling::p/text()").extract()
         # if "something else" in domain:
-        #     abstracts = node.xpath("//somewhere[@else]")
+        #     abstracts = node.xpath(".//somewhere[@else]")
 
         if abstract_raw:
             response.meta["abstract"] = [
diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py
index d313f0cd..5d7de117 100644
--- a/hepcrawl/spiders/elsevier_spider.py
+++ b/hepcrawl/spiders/elsevier_spider.py
@@ -154,7 +154,7 @@ def handle_feed(self, response):
         """Handle the feed and yield a request for every zip package found."""
         node = response.selector
         node.remove_namespaces()
-        entry = node.xpath("//entry")
+        entry = node.xpath(".//entry")
         for ent in entry:
             self.zip_file = ent.xpath("./link/@href").extract()[0]
             yield Request(self.zip_file, callback=self.handle_package)
@@ -181,36 +181,36 @@ def handle_package(self, response):
     @staticmethod
     def get_dois(node):
         """Get the dois."""
-        dois = node.xpath("//ja:item-info/ce:doi/text()")
+        dois = node.xpath(".//ja:item-info/ce:doi/text()")
         if not dois:
-            dois = node.xpath("//prism:doi/text()")
+            dois = node.xpath(".//prism:doi/text()")
         if dois:
             return dois.extract()
 
     def get_title(self, node):
         """Get article title."""
-        title = node.xpath("//ce:title/text()")
+        title = node.xpath(".//ce:title/text()")
         if not title:
-            title = node.xpath("//dct:title/text()")
+            title = node.xpath(".//dct:title/text()")
         if title:
             return self._fix_node_text(title.extract())
 
     @staticmethod
     def get_keywords(node):
         """Get article keywords."""
-        keywords = node.xpath("//ce:keyword/ce:text/text()")
+        keywords = node.xpath(".//ce:keyword/ce:text/text()")
         if not keywords:
-            keywords = node.xpath("//dct:subject//rdf:li/text()")
+            keywords = node.xpath(".//dct:subject//rdf:li/text()")
         if keywords:
             return keywords.extract()
 
     def get_copyright(self, node):
         """Get copyright information."""
-        cr_holder = node.xpath("//ce:copyright/text()")
-        cr_year = node.xpath("//ce:copyright/@year")
-        cr_statement = node.xpath("//ce:copyright/@type").extract()
+        cr_holder = node.xpath(".//ce:copyright/text()")
+        cr_year = node.xpath(".//ce:copyright/@year")
+        cr_statement = node.xpath(".//ce:copyright/@type").extract()
         if not (cr_statement or cr_holder) or "unknown" in " ".join(cr_statement).lower():
-            cr_statement = node.xpath("//prism:copyright/text()").extract()
+            cr_statement = node.xpath(".//prism:copyright/text()").extract()
         if len(cr_statement) > 1:
             cr_statement = [
                 st for st in cr_statement if "unknown" not in st.lower()]
@@ -277,7 +277,7 @@ def get_authors(self, node):
         """Get the authors."""
         authors = []
 
-        if node.xpath("//ce:author"):
+        if node.xpath(".//ce:author"):
             for author_group in node.xpath(".//ce:author-group"):
                 collaborations = author_group.xpath(
                     ".//ce:collaboration/ce:text/text()").extract()
@@ -303,8 +303,8 @@ def get_authors(self, node):
                 if collaborations:
                     auth_dict['collaborations'] = collaborations
                 authors.append(auth_dict)
-        elif node.xpath('//dct:creator'):
-            for author in node.xpath('//dct:creator/text()'):
+        elif node.xpath('.//dct:creator'):
+            for author in node.xpath('.//dct:creator/text()'):
                 authors.append({'raw_name': author.extract()})
 
         return authors
@@ -327,7 +327,7 @@ def get_date(self, node):
         """Get the year, month, and day."""
         # NOTE: this uses dateutils.py
 
-        cover_date = node.xpath("//prism:coverDate/text()").extract_first()
+        cover_date = node.xpath(".//prism:coverDate/text()").extract_first()
         cover_display_date = node.xpath(
             "//prism:coverDisplayDate/text()").extract_first()
         oa_effective = node.xpath(
@@ -350,20 +350,20 @@ def get_doctype(self, node):
         """Return a doctype mapped from abbreviation."""
-        abbrv_doctype = node.xpath("//@docsubtype").extract()
+        abbrv_doctype = node.xpath(".//@docsubtype").extract()
         doctype = ''
         if abbrv_doctype:
             doctype = self.DOCTYPE_MAPPING[get_first(abbrv_doctype)]
-        elif node.xpath("//ja:article"):
+        elif node.xpath(".//ja:article"):
             doctype = "article"
-        elif node.xpath("//ja:simple-article"):
+        elif node.xpath(".//ja:simple-article"):
             doctype = "article"
-        elif node.xpath("//ja:book-review"):
+        elif node.xpath(".//ja:book-review"):
             doctype = "book-review"
-        elif node.xpath("//ja:exam"):
+        elif node.xpath(".//ja:exam"):
             doctype = "exam"
         # A scientific article in a conference proceedings is not cnf.
-        if node.xpath("//conference-info"):
+        if node.xpath(".//conference-info"):
             doctype = "conference_paper"
         if doctype:
             return doctype
@@ -665,7 +665,7 @@ def get_references(self, node):
         # ce:other-ref elements. In the original fulltext they can be weirdly
         # grouped/nested. See test record.
-        reference_groups = node.xpath("//ce:bib-reference")
+        reference_groups = node.xpath(".//ce:bib-reference")
         refs_out = []
         label = ""
         for ref_group in reference_groups:
@@ -713,7 +713,7 @@ def _get_publication(node):
         """Get publication (journal) title data."""
         publication = node.xpath(
             '//prism:publicationName/text()').extract_first()
-        jid = node.xpath('//ja:jid/text()').extract_first()
+        jid = node.xpath('.//ja:jid/text()').extract_first()
         if not publication and jid:
             # NOTE: JIDs should be mapped to standard journal names later
             publication = jid
@@ -745,15 +745,15 @@ def parse_node(self, response, node):
         info = {}
         xml_file = response.meta.get("xml_url")
         dois = self.get_dois(node)
-        fpage = node.xpath('//prism:startingPage/text()').extract_first()
-        lpage = node.xpath('//prism:endingPage/text()').extract_first()
-        issn = node.xpath('//prism:issn/text()').extract_first()
-        volume = node.xpath('//prism:volume/text()').extract_first()
-        issue = node.xpath('//prism:number/text()').extract_first()
+        fpage = node.xpath('.//prism:startingPage/text()').extract_first()
+        lpage = node.xpath('.//prism:endingPage/text()').extract_first()
+        issn = node.xpath('.//prism:issn/text()').extract_first()
+        volume = node.xpath('.//prism:volume/text()').extract_first()
+        issue = node.xpath('.//prism:number/text()').extract_first()
         journal_title, section = self.get_journal_and_section(
             self._get_publication(node))
         year, date_published = self.get_date(node)
-        conference = node.xpath("//conference-info").extract_first()
+        conference = node.xpath(".//conference-info").extract_first()
 
         if section and volume:
             volume = section + volume
@@ -859,7 +859,7 @@ def _get_date_from_web(self, node):
 
     def _get_dois_from_web(self, node):
         """Get DOIs from sciencedirect web page."""
-        dois = node.xpath("//meta[@name='citation_doi']/@content").extract()
+        dois = node.xpath(".//meta[@name='citation_doi']/@content").extract()
         if not dois:
             _, _, dois = self._parse_script(node)
 
@@ -939,7 +939,7 @@ def scrape_sciencedirect(self, response):
         if issue:
             info["issue"] = issue
         if "journal_title" in keys_missing:
-            journal_title = node.xpath("//h1[@class='svTitle']").extract()
+            journal_title = node.xpath(".//h1[@class='svTitle']").extract()
             if not journal_title:
                 journal_title = node.xpath(
                     "//meta[@name='citation_journal_title']/@content"
diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py
index bf5def51..a515a0f5 100644
--- a/hepcrawl/spiders/magic_spider.py
+++ b/hepcrawl/spiders/magic_spider.py
@@ -147,10 +147,10 @@ def scrape_for_pdf(self, response):
         node = response.selector
 
         if "title" not in response.meta:
-            response.meta["title"] = node.xpath("//div[@id='content']/h3/text()").extract()
+            response.meta["title"] = node.xpath(".//div[@id='content']/h3/text()").extract()
 
-        abstract = node.xpath("//div[@id='content']/p[@class='abstract']/text()").extract()
-        file_paths = node.xpath("//div[@id='content']/p[@class='url']/a/@href").extract()
+        abstract = node.xpath(".//div[@id='content']/p[@class='abstract']/text()").extract()
+        file_paths = node.xpath(".//div[@id='content']/p[@class='url']/a/@href").extract()
         file_paths = list(set(file_paths))
 
         response.meta["abstract"] = abstract
diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py
index f38219b4..7c272ee2 100644
--- a/hepcrawl/spiders/mit_spider.py
+++ b/hepcrawl/spiders/mit_spider.py
@@ -197,7 +197,7 @@ def build_item(self, response):
         if doc_type and "ph" not in doc_type.lower():
             return None
 
-        pdf_files = node.xpath("//table[@id='file-table']//td/a/@href").extract()
+        pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
         if pdf_files:
             record.add_value('additional_files', self.add_fft_file(
                 pdf_files, "HIDDEN", "Fulltext"))
diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py
index 7275b603..3c90a032 100644
--- a/hepcrawl/spiders/pos_spider.py
+++ b/hepcrawl/spiders/pos_spider.py
@@ -47,7 +47,7 @@ def parse(self, response):
         """Get PDF information."""
         node = response.selector
         node.remove_namespaces()
-        for record in node.xpath('//record'):
+        for record in node.xpath('.//record'):
             identifier = record.xpath('.//metadata/pex-dc/identifier/text()').extract_first()
             if identifier:
                 # Probably all links lead to same place, so take first
@@ -89,7 +89,7 @@ def build_item(self, response):
         if year:
             record.add_value('journal_year', year)
 
-        identifier = node.xpath("//metadata/pex-dc/identifier/text()").extract_first()
+        identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first()
         record.add_value('urls', response.meta['pos_url'])
         if response.meta['pos_pdf_url']:
             record.add_value('additional_files', {'type': "Fulltext", "url": response.meta['pos_pdf_url']})
@@ -104,7 +104,7 @@ def build_item(self, response):
         else:
             record.add_value('pubinfo_freetext', identifier)
 
-        language = node.xpath("//metadata/pex-dc/language/text()").extract_first()
+        language = node.xpath(".//metadata/pex-dc/language/text()").extract_first()
         if language:
             record.add_value('language', language)
 
@@ -123,11 +123,11 @@ def _get_ext_systems_number(self, node):
         return [
             {
                 'institute': 'PoS',
-                'value': node.xpath('//metadata/pex-dc/identifier/text()').extract_first()
+                'value': node.xpath('.//metadata/pex-dc/identifier/text()').extract_first()
             },
             {
                 'institute': 'PoS',
-                'value': node.xpath('//identifier/text()').extract_first()
+                'value': node.xpath('.//identifier/text()').extract_first()
             },
         ]
@@ -136,7 +136,7 @@ def _get_license(self, node):
         licenses = \
             {'Creative Commons Attribution-NonCommercial-ShareAlike':
                 ['CC-BY-NC-SA-3.0', 'https://creativecommons.org/licenses/by-nc-sa/3.0']}
-        license_text = node.xpath("//metadata/pex-dc/rights/text()").extract_first()
+        license_text = node.xpath(".//metadata/pex-dc/rights/text()").extract_first()
         license_str = ''
         license_url = ''
         for key in licenses.keys():
@@ -155,7 +155,7 @@ def _get_date(self, node):
         """Get article date."""
         date = ''
         year = ''
-        full_date = node.xpath("//metadata/pex-dc/date/text()").extract_first()
+        full_date = node.xpath(".//metadata/pex-dc/date/text()").extract_first()
         date = create_valid_date(full_date)
         if date:
             year = date[0:4]
@@ -163,14 +163,14 @@ def _get_authors(self, node):
         """Get article authors."""
-        author_selectors = node.xpath('//metadata/pex-dc/creator')
+        author_selectors = node.xpath('.//metadata/pex-dc/creator')
         authors = []
         for selector in author_selectors:
             auth_dict = {}
             author = Selector(text=selector.extract())
             auth_dict['raw_name'] = \
-                get_first(author.xpath('//name//text()').extract(), default='')
-            for affiliation in author.xpath('//affiliation//text()').extract():
+                get_first(author.xpath('.//name//text()').extract(), default='')
+            for affiliation in author.xpath('.//affiliation//text()').extract():
                 if 'affiliations' in auth_dict:
                     auth_dict['affiliations'].append({'value': affiliation})
                 else:
@@ -183,6 +183,6 @@ def _get_extra_data(self, node):
         """Get info to help selection - not for INSPIRE record"""
         extra_data = {}
-        section = node.xpath("//metadata/pex-dc/description/text()").extract_first()
+        section = node.xpath(".//metadata/pex-dc/description/text()").extract_first()
         extra_data['section'] = section.split(';', 1)[-1].strip()
         return extra_data
diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py
index 9c9af560..ecd1e8c6 100644
--- a/hepcrawl/spiders/wsp_spider.py
+++ b/hepcrawl/spiders/wsp_spider.py
@@ -194,7 +194,7 @@ def parse_node(self, response, node):
 
     def _get_collections(self, node, article_type, current_journal_title):
         """Return this articles' collection."""
-        conference = node.xpath('//conference').extract()
+        conference = node.xpath('.//conference').extract()
         if conference or current_journal_title == "International Journal of Modern Physics: Conference Series":
             return ['HEP', 'ConferencePaper']
         elif article_type == "review-article":
diff --git a/tests/test_base.py b/tests/test_base.py
index 406d9262..0f00aa8f 100644
--- a/tests/test_base.py
+++ b/tests/test_base.py
@@ -33,7 +33,7 @@ def record():
     response = fake_response_from_file('base/test_1.xml')
     selector = Selector(response, type='xml')
     spider._register_namespaces(selector)
-    nodes = selector.xpath('//%s' % spider.itertag)
+    nodes = selector.xpath('.//%s' % spider.itertag)
     response.meta["record"] = nodes[0].extract()
     response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]
     return spider.build_item(response)
@@ -45,7 +45,7 @@ def urls():
     response = fake_response_from_file('base/test_1.xml')
     selector = Selector(response, type='xml')
     spider._register_namespaces(selector)
-    nodes = selector.xpath('//%s' % spider.itertag)
+    nodes = selector.xpath('.//%s' % spider.itertag)
     return spider.get_urls_in_record(nodes[0])
@@ -152,7 +152,7 @@ def splash():
     response = fake_response_from_file('base/test_1.xml')
     selector = Selector(response, type='xml')
     spider._register_namespaces(selector)
-    nodes = selector.xpath('//%s' % spider.itertag)
+    nodes = selector.xpath('.//%s' % spider.itertag)
     splash_response.meta["record"] = nodes[0].extract()
     return spider.scrape_for_pdf(splash_response)