global: use relative xpath expressions
* Changes to use relative xpath expressions where possible to
  support multiple records per file/crawl correctly.

Signed-off-by: Jan Aage Lavik <jan.age.lavik@cern.ch>
jalavik committed Jun 23, 2016
1 parent 5b094cc commit dcf4857
Showing 10 changed files with 74 additions and 74 deletions.
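
Not part of the commit itself: a minimal sketch of the failure mode the commit message describes, assuming Scrapy's Selector API as used throughout these spiders (the XML snippet and values below are made up for illustration):

    from scrapy.selector import Selector

    # Hypothetical feed file carrying two records in one document.
    xml = """
    <collection>
      <record><fpage>1</fpage></record>
      <record><fpage>100</fpage></record>
    </collection>
    """

    for record in Selector(text=xml, type="xml").xpath("//record"):
        # An absolute '//' expression is evaluated from the document root,
        # so every record sees the values of every record in the file:
        print(record.xpath("//fpage/text()").extract())   # ['1', '100'] both times
        # A relative './/' expression is evaluated from the current node:
        print(record.xpath(".//fpage/text()").extract())  # ['1'], then ['100']

With a single record per file the two forms happen to return the same values; absolute expressions only misbehave once a file or crawl carries several records, hence the blanket switch in the hunks below.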
4 changes: 2 additions & 2 deletions docs/contributing.rst
@@ -131,8 +131,8 @@ the ``add_xpath`` function, but you are not forced to do so:
 
 .. code-block:: python
 
-    fpage = node.xpath('//fpage/text()').extract()
-    lpage = node.xpath('//lpage/text()').extract()
+    fpage = node.xpath('.//fpage/text()').extract()
+    lpage = node.xpath('.//lpage/text()').extract()
     if fpage:
         record.add_value('journal_fpage', fpage)
     if lpage:
2 changes: 1 addition & 1 deletion docs/guide.rst
@@ -52,5 +52,5 @@ You can then run xpath expressions in the shell:
 
 .. code-block:: python
 
-    response.selector.xpath("//abstract").extract()
+    response.selector.xpath(".//abstract").extract()
     ["...some abstract ..."]
38 changes: 19 additions & 19 deletions hepcrawl/extractors/jats.py
@@ -27,29 +27,29 @@ def format_date(day, month, year):
             year = int(get_first(year, 1))
             return datetime.date(day=day, month=month, year=year).isoformat()
 
-        if node.xpath("//date[@date-type='published']"):
+        if node.xpath(".//date[@date-type='published']"):
             return format_date(
-                day=node.xpath("//date[@date-type='published']/day/text()").extract(),
-                month=node.xpath("//date[@date-type='published']/month/text()").extract(),
-                year=node.xpath("//date[@date-type='published']/year/text()").extract(),
+                day=node.xpath(".//date[@date-type='published']/day/text()").extract(),
+                month=node.xpath(".//date[@date-type='published']/month/text()").extract(),
+                year=node.xpath(".//date[@date-type='published']/year/text()").extract(),
             )
-        elif node.xpath("//pub-date[@pub-type='ppub']"):
+        elif node.xpath(".//pub-date[@pub-type='ppub']"):
             return format_date(
-                day=node.xpath("//pub-date[@pub-type='ppub']/day/text()").extract(),
-                month=node.xpath("//pub-date[@pub-type='ppub']/month/text()").extract(),
-                year=node.xpath("//pub-date[@pub-type='ppub']/year/text()").extract(),
+                day=node.xpath(".//pub-date[@pub-type='ppub']/day/text()").extract(),
+                month=node.xpath(".//pub-date[@pub-type='ppub']/month/text()").extract(),
+                year=node.xpath(".//pub-date[@pub-type='ppub']/year/text()").extract(),
             )
-        elif node.xpath("//pub-date[@pub-type='epub']"):
+        elif node.xpath(".//pub-date[@pub-type='epub']"):
             return format_date(
-                day=node.xpath("//pub-date[@pub-type='epub']/day/text()").extract(),
-                month=node.xpath("//pub-date[@pub-type='epub']/month/text()").extract(),
-                year=node.xpath("//pub-date[@pub-type='epub']/year/text()").extract(),
+                day=node.xpath(".//pub-date[@pub-type='epub']/day/text()").extract(),
+                month=node.xpath(".//pub-date[@pub-type='epub']/month/text()").extract(),
+                year=node.xpath(".//pub-date[@pub-type='epub']/year/text()").extract(),
             )
-        elif node.xpath("//pub-date"):
+        elif node.xpath(".//pub-date"):
             return format_date(
-                day=node.xpath("//pub-date/day/text()").extract(),
-                month=node.xpath("//pub-date/month/text()").extract(),
-                year=node.xpath("//pub-date/year/text()").extract(),
+                day=node.xpath(".//pub-date/day/text()").extract(),
+                month=node.xpath(".//pub-date/month/text()").extract(),
+                year=node.xpath(".//pub-date/year/text()").extract(),
             )
         else:
             # In the worst case we return today
@@ -59,7 +59,7 @@ def _get_keywords(self, node):
         """Return tuple of keywords, PACS from node."""
         free_keywords = []
         classification_numbers = []
-        for group in node.xpath('//kwd-group'):
+        for group in node.xpath('.//kwd-group'):
             if "pacs" in group.xpath('@kwd-group-type').extract():
                 for keyword in group.xpath('kwd/text()').extract():
                     classification_numbers.append(keyword)
@@ -70,14 +70,14 @@ def _get_keywords(self, node):
 
     def _get_authors(self, node):
         authors = []
-        for contrib in node.xpath("//contrib[@contrib-type='author']"):
+        for contrib in node.xpath(".//contrib[@contrib-type='author']"):
             surname = contrib.xpath("string-name/surname/text()").extract()
             given_names = contrib.xpath("string-name/given-names/text()").extract()
             email = contrib.xpath("email/text()").extract()
             affiliations = contrib.xpath('aff')
             reffered_id = contrib.xpath("xref[@ref-type='aff']/@rid").extract()
             if reffered_id:
-                affiliations += node.xpath("//aff[@id='{0}']".format(
+                affiliations += node.xpath(".//aff[@id='{0}']".format(
                     get_first(reffered_id))
                 )
             affiliations = [
4 changes: 2 additions & 2 deletions hepcrawl/spiders/dnb_spider.py
@@ -181,13 +181,13 @@ def scrape_for_abstract(self, response):
             abstract_raw = node.xpath(
                 "//div[@class='simple-item-view-abstract']/span/text()").extract()
         elif "hss.ulb.uni-bonn.de" in domain:
-            abstract_raw = node.xpath("//text()[contains(.,'Zusammenfassung')"
+            abstract_raw = node.xpath(".//text()[contains(.,'Zusammenfassung')"
                 "or contains(., 'Abstract')]/ancestor::*[self::tr]/descendant::*[position() > 1]/text()").extract()
         elif "kups.ub.uni-koeln.de" in domain:
             abstract_raw = node.xpath(
                 "//div[@class='ep_summary_content_main']/h2/following-sibling::p/text()").extract()
         # if "something else" in domain:
-        #     abstracts = node.xpath("//somewhere[@else]")
+        #     abstracts = node.xpath(".//somewhere[@else]")
 
         if abstract_raw:
             response.meta["abstract"] = [
62 changes: 31 additions & 31 deletions hepcrawl/spiders/elsevier_spider.py
@@ -154,7 +154,7 @@ def handle_feed(self, response):
         """Handle the feed and yield a request for every zip package found."""
         node = response.selector
         node.remove_namespaces()
-        entry = node.xpath("//entry")
+        entry = node.xpath(".//entry")
         for ent in entry:
             self.zip_file = ent.xpath("./link/@href").extract()[0]
             yield Request(self.zip_file, callback=self.handle_package)
@@ -181,36 +181,36 @@ def handle_package(self, response):
     @staticmethod
     def get_dois(node):
         """Get the dois."""
-        dois = node.xpath("//ja:item-info/ce:doi/text()")
+        dois = node.xpath(".//ja:item-info/ce:doi/text()")
         if not dois:
-            dois = node.xpath("//prism:doi/text()")
+            dois = node.xpath(".//prism:doi/text()")
         if dois:
             return dois.extract()
 
     def get_title(self, node):
         """Get article title."""
-        title = node.xpath("//ce:title/text()")
+        title = node.xpath(".//ce:title/text()")
         if not title:
-            title = node.xpath("//dct:title/text()")
+            title = node.xpath(".//dct:title/text()")
         if title:
             return self._fix_node_text(title.extract())
 
     @staticmethod
     def get_keywords(node):
         """Get article keywords."""
-        keywords = node.xpath("//ce:keyword/ce:text/text()")
+        keywords = node.xpath(".//ce:keyword/ce:text/text()")
         if not keywords:
-            keywords = node.xpath("//dct:subject//rdf:li/text()")
+            keywords = node.xpath(".//dct:subject//rdf:li/text()")
         if keywords:
             return keywords.extract()
 
     def get_copyright(self, node):
         """Get copyright information."""
-        cr_holder = node.xpath("//ce:copyright/text()")
-        cr_year = node.xpath("//ce:copyright/@year")
-        cr_statement = node.xpath("//ce:copyright/@type").extract()
+        cr_holder = node.xpath(".//ce:copyright/text()")
+        cr_year = node.xpath(".//ce:copyright/@year")
+        cr_statement = node.xpath(".//ce:copyright/@type").extract()
         if not (cr_statement or cr_holder) or "unknown" in " ".join(cr_statement).lower():
-            cr_statement = node.xpath("//prism:copyright/text()").extract()
+            cr_statement = node.xpath(".//prism:copyright/text()").extract()
         if len(cr_statement) > 1:
             cr_statement = [
                 st for st in cr_statement if "unknown" not in st.lower()]
@@ -277,7 +277,7 @@ def get_authors(self, node):
         """Get the authors."""
         authors = []
 
-        if node.xpath("//ce:author"):
+        if node.xpath(".//ce:author"):
             for author_group in node.xpath(".//ce:author-group"):
                 collaborations = author_group.xpath(
                     ".//ce:collaboration/ce:text/text()").extract()
@@ -303,8 +303,8 @@ def get_authors(self, node):
                 if collaborations:
                     auth_dict['collaborations'] = collaborations
                 authors.append(auth_dict)
-        elif node.xpath('//dct:creator'):
-            for author in node.xpath('//dct:creator/text()'):
+        elif node.xpath('.//dct:creator'):
+            for author in node.xpath('.//dct:creator/text()'):
                 authors.append({'raw_name': author.extract()})
 
         return authors
@@ -327,7 +327,7 @@ def get_date(self, node):
         """Get the year, month, and day."""
         # NOTE: this uses dateutils.py
 
-        cover_date = node.xpath("//prism:coverDate/text()").extract_first()
+        cover_date = node.xpath(".//prism:coverDate/text()").extract_first()
         cover_display_date = node.xpath(
             "//prism:coverDisplayDate/text()").extract_first()
         oa_effective = node.xpath(
@@ -350,20 +350,20 @@
 
     def get_doctype(self, node):
         """Return a doctype mapped from abbreviation."""
-        abbrv_doctype = node.xpath("//@docsubtype").extract()
+        abbrv_doctype = node.xpath(".//@docsubtype").extract()
         doctype = ''
         if abbrv_doctype:
             doctype = self.DOCTYPE_MAPPING[get_first(abbrv_doctype)]
-        elif node.xpath("//ja:article"):
+        elif node.xpath(".//ja:article"):
             doctype = "article"
-        elif node.xpath("//ja:simple-article"):
+        elif node.xpath(".//ja:simple-article"):
             doctype = "article"
-        elif node.xpath("//ja:book-review"):
+        elif node.xpath(".//ja:book-review"):
             doctype = "book-review"
-        elif node.xpath("//ja:exam"):
+        elif node.xpath(".//ja:exam"):
             doctype = "exam"
         # A scientific article in a conference proceedings is not cnf.
-        if node.xpath("//conference-info"):
+        if node.xpath(".//conference-info"):
             doctype = "conference_paper"
         if doctype:
             return doctype
@@ -665,7 +665,7 @@ def get_references(self, node):
         # ce:other-ref elements. In the original fulltext they can be weirdly
         # grouped/nested. See test record.
 
-        reference_groups = node.xpath("//ce:bib-reference")
+        reference_groups = node.xpath(".//ce:bib-reference")
        refs_out = []
        label = ""
        for ref_group in reference_groups:
@@ -713,7 +713,7 @@ def _get_publication(node):
         """Get publication (journal) title data."""
         publication = node.xpath(
             '//prism:publicationName/text()').extract_first()
-        jid = node.xpath('//ja:jid/text()').extract_first()
+        jid = node.xpath('.//ja:jid/text()').extract_first()
         if not publication and jid:
             # NOTE: JIDs should be mapped to standard journal names later
             publication = jid
@@ -745,15 +745,15 @@ def parse_node(self, response, node):
         info = {}
         xml_file = response.meta.get("xml_url")
         dois = self.get_dois(node)
-        fpage = node.xpath('//prism:startingPage/text()').extract_first()
-        lpage = node.xpath('//prism:endingPage/text()').extract_first()
-        issn = node.xpath('//prism:issn/text()').extract_first()
-        volume = node.xpath('//prism:volume/text()').extract_first()
-        issue = node.xpath('//prism:number/text()').extract_first()
+        fpage = node.xpath('.//prism:startingPage/text()').extract_first()
+        lpage = node.xpath('.//prism:endingPage/text()').extract_first()
+        issn = node.xpath('.//prism:issn/text()').extract_first()
+        volume = node.xpath('.//prism:volume/text()').extract_first()
+        issue = node.xpath('.//prism:number/text()').extract_first()
         journal_title, section = self.get_journal_and_section(
             self._get_publication(node))
         year, date_published = self.get_date(node)
-        conference = node.xpath("//conference-info").extract_first()
+        conference = node.xpath(".//conference-info").extract_first()
 
         if section and volume:
             volume = section + volume
@@ -859,7 +859,7 @@ def _get_date_from_web(self, node):
 
     def _get_dois_from_web(self, node):
         """Get DOIs from sciencedirect web page."""
-        dois = node.xpath("//meta[@name='citation_doi']/@content").extract()
+        dois = node.xpath(".//meta[@name='citation_doi']/@content").extract()
         if not dois:
             _, _, dois = self._parse_script(node)

@@ -939,7 +939,7 @@ def scrape_sciencedirect(self, response):
         if issue:
             info["issue"] = issue
         if "journal_title" in keys_missing:
-            journal_title = node.xpath("//h1[@class='svTitle']").extract()
+            journal_title = node.xpath(".//h1[@class='svTitle']").extract()
             if not journal_title:
                 journal_title = node.xpath(
                     "//meta[@name='citation_journal_title']/@content"
6 changes: 3 additions & 3 deletions hepcrawl/spiders/magic_spider.py
@@ -147,10 +147,10 @@ def scrape_for_pdf(self, response):
 
         node = response.selector
         if "title" not in response.meta:
-            response.meta["title"] = node.xpath("//div[@id='content']/h3/text()").extract()
+            response.meta["title"] = node.xpath(".//div[@id='content']/h3/text()").extract()
 
-        abstract = node.xpath("//div[@id='content']/p[@class='abstract']/text()").extract()
-        file_paths = node.xpath("//div[@id='content']/p[@class='url']/a/@href").extract()
+        abstract = node.xpath(".//div[@id='content']/p[@class='abstract']/text()").extract()
+        file_paths = node.xpath(".//div[@id='content']/p[@class='url']/a/@href").extract()
         file_paths = list(set(file_paths))
 
         response.meta["abstract"] = abstract
2 changes: 1 addition & 1 deletion hepcrawl/spiders/mit_spider.py
@@ -197,7 +197,7 @@ def build_item(self, response):
         if doc_type and "ph" not in doc_type.lower():
             return None
 
-        pdf_files = node.xpath("//table[@id='file-table']//td/a/@href").extract()
+        pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
         if pdf_files:
             record.add_value('additional_files', self.add_fft_file(
                 pdf_files, "HIDDEN", "Fulltext"))
22 changes: 11 additions & 11 deletions hepcrawl/spiders/pos_spider.py
@@ -47,7 +47,7 @@ def parse(self, response):
         """Get PDF information."""
         node = response.selector
         node.remove_namespaces()
-        for record in node.xpath('//record'):
+        for record in node.xpath('.//record'):
             identifier = record.xpath('.//metadata/pex-dc/identifier/text()').extract_first()
             if identifier:
                 # Probably all links lead to same place, so take first
@@ -89,7 +89,7 @@ def build_item(self, response):
         if year:
             record.add_value('journal_year', year)
 
-        identifier = node.xpath("//metadata/pex-dc/identifier/text()").extract_first()
+        identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first()
         record.add_value('urls', response.meta['pos_url'])
         if response.meta['pos_pdf_url']:
             record.add_value('additional_files', {'type': "Fulltext", "url": response.meta['pos_pdf_url']})
@@ -104,7 +104,7 @@ def build_item(self, response):
         else:
             record.add_value('pubinfo_freetext', identifier)
 
-        language = node.xpath("//metadata/pex-dc/language/text()").extract_first()
+        language = node.xpath(".//metadata/pex-dc/language/text()").extract_first()
         if language:
             record.add_value('language', language)

@@ -123,11 +123,11 @@ def _get_ext_systems_number(self, node):
         return [
             {
                 'institute': 'PoS',
-                'value': node.xpath('//metadata/pex-dc/identifier/text()').extract_first()
+                'value': node.xpath('.//metadata/pex-dc/identifier/text()').extract_first()
             },
             {
                 'institute': 'PoS',
-                'value': node.xpath('//identifier/text()').extract_first()
+                'value': node.xpath('.//identifier/text()').extract_first()
             },
         ]

@@ -136,7 +136,7 @@ def _get_license(self, node):
         licenses = \
             {'Creative Commons Attribution-NonCommercial-ShareAlike':
                 ['CC-BY-NC-SA-3.0', 'https://creativecommons.org/licenses/by-nc-sa/3.0']}
-        license_text = node.xpath("//metadata/pex-dc/rights/text()").extract_first()
+        license_text = node.xpath(".//metadata/pex-dc/rights/text()").extract_first()
         license_str = ''
         license_url = ''
         for key in licenses.keys():
@@ -155,22 +155,22 @@ def _get_date(self, node):
         """Get article date."""
         date = ''
         year = ''
-        full_date = node.xpath("//metadata/pex-dc/date/text()").extract_first()
+        full_date = node.xpath(".//metadata/pex-dc/date/text()").extract_first()
         date = create_valid_date(full_date)
         if date:
             year = date[0:4]
         return date, year
 
     def _get_authors(self, node):
         """Get article authors."""
-        author_selectors = node.xpath('//metadata/pex-dc/creator')
+        author_selectors = node.xpath('.//metadata/pex-dc/creator')
         authors = []
         for selector in author_selectors:
             auth_dict = {}
             author = Selector(text=selector.extract())
             auth_dict['raw_name'] = \
-                get_first(author.xpath('//name//text()').extract(), default='')
-            for affiliation in author.xpath('//affiliation//text()').extract():
+                get_first(author.xpath('.//name//text()').extract(), default='')
+            for affiliation in author.xpath('.//affiliation//text()').extract():
                 if 'affiliations' in auth_dict:
                     auth_dict['affiliations'].append({'value': affiliation})
                 else:
@@ -183,6 +183,6 @@ def _get_extra_data(self, node):
         """Get info to help selection - not for INSPIRE record"""
         extra_data = {}
 
-        section = node.xpath("//metadata/pex-dc/description/text()").extract_first()
+        section = node.xpath(".//metadata/pex-dc/description/text()").extract_first()
         extra_data['section'] = section.split(';', 1)[-1].strip()
         return extra_data
2 changes: 1 addition & 1 deletion hepcrawl/spiders/wsp_spider.py
@@ -194,7 +194,7 @@ def parse_node(self, response, node):
 
     def _get_collections(self, node, article_type, current_journal_title):
         """Return this articles' collection."""
-        conference = node.xpath('//conference').extract()
+        conference = node.xpath('.//conference').extract()
         if conference or current_journal_title == "International Journal of Modern Physics: Conference Series":
             return ['HEP', 'ConferencePaper']
         elif article_type == "review-article":
