spiders: new IOP spider
* Moves some functionality to NLM extractor.

* Adds tests.

Signed-off-by: Henrik Vesterinen <henrik.vesterinen@cern.ch>
bittirousku committed Jun 14, 2016
1 parent a8df04e commit 796fdff
Showing 7 changed files with 757 additions and 0 deletions.
4 changes: 4 additions & 0 deletions docs/api.rst
@@ -31,5 +31,9 @@ Spiders
.. automodule:: hepcrawl.spiders.wsp_spider
    :members:


.. automodule:: hepcrawl.spiders.infn_spider
    :members:

.. automodule:: hepcrawl.spiders.iop_spider
    :members:
158 changes: 158 additions & 0 deletions hepcrawl/extractors/nlm.py
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Common extraction from the NLM XML format."""

from __future__ import absolute_import, print_function


class NLM(object):
    """Special extractions for NLM formats."""

    @staticmethod
    def get_authors(node):
        """Get the authors."""
        authors = []
        for author in node.xpath("./AuthorList//Author"):
            surname = author.xpath("./LastName/text()").extract_first()
            firstname = author.xpath("./FirstName/text()").extract_first()
            middlename = author.xpath("./MiddleName/text()").extract_first()
            affiliations = author.xpath(".//Affiliation/text()").extract()

            if not surname:
                surname = ""
            # Default given_names outside the branch so it is always defined.
            given_names = ""
            if firstname and middlename:
                given_names = "{} {}".format(firstname, middlename)
            elif firstname:
                given_names = firstname

            auth_dict = {}
            auth_dict["surname"] = surname
            auth_dict["given_names"] = given_names
            if affiliations:
                auth_dict["affiliations"] = [
                    {"value": aff} for aff in affiliations]
            authors.append(auth_dict)

        return authors

    @staticmethod
    def get_collections(doctype):
        """Return the article's collections."""
        collections = ["HEP", "Citeable", "Published"]
        if doctype:
            if doctype == "Review":
                collections += ["Review"]
            if "conference" in doctype.lower():
                collections += ["ConferencePaper"]
        return collections

    @staticmethod
    def get_dois(node):
        """Get the DOIs."""
        dois = node.xpath(
            ".//ArticleIdList/ArticleId[@IdType='doi']/text()").extract()
        if not dois:
            dois = node.xpath(
                ".//ELocationID[@EIdType='doi']/text()").extract()

        return dois

    @staticmethod
    def get_date_published(node):
        """Get the publication date."""
        year = node.xpath(".//Journal/PubDate/Year/text()").extract_first()
        month = node.xpath(".//Journal/PubDate/Month/text()").extract_first()
        day = node.xpath(".//Journal/PubDate/Day/text()").extract_first()

        date_published = ""
        if year:
            date_published = year
        if month:
            date_published += "-" + month
        if day:
            date_published += "-" + day

        return date_published

    @staticmethod
    def get_pub_status(node):
        """Get the publication status.

        Possible values: "aheadofprint", "ppublish", "epublish",
        "received", "accepted", "revised", "ecollection".
        """
        pubstatus = node.xpath(".//Journal/PubDate/@PubStatus").extract_first()

        return pubstatus

    @staticmethod
    def get_doctype(node):
        """Get the publication type.

        Possible values: "Addresses", "Bibliography", "Case Reports",
        "Classical Article", "Clinical Conference", "Clinical Trial",
        "Congresses", "Consensus Development Conference",
        "Consensus Development Conference, NIH",
        "Corrected and Republished Article", "Editorial", "Festschrift",
        "Guideline", "Interview", "Journal Article", "Lectures", "Letter",
        "Meta-Analysis", "News", "Newspaper Article", "Observational Study",
        "Patient Education Handout", "Practice Guideline",
        "Published Erratum", "Retraction of Publication", "Review",
        "Video-Audio Media", "Webcasts".
        """
        pubtype = node.xpath(".//PublicationType/text()").extract_first()
        return pubtype

    @staticmethod
    def get_page_numbers(node):
        """Get page numbers and the number of pages."""
        fpage = node.xpath(".//FirstPage/text()").extract_first()
        lpage = node.xpath(".//LastPage/text()").extract_first()
        if fpage and lpage:
            journal_pages = "{}-{}".format(fpage, lpage)
            page_nr = str(int(lpage) - int(fpage) + 1)
        elif fpage:
            journal_pages = fpage
            page_nr = ""
        else:
            fpage = ""
            journal_pages = ""
            page_nr = ""

        return (
            fpage,
            journal_pages,
            page_nr,
        )
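
A minimal usage sketch (not part of this commit) of the new NLM helpers on a hand-written NLM fragment; the sample XML and the values shown in comments are illustrative assumptions, not output from the repository's test data:

# Sketch only: exercise the NLM extractor on an inline NLM fragment.
from scrapy.selector import Selector

from hepcrawl.extractors.nlm import NLM

SAMPLE = """
<Article>
    <AuthorList>
        <Author>
            <FirstName>Jane</FirstName>
            <MiddleName>Q.</MiddleName>
            <LastName>Doe</LastName>
            <Affiliation>CERN</Affiliation>
        </Author>
    </AuthorList>
    <FirstPage>336</FirstPage>
    <LastPage>345</LastPage>
</Article>
"""

# The spider hands these helpers a selector positioned on an <Article>
# node (its itertag), so position the selector the same way here.
node = Selector(text=SAMPLE, type="xml").xpath("//Article")[0]

print(NLM.get_authors(node))
# [{'surname': 'Doe', 'given_names': 'Jane Q.',
#   'affiliations': [{'value': 'CERN'}]}]
print(NLM.get_page_numbers(node))
# ('336', '336-345', '10')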
209 changes: 209 additions & 0 deletions hepcrawl/spiders/iop_spider.py
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for IOP."""

from __future__ import absolute_import, print_function

import os
import tarfile

from tempfile import mkdtemp

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from ..extractors.nlm import NLM
from ..items import HEPRecord
from ..loaders import HEPLoader


class IOPSpider(XMLFeedSpider, NLM):
    """IOPSpider crawler.

    This spider should first be able to harvest files from IOP STACKS
    (http://stacks.iop.org/Member/). Then it should scrape through the files
    and get the things we want.

    XML files are in NLM PubMed format:
    http://www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.XML_Tag_Descriptions
    Example of a standard XML:
    http://www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.Example_of_a_Standard_XML

    Workflow:

    1. Fetch gzipped data packages from STACKS.
    2. Scrape the XML files inside.
    3. Return valid JSON records.

    You can also call this spider directly on a gzip package or an XML file.
    If called without arguments, it will attempt to fetch files from STACKS.

    Example usage:

    .. code-block:: console

        scrapy crawl iop -a xml_file=file://`pwd`/tests/responses/iop/xml/test_standard.xml
        scrapy crawl iop -a zip_file=file://`pwd`/tests/responses/iop/packages/test.tar.gz -a xml_file=file://`pwd`/tests/responses/iop/xml/test_standard.xml
        scrapy crawl iop -a pdf_files=`pwd`/tests/responses/iop/pdf/ -a xml_file=file://`pwd`/tests/responses/iop/xml/test_standard.xml

    For JSON output, add ``-s "JSON_OUTPUT_DIR=tmp/"``.
    For logging, add ``-s "LOG_FILE=iop.log"``.

    Happy crawling!
    """

    name = 'iop'
    start_urls = []
    iterator = 'xml'
    itertag = 'Article'

    OPEN_ACCESS_JOURNALS = {
        "J. Phys.: Conf. Ser.",
        # FIXME: add more
    }

    def __init__(self, zip_file=None, xml_file=None, pdf_files=None,
                 *args, **kwargs):
        """Construct IOP spider."""
        super(IOPSpider, self).__init__(*args, **kwargs)
        self.zip_file = zip_file
        self.xml_file = xml_file
        self.pdf_files = pdf_files

    def start_requests(self):
        """Spider can be run on a record XML file.

        In addition, a gzipped package containing PDF files or the path to
        the PDF files can be given. If no arguments are given, it should try
        to get the package from STACKS.
        """
        if self.xml_file:
            if not self.pdf_files and self.zip_file:
                self.pdf_files = self.handle_package(self.zip_file)
            request = Request(self.xml_file)
            if self.pdf_files:
                request.meta["pdf_files"] = self.pdf_files
            yield request
        # else:
        #     self.fetch_packages_from_stacks()

    # def fetch_packages_from_stacks(self):
    #     """Get the newest PDF package from STACKS. It requires authentication."""
    #     # FIXME: IOP STACKS is not working properly. In any case, XMLs
    #     # are not bundled in this package?
    #     package = requests.get(
    #         "http://stacks.iop.org/Member/lload.tar.gz",
    #         auth=('user', 'pass')
    #     )
    #     # Write package contents to self.zip_file
    #     yield Request(self.zip_file, callback=self.handle_package)

    def handle_package(self, zip_file):
        """Extract all the PDF files in the gzip package."""
        # str.rstrip strips a character set rather than a suffix, so
        # rstrip(".tar.gz") would also eat trailing 't'/'a'/'r'/'g'/'z'
        # letters from the basename; slice the extension off instead.
        filename = os.path.basename(zip_file)
        if filename.endswith(".tar.gz"):
            filename = filename[:-len(".tar.gz")]
        # FIXME: should the files be permanently stored somewhere?
        # TMP dir to extract zip packages:
        target_folder = mkdtemp(
            prefix="iop" + filename + "_", dir="/tmp/")
        zip_filepath = zip_file.replace("file://", "")
        self.untar_files(zip_filepath, target_folder)

        return target_folder

    @staticmethod
    def untar_files(zip_filepath, target_folder):
        """Unpack a tar.gz package while flattening the dir structure.

        Return a list of PDF paths.
        """
        pdf_files = []
        with tarfile.open(zip_filepath, "r:gz") as tar:
            for filename in tar.getmembers():
                if filename.path.endswith(".pdf"):
                    filename.name = os.path.basename(filename.name)
                    absolute_path = os.path.join(target_folder, filename.path)
                    if not os.path.exists(absolute_path):
                        tar.extract(filename, path=target_folder)
                    pdf_files.append(absolute_path)

        return pdf_files

    def get_pdf_path(self, vol, issue, fpage):
        """Get the path of the correct PDF."""
        pattern = "{}_{}_{}.pdf".format(vol, issue, fpage)
        for pdf_path in os.listdir(self.pdf_files):
            if pattern in pdf_path:
                return os.path.join(self.pdf_files, pdf_path)

    def add_fft_file(self, file_path, file_access, file_type):
        """Create a structured dictionary to add to the 'files' item."""
        file_dict = {
            "access": file_access,
            "description": self.name.upper(),
            "url": file_path,
            "type": file_type,
        }
        return file_dict

    def parse_node(self, response, node):
        """Parse the record XML and create a HEPRecord."""
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        pub_status = self.get_pub_status(node)
        if pub_status in {"aheadofprint", "received"}:
            return None

        fpage, journal_pages, page_nr = self.get_page_numbers(node)
        volume = node.xpath(".//Journal/Volume/text()").extract_first()
        issue = node.xpath(".//Journal/Issue/text()").extract_first()

        # FIXME: should we add this to 'additional_files':
        # xml_file_path = response.url

        # FIXME: how to use this information:
        # replaces_article_with_this_doi = node.xpath(
        #     "//Replaces[@IdType='doi']/text()")

        record.add_value("journal_pages", journal_pages)
        record.add_xpath('abstract', ".//Abstract")
        record.add_xpath("title", ".//ArticleTitle")
        record.add_value('authors', self.get_authors(node))
        journal_title = node.xpath(
            ".//Journal/JournalTitle/text()").extract_first()
        record.add_value("journal_title", journal_title)
        record.add_value("journal_issue", issue)
        record.add_value("journal_volume", volume)
        record.add_xpath("journal_issn", ".//Journal/Issn/text()")
        record.add_value("dois", self.get_dois(node))
        record.add_xpath("journal_year", ".//Journal/PubDate/Year/text()")
        record.add_xpath("language", ".//Language/text()")
        record.add_value("page_nr", page_nr)
        record.add_value('date_published', self.get_date_published(node))
        record.add_xpath('copyright_statement',
                         "./CopyrightInformation/text()")
        record.add_xpath('copyright_holder', "//Journal/PublisherName/text()")
        record.add_xpath(
            'free_keywords',
            "ObjectList/Object[@Type='keyword']/Param[@Name='value']/text()")

        doctype = self.get_doctype(node)
        record.add_value('collections', self.get_collections(doctype))

        if self.pdf_files:
            pdf_file_path = self.get_pdf_path(volume, issue, fpage)
            if pdf_file_path:
                if doctype and "erratum" in doctype.lower():
                    file_type = "Erratum"
                else:
                    file_type = "Fulltext"
                if journal_title in self.OPEN_ACCESS_JOURNALS:
                    file_access = "INSPIRE-PUBLIC"  # FIXME: right?
                else:
                    file_access = "INSPIRE-HIDDEN"
                record.add_value(
                    "additional_files",
                    self.add_fft_file(pdf_file_path, file_access, file_type))

        return record.load_item()
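
Since the commit message mentions added tests, here is a rough offline sketch (not code from this commit) of exercising parse_node against the bundled test XML; the fake_response helper and the printed fields are assumptions for illustration:

# Sketch only: run IOPSpider.parse_node on a local NLM XML file, no crawling.
import os

from scrapy.http import TextResponse
from scrapy.selector import Selector

from hepcrawl.spiders.iop_spider import IOPSpider


def fake_response(path):
    """Wrap a local file in a Response object."""
    with open(path) as f:
        body = f.read()
    return TextResponse(
        url="file://" + os.path.abspath(path), body=body, encoding="utf-8")


spider = IOPSpider()
response = fake_response("tests/responses/iop/xml/test_standard.xml")
# Mimic XMLFeedSpider's iteration: position a selector on the first
# <Article> node (the spider's itertag).
node = Selector(response, type="xml").xpath("//" + spider.itertag)[0]
record = spider.parse_node(response, node)
print(record.get("dois"), record.get("journal_title"))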
Binary file added tests/responses/iop/packages/test.tar.gz
Binary file added tests/responses/iop/pdf/test_143_3_336.pdf
