Commit 796fdff (1 parent: a8df04e)

* Moves some functionality to NLM extractor.
* Adds tests.

Signed-off-by: Henrik Vesterinen <henrik.vesterinen@cern.ch>

Showing 7 changed files with 757 additions and 0 deletions.
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Common extraction from the NLM XML format."""

from __future__ import absolute_import, print_function


class NLM(object):
    """Special extractions for NLM formats."""

    @staticmethod
    def get_authors(node):
        """Get the authors."""
        authors = []
        for author in node.xpath("./AuthorList//Author"):
            surname = author.xpath("./LastName/text()").extract_first()
            firstname = author.xpath("./FirstName/text()").extract_first()
            middlename = author.xpath("./MiddleName/text()").extract_first()
            affiliations = author.xpath(".//Affiliation/text()").extract()

            if not surname:
                surname = ""
            given_names = ""
            if firstname and middlename:
                given_names = "{} {}".format(firstname, middlename)
            elif firstname:
                given_names = firstname

            auth_dict = {}
            auth_dict["surname"] = surname
            auth_dict["given_names"] = given_names
            if affiliations:
                auth_dict["affiliations"] = [
                    {"value": aff} for aff in affiliations]
            authors.append(auth_dict)

        return authors

    @staticmethod
    def get_collections(doctype):
        """Return the article's collection."""
        collections = ["HEP", "Citeable", "Published"]
        if doctype:
            if doctype == "Review":
                collections += ["Review"]
            if "conference" in doctype.lower():
                collections += ["ConferencePaper"]
        return collections

    @staticmethod
    def get_dois(node):
        """Get DOI."""
        dois = node.xpath(
            ".//ArticleIdList/ArticleId[@IdType='doi']/text()").extract()
        if not dois:
            dois = node.xpath(
                ".//ELocationID[@EIdType='doi']/text()").extract()

        return dois

    @staticmethod
    def get_date_published(node):
        """Publication date."""
        year = node.xpath(".//Journal/PubDate/Year/text()").extract_first()
        month = node.xpath(".//Journal/PubDate/Month/text()").extract_first()
        day = node.xpath(".//Journal/PubDate/Day/text()").extract_first()

        date_published = ""
        if year:
            date_published = year
            if month:
                date_published += "-" + month
                if day:
                    date_published += "-" + day

        return date_published

    @staticmethod
    def get_pub_status(node):
        """Publication status.

        Cases: "aheadofprint", "ppublish", "epublish", "received",
        "accepted", "revised", "ecollection".
        """
        return node.xpath(".//Journal/PubDate/@PubStatus").extract_first()

    @staticmethod
    def get_doctype(node):
        """Publication type.

        Cases: "Addresses", "Bibliography", "Case Reports",
        "Classical Article", "Clinical Conference", "Clinical Trial",
        "Congresses", "Consensus Development Conference",
        "Consensus Development Conference, NIH",
        "Corrected and Republished Article", "Editorial", "Festschrift",
        "Guideline", "Interview", "Journal Article", "Lectures", "Letter",
        "Meta-Analysis", "News", "Newspaper Article", "Observational Study",
        "Patient Education Handout", "Practice Guideline",
        "Published Erratum", "Retraction of Publication", "Review",
        "Video-Audio Media", "Webcasts".
        """
        return node.xpath(".//PublicationType/text()").extract_first()

    @staticmethod
    def get_page_numbers(node):
        """Get page numbers and number of pages."""
        fpage = node.xpath(".//FirstPage/text()").extract_first()
        lpage = node.xpath(".//LastPage/text()").extract_first()
        if fpage and lpage:
            journal_pages = "{}-{}".format(fpage, lpage)
            page_nr = str(int(lpage) - int(fpage) + 1)
        elif fpage:
            journal_pages = fpage
            page_nr = ""
        else:
            fpage = ""
            journal_pages = ""
            page_nr = ""

        return (
            fpage,
            journal_pages,
            page_nr,
        )
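For orientation, here is a minimal usage sketch of the new extractor. It is not part of this commit; the sample XML, its values, and the expected output in the comments are invented for illustration:

# Hypothetical usage sketch, not from this commit: exercising the NLM
# static methods against an invented NLM <Article> node.
from scrapy.selector import Selector

from hepcrawl.extractors.nlm import NLM

SAMPLE_XML = """
<ArticleSet>
  <Article>
    <AuthorList>
      <Author>
        <FirstName>Jane</FirstName>
        <LastName>Doe</LastName>
        <Affiliation>CERN</Affiliation>
      </Author>
    </AuthorList>
    <ArticleIdList>
      <ArticleId IdType="doi">10.0000/example-doi</ArticleId>
    </ArticleIdList>
  </Article>
</ArticleSet>
"""

node = Selector(text=SAMPLE_XML, type="xml").xpath("//Article")[0]
print(NLM.get_authors(node))
# one author dict: surname 'Doe', given_names 'Jane',
# affiliations [{'value': 'CERN'}]
print(NLM.get_dois(node))
# ['10.0000/example-doi']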
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for IOP."""

from __future__ import absolute_import, print_function

import os
import tarfile

from tempfile import mkdtemp

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from ..extractors.nlm import NLM
from ..items import HEPRecord
from ..loaders import HEPLoader

class IOPSpider(XMLFeedSpider, NLM):
    """IOPSpider crawler.

    This spider should first harvest files from IOP STACKS
    (http://stacks.iop.org/Member/), then parse the XML records inside
    them and extract the metadata we want.

    XML files are in the NLM PubMed format:
    http://www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.XML_Tag_Descriptions
    Example:
    http://www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.Example_of_a_Standard_XML

    1. Fetch gzipped data packages from STACKS.
    2. Scrape the XML files inside.
    3. Return valid JSON records.

    You can also call this spider directly on a gzip package or an XML
    file. If called without arguments, it will attempt to fetch files
    from STACKS.

    Example usage:

    .. code-block:: console

        scrapy crawl iop -a xml_file=file://`pwd`/tests/responses/iop/xml/test_standard.xml
        scrapy crawl iop -a zip_file=file://`pwd`/tests/responses/iop/packages/test.tar.gz -a xml_file=file://`pwd`/tests/responses/iop/xml/test_standard.xml
        scrapy crawl iop -a pdf_files=`pwd`/tests/responses/iop/pdf/ -a xml_file=file://`pwd`/tests/responses/iop/xml/test_standard.xml

    For JSON output, add -s "JSON_OUTPUT_DIR=tmp/".
    For logging, add -s "LOG_FILE=iop.log".

    Happy crawling!
    """

    name = 'iop'
    start_urls = []
    iterator = 'xml'
    itertag = 'Article'

    OPEN_ACCESS_JOURNALS = {
        "J. Phys.: Conf. Ser.",
        # FIXME: add more
    }

    def __init__(self, zip_file=None, xml_file=None, pdf_files=None,
                 *args, **kwargs):
        """Construct IOP spider."""
        super(IOPSpider, self).__init__(*args, **kwargs)
        self.zip_file = zip_file
        self.xml_file = xml_file
        self.pdf_files = pdf_files

    def start_requests(self):
        """Spider can be run on a record XML file.

        In addition, a gzipped package containing PDF files or the path
        to the PDF files can be given. If no arguments are given, it
        should try to get the package from STACKS.
        """
        if self.xml_file:
            if not self.pdf_files and self.zip_file:
                self.pdf_files = self.handle_package(self.zip_file)
            request = Request(self.xml_file)
            if self.pdf_files:
                request.meta["pdf_files"] = self.pdf_files
            yield request
        # else:
        #     self.fetch_packages_from_stacks()

    # def fetch_packages_from_stacks(self):
    #     """Get the newest PDF package from STACKS. It requires authentication."""
    #     # FIXME: IOP STACKS is not working properly. In any case, XMLs
    #     # are not bundled in this package?
    #     package = requests.get(
    #         "http://stacks.iop.org/Member/lload.tar.gz",
    #         auth=('user', 'pass')
    #     )
    #     # Write package contents to self.zip_file
    #     yield Request(self.zip_file, callback=self.handle_package)

    def handle_package(self, zip_file):
        """Extract all the PDF files in the gzip package."""
        # NOTE: str.rstrip strips a *character set*, not a suffix, and
        # would mangle names like "data.tar.gz"; slice the suffix off.
        filename = os.path.basename(zip_file)
        if filename.endswith(".tar.gz"):
            filename = filename[:-len(".tar.gz")]
        # FIXME: should the files be permanently stored somewhere?
        # TMP dir to extract zip packages:
        target_folder = mkdtemp(
            prefix="iop" + filename + "_", dir="/tmp/")
        zip_filepath = zip_file.replace("file://", "")
        self.untar_files(zip_filepath, target_folder)

        return target_folder

    @staticmethod
    def untar_files(zip_filepath, target_folder):
        """Unpack a tar.gz package while flattening the dir structure.

        Return list of PDF paths.
        """
        pdf_files = []
        with tarfile.open(zip_filepath, "r:gz") as tar:
            # Each member is a TarInfo; rename it to its basename so the
            # extracted tree is flat.
            for member in tar.getmembers():
                if member.path.endswith(".pdf"):
                    member.name = os.path.basename(member.name)
                    absolute_path = os.path.join(target_folder, member.path)
                    if not os.path.exists(absolute_path):
                        tar.extract(member, path=target_folder)
                    pdf_files.append(absolute_path)

        return pdf_files

    def get_pdf_path(self, vol, issue, fpage):
        """Get path for the correct pdf."""
        pattern = "{}_{}_{}.pdf".format(vol, issue, fpage)
        for pdf_path in os.listdir(self.pdf_files):
            if pattern in pdf_path:
                return os.path.join(self.pdf_files, pdf_path)

    def add_fft_file(self, file_path, file_access, file_type):
        """Create a structured dictionary and add to 'files' item."""
        file_dict = {
            "access": file_access,
            "description": self.name.upper(),
            "url": file_path,
            "type": file_type,
        }
        return file_dict

    def parse_node(self, response, node):
        """Parse the record XML and create a HEPRecord."""
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        pub_status = self.get_pub_status(node)
        if pub_status in {"aheadofprint", "received"}:
            return None

        fpage, journal_pages, page_nr = self.get_page_numbers(node)
        volume = node.xpath(".//Journal/Volume/text()").extract_first()
        issue = node.xpath(".//Journal/Issue/text()").extract_first()

        # FIXME: should we add this to 'additional_files':
        # xml_file_path = response.url

        # FIXME: how to use this information:
        # replaces_article_with_this_doi = node.xpath(
        #     "//Replaces[@IdType='doi']/text()")

        record.add_value("journal_pages", journal_pages)
        record.add_xpath('abstract', ".//Abstract")
        record.add_xpath("title", ".//ArticleTitle")
        record.add_value('authors', self.get_authors(node))
        journal_title = node.xpath(
            ".//Journal/JournalTitle/text()").extract_first()
        record.add_value("journal_title", journal_title)
        record.add_value("journal_issue", issue)
        record.add_value("journal_volume", volume)
        record.add_xpath("journal_issn", ".//Journal/Issn/text()")
        record.add_value("dois", self.get_dois(node))
        record.add_xpath("journal_year", ".//Journal/PubDate/Year/text()")
        record.add_xpath("language", ".//Language/text()")
        record.add_value("page_nr", page_nr)
        record.add_value('date_published', self.get_date_published(node))
        record.add_xpath('copyright_statement',
                         "./CopyrightInformation/text()")
        record.add_xpath('copyright_holder', "//Journal/PublisherName/text()")
        record.add_xpath(
            'free_keywords',
            "ObjectList/Object[@Type='keyword']/Param[@Name='value']/text()")

        doctype = self.get_doctype(node)
        record.add_value('collections', self.get_collections(doctype))

        if self.pdf_files:
            pdf_file_path = self.get_pdf_path(volume, issue, fpage)
            if pdf_file_path:
                if doctype and "erratum" in doctype.lower():
                    file_type = "Erratum"
                else:
                    file_type = "Fulltext"
                if journal_title in self.OPEN_ACCESS_JOURNALS:
                    file_access = "INSPIRE-PUBLIC"  # FIXME: right?
                else:
                    file_access = "INSPIRE-HIDDEN"
                record.add_value(
                    "additional_files",
                    self.add_fft_file(pdf_file_path, file_access, file_type))

        return record.load_item()
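And a test-style sketch of driving parse_node directly. This is not taken from the commit's test files; the sample XML path comes from the spider docstring, and the helper function is an invented illustration:

# Hypothetical test sketch, not from this commit's tests.
from scrapy.http import TextResponse
from scrapy.selector import Selector

from hepcrawl.spiders.iop_spider import IOPSpider


def parse_first_record(xml_path):
    """Run IOPSpider.parse_node on the first <Article> in an XML file."""
    with open(xml_path) as fd:
        body = fd.read()
    response = TextResponse(
        url="file://" + xml_path, body=body, encoding="utf-8")
    node = Selector(response, type="xml").xpath("//Article")[0]
    # No pdf_files are set here, so no 'additional_files' get attached.
    return IOPSpider().parse_node(response, node)


record = parse_first_record("tests/responses/iop/xml/test_standard.xml")
print(record.get("dois"))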