Commit 796fdff (1 parent: a8df04e)

* Moves some functionality to NLM extractor.
* Adds tests.

Signed-off-by: Henrik Vesterinen <henrik.vesterinen@cern.ch>

Showing 7 changed files with 757 additions and 0 deletions.
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Common extraction from the NLM XML format."""

from __future__ import absolute_import, print_function


class NLM(object):
    """Special extractions for NLM formats."""

    @staticmethod
    def get_authors(node):
        """Get the authors."""
        authors = []
        for author in node.xpath("./AuthorList//Author"):
            surname = author.xpath("./LastName/text()").extract_first()
            firstname = author.xpath("./FirstName/text()").extract_first()
            middlename = author.xpath("./MiddleName/text()").extract_first()
            affiliations = author.xpath(".//Affiliation/text()").extract()

            if not surname:
                surname = ""
            given_names = ""
            if firstname and middlename:
                given_names = "{} {}".format(firstname, middlename)
            elif firstname:
                given_names = firstname

            auth_dict = {}
            auth_dict["surname"] = surname
            auth_dict["given_names"] = given_names
            if affiliations:
                auth_dict["affiliations"] = [
                    {"value": aff} for aff in affiliations]
            authors.append(auth_dict)

        return authors

    @staticmethod
    def get_collections(doctype):
        """Return the article's collection."""
        collections = ["HEP", "Citeable", "Published"]
        if doctype:
            if doctype == "Review":
                collections += ["Review"]
            if "conference" in doctype.lower():
                collections += ["ConferencePaper"]
        return collections

    @staticmethod
    def get_dois(node):
        """Get DOI."""
        dois = node.xpath(
            ".//ArticleIdList/ArticleId[@IdType='doi']/text()").extract()
        if not dois:
            dois = node.xpath(
                ".//ELocationID[@EIdType='doi']/text()").extract()

        return dois

    @staticmethod
    def get_date_published(node):
        """Publication date."""
        year = node.xpath(".//Journal/PubDate/Year/text()").extract_first()
        month = node.xpath(".//Journal/PubDate/Month/text()").extract_first()
        day = node.xpath(".//Journal/PubDate/Day/text()").extract_first()

        date_published = ""
        if year:
            date_published = year
            if month:
                date_published += "-" + month
                if day:
                    date_published += "-" + day

        return date_published

    @staticmethod
    def get_pub_status(node):
        """Publication status.

        Cases: "aheadofprint", "ppublish", "epublish", "received",
        "accepted", "revised", "ecollection".
        """
        return node.xpath(".//Journal/PubDate/@PubStatus").extract_first()

    @staticmethod
    def get_doctype(node):
        """Publication type.

        Cases: "Addresses", "Bibliography", "Case Reports",
        "Classical Article", "Clinical Conference", "Clinical Trial",
        "Congresses", "Consensus Development Conference",
        "Consensus Development Conference, NIH",
        "Corrected and Republished Article", "Editorial", "Festschrift",
        "Guideline", "Interview", "Journal Article", "Lectures", "Letter",
        "Meta-Analysis", "News", "Newspaper Article", "Observational Study",
        "Patient Education Handout", "Practice Guideline",
        "Published Erratum", "Retraction of Publication", "Review",
        "Video-Audio Media", "Webcasts".
        """
        return node.xpath(".//PublicationType/text()").extract_first()

    @staticmethod
    def get_page_numbers(node):
        """Get page numbers and number of pages."""
        fpage = node.xpath(".//FirstPage/text()").extract_first()
        lpage = node.xpath(".//LastPage/text()").extract_first()
        if fpage and lpage:
            journal_pages = "{}-{}".format(fpage, lpage)
            page_nr = str(int(lpage) - int(fpage) + 1)
        elif fpage:
            journal_pages = fpage
            page_nr = ""
        else:
            fpage = ""
            journal_pages = ""
            page_nr = ""

        return (
            fpage,
            journal_pages,
            page_nr,
        )
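For orientation, here is a minimal usage sketch of the new extractor. It is not part of this commit; the sample XML, its values, and the expected output in the comments are invented for illustration:

# Hypothetical usage sketch, not from this commit: exercising the NLM
# static methods against an invented NLM <Article> node.
from scrapy.selector import Selector

from hepcrawl.extractors.nlm import NLM

SAMPLE_XML = """
<ArticleSet>
  <Article>
    <AuthorList>
      <Author>
        <FirstName>Jane</FirstName>
        <LastName>Doe</LastName>
        <Affiliation>CERN</Affiliation>
      </Author>
    </AuthorList>
    <ArticleIdList>
      <ArticleId IdType="doi">10.0000/example-doi</ArticleId>
    </ArticleIdList>
  </Article>
</ArticleSet>
"""

node = Selector(text=SAMPLE_XML, type="xml").xpath("//Article")[0]
print(NLM.get_authors(node))
# one author dict: surname 'Doe', given_names 'Jane',
# affiliations [{'value': 'CERN'}]
print(NLM.get_dois(node))
# ['10.0000/example-doi']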
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for IOP."""

from __future__ import absolute_import, print_function

import os
import tarfile

from tempfile import mkdtemp

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from ..extractors.nlm import NLM
from ..items import HEPRecord
from ..loaders import HEPLoader

class IOPSpider(XMLFeedSpider, NLM):
    """IOPSpider crawler.

    This spider should first harvest files from IOP STACKS
    (http://stacks.iop.org/Member/), then parse the XML records inside
    them and extract the metadata we want.

    XML files are in the NLM PubMed format:
    http://www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.XML_Tag_Descriptions
    Example:
    http://www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.Example_of_a_Standard_XML

    1. Fetch gzipped data packages from STACKS.
    2. Scrape the XML files inside.
    3. Return valid JSON records.

    You can also call this spider directly on a gzip package or an XML
    file. If called without arguments, it will attempt to fetch files
    from STACKS.

    Example usage:

    .. code-block:: console

        scrapy crawl iop -a xml_file=file://`pwd`/tests/responses/iop/xml/test_standard.xml
        scrapy crawl iop -a zip_file=file://`pwd`/tests/responses/iop/packages/test.tar.gz -a xml_file=file://`pwd`/tests/responses/iop/xml/test_standard.xml
        scrapy crawl iop -a pdf_files=`pwd`/tests/responses/iop/pdf/ -a xml_file=file://`pwd`/tests/responses/iop/xml/test_standard.xml

    For JSON output, add -s "JSON_OUTPUT_DIR=tmp/".
    For logging, add -s "LOG_FILE=iop.log".

    Happy crawling!
    """

    name = 'iop'
    start_urls = []
    iterator = 'xml'
    itertag = 'Article'

    OPEN_ACCESS_JOURNALS = {
        "J. Phys.: Conf. Ser.",
        # FIXME: add more
    }

    def __init__(self, zip_file=None, xml_file=None, pdf_files=None,
                 *args, **kwargs):
        """Construct IOP spider."""
        super(IOPSpider, self).__init__(*args, **kwargs)
        self.zip_file = zip_file
        self.xml_file = xml_file
        self.pdf_files = pdf_files

    def start_requests(self):
        """Spider can be run on a record XML file.

        In addition, a gzipped package containing PDF files or the path
        to the PDF files can be given. If no arguments are given, it
        should try to get the package from STACKS.
        """
        if self.xml_file:
            if not self.pdf_files and self.zip_file:
                self.pdf_files = self.handle_package(self.zip_file)
            request = Request(self.xml_file)
            if self.pdf_files:
                request.meta["pdf_files"] = self.pdf_files
            yield request
        # else:
        #     self.fetch_packages_from_stacks()

    # def fetch_packages_from_stacks(self):
    #     """Get the newest PDF package from STACKS. It requires authentication."""
    #     # FIXME: IOP STACKS is not working properly. In any case, XMLs
    #     # are not bundled in this package?
    #     package = requests.get(
    #         "http://stacks.iop.org/Member/lload.tar.gz",
    #         auth=('user', 'pass')
    #     )
    #     # Write package contents to self.zip_file
    #     yield Request(self.zip_file, callback=self.handle_package)

    def handle_package(self, zip_file):
        """Extract all the PDF files in the gzip package."""
        # NOTE: str.rstrip strips a *character set*, not a suffix, and
        # would mangle names like "data.tar.gz"; slice the suffix off.
        filename = os.path.basename(zip_file)
        if filename.endswith(".tar.gz"):
            filename = filename[:-len(".tar.gz")]
        # FIXME: should the files be permanently stored somewhere?
        # TMP dir to extract zip packages:
        target_folder = mkdtemp(
            prefix="iop" + filename + "_", dir="/tmp/")
        zip_filepath = zip_file.replace("file://", "")
        self.untar_files(zip_filepath, target_folder)

        return target_folder

    @staticmethod
    def untar_files(zip_filepath, target_folder):
        """Unpack a tar.gz package while flattening the dir structure.

        Return list of PDF paths.
        """
        pdf_files = []
        with tarfile.open(zip_filepath, "r:gz") as tar:
            # Each member is a TarInfo; rename it to its basename so the
            # extracted tree is flat.
            for member in tar.getmembers():
                if member.path.endswith(".pdf"):
                    member.name = os.path.basename(member.name)
                    absolute_path = os.path.join(target_folder, member.path)
                    if not os.path.exists(absolute_path):
                        tar.extract(member, path=target_folder)
                    pdf_files.append(absolute_path)

        return pdf_files

    def get_pdf_path(self, vol, issue, fpage):
        """Get path for the correct pdf."""
        pattern = "{}_{}_{}.pdf".format(vol, issue, fpage)
        for pdf_path in os.listdir(self.pdf_files):
            if pattern in pdf_path:
                return os.path.join(self.pdf_files, pdf_path)

    def add_fft_file(self, file_path, file_access, file_type):
        """Create a structured dictionary and add to 'files' item."""
        file_dict = {
            "access": file_access,
            "description": self.name.upper(),
            "url": file_path,
            "type": file_type,
        }
        return file_dict

    def parse_node(self, response, node):
        """Parse the record XML and create a HEPRecord."""
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        pub_status = self.get_pub_status(node)
        if pub_status in {"aheadofprint", "received"}:
            return None

        fpage, journal_pages, page_nr = self.get_page_numbers(node)
        volume = node.xpath(".//Journal/Volume/text()").extract_first()
        issue = node.xpath(".//Journal/Issue/text()").extract_first()

        # FIXME: should we add this to 'additional_files':
        # xml_file_path = response.url

        # FIXME: how to use this information:
        # replaces_article_with_this_doi = node.xpath(
        #     "//Replaces[@IdType='doi']/text()")

        record.add_value("journal_pages", journal_pages)
        record.add_xpath('abstract', ".//Abstract")
        record.add_xpath("title", ".//ArticleTitle")
        record.add_value('authors', self.get_authors(node))
        journal_title = node.xpath(
            ".//Journal/JournalTitle/text()").extract_first()
        record.add_value("journal_title", journal_title)
        record.add_value("journal_issue", issue)
        record.add_value("journal_volume", volume)
        record.add_xpath("journal_issn", ".//Journal/Issn/text()")
        record.add_value("dois", self.get_dois(node))
        record.add_xpath("journal_year", ".//Journal/PubDate/Year/text()")
        record.add_xpath("language", ".//Language/text()")
        record.add_value("page_nr", page_nr)
        record.add_value('date_published', self.get_date_published(node))
        record.add_xpath('copyright_statement',
                         "./CopyrightInformation/text()")
        record.add_xpath('copyright_holder', "//Journal/PublisherName/text()")
        record.add_xpath(
            'free_keywords',
            "ObjectList/Object[@Type='keyword']/Param[@Name='value']/text()")

        doctype = self.get_doctype(node)
        record.add_value('collections', self.get_collections(doctype))

        if self.pdf_files:
            pdf_file_path = self.get_pdf_path(volume, issue, fpage)
            if pdf_file_path:
                if doctype and "erratum" in doctype.lower():
                    file_type = "Erratum"
                else:
                    file_type = "Fulltext"
                if journal_title in self.OPEN_ACCESS_JOURNALS:
                    file_access = "INSPIRE-PUBLIC"  # FIXME: right?
                else:
                    file_access = "INSPIRE-HIDDEN"
                record.add_value(
                    "additional_files",
                    self.add_fft_file(pdf_file_path, file_access, file_type))

        return record.load_item()
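And a test-style sketch of driving parse_node directly. This is not taken from the commit's test files; the sample XML path comes from the spider docstring, and the helper function is an invented illustration:

# Hypothetical test sketch, not from this commit's tests.
from scrapy.http import TextResponse
from scrapy.selector import Selector

from hepcrawl.spiders.iop_spider import IOPSpider


def parse_first_record(xml_path):
    """Run IOPSpider.parse_node on the first <Article> in an XML file."""
    with open(xml_path) as fd:
        body = fd.read()
    response = TextResponse(
        url="file://" + xml_path, body=body, encoding="utf-8")
    node = Selector(response, type="xml").xpath("//Article")[0]
    # No pdf_files are set here, so no 'additional_files' get attached.
    return IOPSpider().parse_node(response, node)


record = parse_first_record("tests/responses/iop/xml/test_standard.xml")
print(record.get("dois"))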