From 4b43dc09741241237c539448ffc6ba9eda84575c Mon Sep 17 00:00:00 2001 From: pazembrz Date: Tue, 20 Apr 2021 14:46:49 +0200 Subject: [PATCH] crosref_parser: accept unknown document_types as articles inspirehep/inspirehep#1817 --- hepcrawl/parsers/crossref.py | 2 +- ...ple_crossref_record_with_unknown_type.json | 728 ++++++++++++++++++ tests/unit/test_crossref.py | 32 + 3 files changed, 761 insertions(+), 1 deletion(-) create mode 100644 tests/unit/responses/crossref/sample_crossref_record_with_unknown_type.json diff --git a/hepcrawl/parsers/crossref.py b/hepcrawl/parsers/crossref.py index b5b9c16a..c0c82889 100644 --- a/hepcrawl/parsers/crossref.py +++ b/hepcrawl/parsers/crossref.py @@ -83,7 +83,7 @@ def parse(self): @property def document_type(self): doc_type = self.record.get("type") - return DOC_TYPE_MAP[doc_type] + return DOC_TYPE_MAP.get(doc_type, "article") @property def title(self): diff --git a/tests/unit/responses/crossref/sample_crossref_record_with_unknown_type.json b/tests/unit/responses/crossref/sample_crossref_record_with_unknown_type.json new file mode 100644 index 00000000..e199e9b7 --- /dev/null +++ b/tests/unit/responses/crossref/sample_crossref_record_with_unknown_type.json @@ -0,0 +1,728 @@ +{ + "status":"ok", + "message-type":"work", + "message-version":"1.0.0", + "message":{ + "indexed":{ + "date-parts":[ + [ + 2018, + 5, + 9 + ] + ], + "date-time":"2018-05-09T08:02:32Z", + "timestamp":1525852952818 + }, + "reference-count":100, + "publisher":"American Physical Society (APS)", + "issue":"1", + "license":[ + { + "URL":"http:\/\/link.aps.org\/licenses\/aps-default-license", + "start":{ + "date-parts":[ + [ + 2016, + 1, + 11 + ] + ], + "date-time":"2016-01-11T00:00:00Z", + "timestamp":1452470400000 + }, + "delay-in-days":0, + "content-version":"vor" + }, + { + "URL":"http:\/\/link.aps.org\/licenses\/aps-default-accepted-manuscript-license", + "start":{ + "date-parts":[ + [ + 2017, + 1, + 10 + ] + ], + "date-time":"2017-01-10T00:00:00Z", + 
"timestamp":1484006400000 + }, + "delay-in-days":365, + "content-version":"am" + } + ], + "funder":[ + { + "DOI":"10.13039\/100000015", + "name":"U.S. Department of Energy", + "doi-asserted-by":"publisher", + "award":[ + "DE-FG02-00ER41132" + ] + } + ], + "content-domain":{ + "domain":[ + + ], + "crossmark-restriction":false + }, + "short-container-title":[ + "Phys. Rev. D" + ], + "DOI":"10.1103\/physrevd.93.016005", + "type":"something-unknown", + "created":{ + "date-parts":[ + [ + 2016, + 1, + 11 + ] + ], + "date-time":"2016-01-11T22:03:09Z", + "timestamp":1452549789000 + }, + "source":"Crossref", + "is-referenced-by-count":7, + "title":[ + "Perturbative renormalization of neutron-antineutron operators" + ], + "prefix":"10.1103", + "volume":"93", + "author":[ + { + "given":"Michael I.", + "family":"Buchoff", + "sequence":"first", + "affiliation":[ + + ] + }, + { + "family":"Wagman", + "sequence":"additional", + "affiliation":[ + + ] + } + ], + "member":"16", + "published-online":{ + "date-parts":[ + [ + 2016, + 1, + 11 + ] + ] + }, + "reference":[ + { + "key":"PhysRevD.93.016005Cc1R1", + "DOI":"10.1051\/0004-6361\/201321591", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc2R1", + "first-page":"32", + "volume":"5", + "author":"A. Sakharov", + "year":"1967", + "journal-title":"Pis\u2019ma Zh. Eksp. Teor. 
Fiz.", + "ISSN":"http:\/\/id.crossref.org\/issn\/0370-274X", + "issn-type":"print" + }, + { + "key":"PhysRevD.93.016005Cc3R1", + "DOI":"10.1103\/PhysRevLett.37.8", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc4R1", + "DOI":"10.1103\/PhysRevD.14.3432", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc5R1", + "DOI":"10.1016\/0370-2693(85)91028-7", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc6R1", + "DOI":"10.1146\/annurev.ns.43.120193.000331", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc7R1", + "DOI":"10.1016\/S0370-2693(98)01009-0", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc8R1", + "DOI":"10.1063\/1.3327552", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc9R1", + "DOI":"10.1103\/PhysRevLett.102.141801", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc10R1", + "DOI":"10.1103\/PhysRevD.86.012006", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc11R1", + "DOI":"10.1103\/PhysRevD.90.072005", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc12R1", + "author":"S. 
Glashow", + "year":"1979", + "volume-title":"Proceedings of the 7th Neutrino International Conference on Neutrinos, Weak Interactions, and Cosmology (Neutrino \u201979)" + }, + { + "key":"PhysRevD.93.016005Cc13R1", + "DOI":"10.1103\/PhysRevLett.44.1316", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc14R1", + "DOI":"10.1103\/PhysRevD.59.055004", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc15R1", + "DOI":"10.1016\/S0370-2693(99)00766-2", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc16R1", + "DOI":"10.1016\/S0370-2693(01)01077-2", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc17R1", + "DOI":"10.1103\/PhysRevLett.88.171601", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc18R1", + "DOI":"10.1016\/j.physrep.2005.08.006", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc19R1", + "DOI":"10.1103\/PhysRevLett.93.201301", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc20R1", + "DOI":"10.1016\/j.nuclphysb.2006.11.010", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc21R1", + "DOI":"10.1016\/j.nuclphysb.2006.06.035", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc22R1", + "DOI":"10.1103\/PhysRevD.79.015017", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc23R1", + "DOI":"10.1103\/PhysRevD.81.106010", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc24R1", + "DOI":"10.1103\/PhysRevD.85.095009", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc25R1", + "DOI":"10.1103\/PhysRevD.87.075004", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc26R1", + "DOI":"10.1016\/j.physletb.2012.08.006", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc27R1", + "DOI":"10.1103\/PhysRevD.87.115019", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc29R1", + 
"DOI":"10.1103\/PhysRevD.91.015018", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc30R1", + "DOI":"10.1007\/JHEP05(2015)006", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc31R1", + "DOI":"10.1016\/j.physrep.2015.09.001", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc32R1", + "DOI":"10.1007\/JHEP04(2015)153", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc34R1", + "DOI":"10.1103\/PhysRevD.92.016007", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc36R1", + "DOI":"10.1007\/JHEP12(2014)089", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc37R1", + "DOI":"10.1007\/JHEP06(2015)012", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc38R1", + "DOI":"10.1007\/JHEP07(2015)144", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc40R1", + "DOI":"10.1007\/BF01580321", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc41R1", + "DOI":"10.1103\/PhysRevD.91.072006", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc42R1", + "DOI":"10.1103\/PhysRevD.78.016002", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc46R1", + "DOI":"10.1016\/0370-2693(80)90314-7", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc47R1", + "DOI":"10.1103\/PhysRevLett.45.93", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc48R1", + "DOI":"10.1016\/0370-2693(82)90333-1", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc49R1", + "DOI":"10.1016\/0550-3213(84)90365-1", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc50R1", + "DOI":"10.1016\/0370-2693(83)91585-X", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc51R1", + "DOI":"10.1103\/PhysRevD.91.096010", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc52R1", + "DOI":"10.1103\/PhysRevD.91.096009", + 
"doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc52R2", + "DOI":"10.1103\/PhysRevD.91.119905", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc53R1", + "DOI":"10.1016\/0370-2693(83)90342-8", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc54R1", + "author":"M.\u2009I. Buchoff", + "volume-title":"Proc. Sci." + }, + { + "key":"PhysRevD.93.016005Cc55R1", + "DOI":"10.1140\/epjc\/s10052-014-2890-7", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc56R1", + "DOI":"10.1016\/0550-3213(95)00126-D", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc57R1", + "DOI":"10.1016\/0370-2693(74)90060-4", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc58R1", + "DOI":"10.1103\/PhysRevLett.33.108", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc59R1", + "DOI":"10.1016\/0550-3213(90)90223-Z", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc60R1", + "DOI":"10.1016\/0550-3213(93)90397-8", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc61R1", + "DOI":"10.1016\/S0550-3213(00)00437-5", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc62R1", + "DOI":"10.1103\/PhysRevD.78.054510", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc63R1", + "DOI":"10.1103\/PhysRevD.78.114509", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc64R1", + "DOI":"10.1103\/PhysRevD.84.014503", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc65R1", + "DOI":"10.1007\/JHEP03(2012)052", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc66R1", + "DOI":"10.1016\/0550-3213(82)90220-6", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc67R1", + "DOI":"10.1016\/0550-3213(91)90436-2", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc68R1", + "DOI":"10.1143\/ptp\/93.3.665", + 
"doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc69R1", + "DOI":"10.1103\/PhysRevD.75.014507", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc70R1", + "DOI":"10.1103\/PhysRevD.78.054505", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc71R1", + "DOI":"10.1016\/j.nuclphysb.2008.12.015", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc72R1", + "DOI":"10.1016\/j.physletb.2011.08.028", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc73R1", + "DOI":"10.1103\/PhysRevD.89.014505", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc74R1", + "DOI":"10.1007\/JHEP09(2012)052", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc76R1", + "DOI":"10.1140\/epjc\/s10052-009-1173-1", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc77R1", + "DOI":"10.1016\/0550-3213(72)90279-9", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc78R1", + "author":"J.\u2009C. Collins", + "year":"1984", + "volume-title":"Cambridge Monographs on Mathematical Physics" + }, + { + "key":"PhysRevD.93.016005Cc79R1", + "first-page":"26", + "volume":"41", + "author":"E. Egorian", + "year":"1979", + "journal-title":"Teor. Mat. Fiz.", + "ISSN":"http:\/\/id.crossref.org\/issn\/0564-6162", + "issn-type":"print" + }, + { + "key":"PhysRevD.93.016005Cc80R1", + "author":"S. Syritsyn", + "year":"2015", + "volume-title":"Proceedings of the 33rd International Symposium on Lattice Field Theory" + }, + { + "key":"PhysRevD.93.016005Cc81R1", + "author":"M.\u2009I. Buchoff", + "volume-title":"Proc. Sci." 
+ }, + { + "key":"PhysRevD.93.016005Cc82R1", + "DOI":"10.1006\/jcph.1993.1074", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc83R1", + "DOI":"10.1016\/0370-2693(91)90680-O", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc84R1", + "DOI":"10.1016\/0550-3213(95)00474-7", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc85R1", + "DOI":"10.1103\/PhysRevLett.30.1343", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc86R1", + "DOI":"10.1103\/PhysRevLett.30.1346", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc87R1", + "DOI":"10.1016\/0550-3213(74)90093-5", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc88R1", + "DOI":"10.1103\/PhysRevLett.33.244", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc89R1", + "DOI":"10.1016\/S0550-3213(98)00131-X", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc92R1", + "DOI":"10.1103\/PhysRevD.48.2250", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc93R1", + "DOI":"10.1016\/0550-3213(93)90054-S", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc95R1", + "DOI":"10.1016\/j.cpc.2004.05.001", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc96R1", + "first-page":"179", + "volume":"103", + "author":"A.\u2009N. Vasiliev", + "year":"1995", + "journal-title":"Teor. Mat. 
Fiz.", + "ISSN":"http:\/\/id.crossref.org\/issn\/0564-6162", + "issn-type":"print" + }, + { + "key":"PhysRevD.93.016005Cc96R2", + "DOI":"10.1007\/BF02274026", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc97R1", + "DOI":"10.1007\/BF01018394", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc98R1", + "DOI":"10.1016\/S0370-1573(02)00017-0", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc99R1", + "DOI":"10.1016\/0550-3213(81)90199-1", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc100R1", + "DOI":"10.1007\/BF01086253", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc101R1", + "DOI":"10.1142\/S0217751X04016775", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc102R1", + "DOI":"10.1016\/0550-3213(79)90605-9", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc103R1", + "DOI":"10.1088\/0305-4470\/25\/21\/017", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc104R1", + "DOI":"10.1016\/0370-2693(93)91834-A", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc105R1", + "DOI":"10.1016\/0550-3213(93)90338-P", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc106R1", + "DOI":"10.1016\/0550-3213(94)90325-5", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc107R1", + "DOI":"10.1016\/0550-3213(79)90234-7", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc108R1", + "DOI":"10.1016\/0370-2693(91)91715-8", + "doi-asserted-by":"publisher" + }, + { + "key":"PhysRevD.93.016005Cc109R1", + "DOI":"10.1002\/prop.19930410402", + "doi-asserted-by":"publisher" + } + ], + "container-title":[ + "Physical Review D" + ], + "original-title":[ + + ], + "language":"en", + "link":[ + { + "URL":"http:\/\/link.aps.org\/accepted\/10.1103\/PhysRevD.93.016005", + "content-type":"application\/pdf", + "content-version":"am", + 
"intended-application":"unspecified" + }, + { + "URL":"http:\/\/link.aps.org\/article\/10.1103\/PhysRevD.93.016005", + "content-type":"unspecified", + "content-version":"vor", + "intended-application":"syndication" + }, + { + "URL":"http:\/\/harvest.aps.org\/v2\/journals\/articles\/10.1103\/PhysRevD.93.016005\/fulltext", + "content-type":"unspecified", + "content-version":"vor", + "intended-application":"similarity-checking" + } + ], + "deposited":{ + "date-parts":[ + [ + 2017, + 4, + 6 + ] + ], + "date-time":"2017-04-06T16:02:24Z", + "timestamp":1491494544000 + }, + "score":1.0, + "subtitle":[ + + ], + "short-title":[ + + ], + "issued":{ + "date-parts":[ + [ + 2016, + 1, + 11 + ] + ] + }, + "references-count":100, + "journal-issue":{ + "published-print":{ + "date-parts":[ + [ + 2016, + 1 + ] + ] + }, + "issue":"1" + }, + "URL":"http:\/\/dx.doi.org\/10.1103\/physrevd.93.016005", + "relation":{ + "cites":[ + + ] + }, + "ISSN":[ + "2470-0010", + "2470-0029" + ], + "issn-type":[ + { + "value":"2470-0010", + "type":"print" + }, + { + "value":"2470-0029", + "type":"electronic" + } + ], + "article-number":"016005" + } +} diff --git a/tests/unit/test_crossref.py b/tests/unit/test_crossref.py index ce97bece..9c969e91 100644 --- a/tests/unit/test_crossref.py +++ b/tests/unit/test_crossref.py @@ -50,6 +50,33 @@ def _get_record_from_processed_item(item, spider): clean_dir() +@pytest.fixture +def record_with_unknown_type(): + """Return results generator from the crossref spider. All fields, one record. 
+ """ + def _get_record_from_processed_item(item, spider): + crawl_result = pipeline.process_item(item, spider) + validate(crawl_result['record'], 'hep') + assert crawl_result + return crawl_result['record'] + + crawler = Crawler(spidercls=crossref_spider.CrossrefSpider) + spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi') + fake_response = fake_response_from_file( + 'crossref/sample_crossref_record_with_unknown_type.json', + response_type=TextResponse, + ) + + parsed_items = spider.parse(fake_response) + + pipeline = InspireCeleryPushPipeline() + pipeline.open_spider(spider) + + yield _get_record_from_processed_item(parsed_items, spider) + + clean_dir() + + def test_titles(record): """Test extracting title.""" expected_titles = [{ @@ -97,6 +124,11 @@ def test_collections(record): assert record['document_type'] == ['article'] +def test_unknown_document_type(record_with_unknown_type): + """Test that an unknown document type falls back to article.""" + assert record_with_unknown_type['document_type'] == ['article'] + + def test_imprints(record): """Test extracting imprints.""" imprints = [{