-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathtest_parsers_jats.py
138 lines (109 loc) · 3.42 KB
/
test_parsers_jats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016, 2017, 2019 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.
from __future__ import (
absolute_import,
division,
print_function,
unicode_literals,
)
import pytest
import yaml
import sys
from deepdiff import DeepDiff
from inspire_schemas.utils import validate
from hepcrawl.testlib.fixtures import get_test_suite_path
from hepcrawl.parsers.jats import JatsParser
def get_parsed_from_file(filename):
"""A dictionary holding the parsed elements of the record."""
path = get_test_suite_path('responses', 'aps', filename)
with open(path) as f:
aps_expected_dict = yaml.load(f)
return aps_expected_dict
def get_parser_by_file(filename):
"""A JatsParser instanciated on an APS article."""
path = get_test_suite_path('responses', 'aps', filename)
with open(path) as f:
aps_jats = f.read()
return JatsParser(aps_jats)
@pytest.fixture(scope='module', params=[
('PhysRevX.7.021022.xml', 'PhysRevX.7.021022_expected.yml'),
('PhysRevX.4.021018.xml', 'PhysRevX.4.021018_expected.yml'),
('PhysRevD.96.095036.xml', 'PhysRevD.96.095036_expected.yml'),
])
def records(request):
return {
'jats': get_parser_by_file(request.param[0]),
'expected': get_parsed_from_file(request.param[1]),
'file_name': request.param[0],
}
FIELDS_TO_CHECK = [
'abstract',
'copyright_holder',
'copyright_statement',
'copyright_year',
'document_type',
'license_url',
'license_statement',
'article_type',
'journal_title',
'material',
'publisher',
'year',
'authors',
'artid',
'title',
'number_of_pages',
'dois',
'references',
'journal_volume',
'journal_issue',
'is_conference_paper',
]
FIELDS_TO_CHECK_SEPARATELY = [
'publication_date',
'documents',
]
def test_data_completeness(records):
tested_fields = FIELDS_TO_CHECK + FIELDS_TO_CHECK_SEPARATELY
for field in records['expected'].keys():
assert field in tested_fields
@pytest.mark.parametrize(
'field_name',
FIELDS_TO_CHECK
)
def test_field(field_name, records):
result = getattr(records['jats'], field_name)
expected = records['expected'][field_name]
if field_name == 'authors':
diffs = DeepDiff(result, expected, ignore_order=True)
if sys.version_info[0] < 3 and 'type_changes' in diffs:
del diffs['type_changes']
assert diffs == {}
else:
assert result == expected
def test_publication_date(records):
result = records['jats'].publication_date.dumps()
expected = records['expected']['publication_date'].isoformat()
assert result == expected
@pytest.mark.skip(reason='No collaboration in input')
def test_collaborations(records):
result = records['jats'].collaborations
expected = records['expected']['collaborations']
assert result == expected
def test_parse(records):
record = records['jats'].parse()
assert validate(record, 'hep') == None
def test_attach_fulltext_document(records):
parser = records['jats']
parser.attach_fulltext_document(
records['file_name'],
'http://example.org/{}'.format(records['file_name'])
)
result = parser.parse()
assert result['documents'] == records['expected']['documents']