diff --git a/README.md b/README.md index 4f892b9..f10a197 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,6 @@ Springer Nature currently offers three APIs: - **Springer Meta API:** Advanced version offering versioned metadata. - **Springer OpenAccess API:** Metadata and, where available, full-text -**Note:** sprynger currently supports the Metadata and OpenAccess API - ## ⬇️ Install Download and install the package from PyPI: diff --git a/docs/source/classes/Meta.rst b/docs/source/classes/Meta.rst new file mode 100644 index 0000000..28a8954 --- /dev/null +++ b/docs/source/classes/Meta.rst @@ -0,0 +1,8 @@ +sprynger.Meta +================== + +.. automodule:: sprynger.meta + :members: + :undoc-members: + :show-inheritance: + :inherited-members: diff --git a/docs/source/index.rst b/docs/source/index.rst index 1a33af7..55b2f0e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -38,7 +38,6 @@ Springer Nature currently offers three APIs: - **Springer Meta API:** Advanced version offering versioned metadata. 
- **Springer OpenAccess API:** Metadata and, where available, full-text -**Note:** sprynger currently supports the Metadata and OpenAccess API ⬇️ Install ----------- @@ -137,6 +136,7 @@ OpenAccess :maxdepth: 1 classes/Metadata.rst + classes/Meta.rst classes/OpenAccess.rst diff --git a/pyproject.toml b/pyproject.toml index d24c802..0a2c6b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sprynger" -version = "0.0.2" +version = "0.0.3" authors = [ { name="Nils Herrmann", email="nils_herrmann@outlook.de" }, ] diff --git a/sprynger/__init__.py b/sprynger/__init__.py index df81d5e..c4b23e4 100644 --- a/sprynger/__init__.py +++ b/sprynger/__init__.py @@ -1,4 +1,5 @@ """Import all the necessary classes and functions for the user to use the package.""" +from sprynger.meta import Meta from sprynger.metadata import Metadata from sprynger.openaccess import OpenAccess from sprynger.utils.startup import init diff --git a/sprynger/base.py b/sprynger/base.py index a97b9d9..b32eb12 100644 --- a/sprynger/base.py +++ b/sprynger/base.py @@ -121,10 +121,10 @@ def _fetch_or_load(self) -> Union[Response, MockResponse]: def _get_total_results(self): """Get the total number of results for the query.""" - if self._api == 'Metadata': + if (self._api == 'Metadata') or (self._api == 'Meta'): res_json = _to_json(self._res) total = res_json['result'][0]['total'] - elif self._api in ['OpenAccess', 'OpenAccessJournal', 'OpenAccessBook']: + elif self._api in ['OpenAccess']: res_xml = _to_xml(self._res) total = res_xml.find('./result/total').text else: diff --git a/sprynger/meta.py b/sprynger/meta.py new file mode 100644 index 0000000..ebeaf57 --- /dev/null +++ b/sprynger/meta.py @@ -0,0 +1,123 @@ +""" +Module with Meta class. 
+""" +from typing import Union + +from sprynger.metadata import Metadata +from sprynger.utils.data_structures import MetadataCreator, MetaDiscipline, MetaRecord, MetaURL +from sprynger.utils.parse import make_int_if_possible, str_to_bool + + +class Meta(Metadata): + """Class to retreive the metadata of a document from the Springer Meta v2 API.""" + @property + def records(self) -> list[MetaRecord]: + """Contains the individual records that matched the query. + + Returns: + list[MetadataRecord]: List of MetadataRecord objects which contain the following + items of a document: `contentType`, + `identifier`, `language`, `urls`, `title`, `creators`, + `publicationName`, `openaccess`, `doi`, `publisher`, `publicationDate`, + `publicationType`, `issn`, `eIssn`, `volume`, `number`, `issueType`, + `topicalCollection`, `genre`, `startingPage`, + `endingPage`, `journalId`, `onlineDate`, + `copyright`, `abstract`, `conferenceInfo`, + `keyword`, `subjects` and `disciplines`. + """ + records_list = [] + for record in self.json.get('records', []): + # Parse the URLs + url_list = [] + for url in record.get('url', []): + url_format = url.get('format') + platform = url.get('platform') + value = url.get('value') + url_list.append(MetaURL(format=url_format, platform=platform, value=value)) + # Parse the creators + creators = [] + for ceator in record.get('creators', []): + creators.append(MetadataCreator(creator=ceator.get('creator'), + ORCID=ceator.get('ORCID'))) + # Parse the disciplines + disciplines = [] + for discipline in record.get('disciplines', []): + disciplines.append( + MetaDiscipline(id=discipline.get('id'), + term=discipline.get('term')) + ) + + records_list.append( + MetaRecord( + contentType=record.get('contentType'), + identifier=record.get('identifier'), + language=record.get('language'), + urls=url_list, + title=record.get('title'), + creators=creators, + publicationName=record.get('publicationName'), + openaccess=str_to_bool(record.get('openaccess')), + 
doi=record.get('doi'), + publisher=record.get('publisher'), + publicationDate=record.get('publicationDate'), + publicationType=record.get('publicationType'), + issn=record.get('issn'), + eIssn=record.get('eIssn'), + volume=make_int_if_possible(record.get('volume')), + number=make_int_if_possible(record.get('number')), + issueType=record.get('issueType'), + topicalCollection=record.get('topicalCollection'), + genre=record.get('genre'), + startingPage=make_int_if_possible(record.get('startingPage')), + endingPage=make_int_if_possible(record.get('endingPage')), + journalId=make_int_if_possible(record.get('journalId')), + onlineDate=record.get('onlineDate'), + copyright=record.get('copyright'), + abstract=record.get('abstract'), + conferenceInfo = record.get('conferenceInfo'), + keyword = record.get('keyword'), + subjects=record.get('subjects'), + disciplines=disciplines + ) + ) + return records_list + + def __init__(self, + query: str = '', + start: int = 1, + nr_results: int = 10, + premium: bool = False, + cache: bool = True, + refresh: Union[bool, int] = False, + **kwargs): + """ + Args: + query (str): The query to search for. + start (int): The starting index for the results. Defaults to 1. + nr_results (int): The number of results to retrieve. Defaults to 10. + premium (bool): Whether the user has a premium account. Defaults to False. + cache (bool): Whether to cache the results. Defaults to True. + refresh (bool|int): Weather to refresh the cache. If an integer is provided, + it will be used as the cache expiration time in days. Defaults to False. + kwargs: Additional fields for query. + + This class is iterable, allowing you to iterate over the metadata `records` retrieved. + It also supports indexing to access the metadata of specific documents. 
+ + Example: + >>> meta = Meta('Segmentation', issn='1573-7497', datefrom='2024-01-01') + >>> for record in metadata: + >>> print(record) + + Note: + - All properties can be converted to a pandas DataFrame with `pd.DataFrame(object.property)`. + """ + super().__init__(query=query, + start=start, + nr_results=nr_results, + premium=premium, + cache=cache, + refresh=refresh, + **kwargs) + self._nr_results = nr_results + self._records = self.records diff --git a/sprynger/metadata.py b/sprynger/metadata.py index 72f68f2..7334696 100644 --- a/sprynger/metadata.py +++ b/sprynger/metadata.py @@ -130,8 +130,9 @@ def __init__(self, Note: - All properties can be converted to a pandas DataFrame with `pd.DataFrame(object.property)`. """ + api = self.__class__.__name__ super().__init__(query=query, - api='Metadata', + api=api, start=start, nr_results=nr_results, premium=premium, diff --git a/sprynger/openaccess.py b/sprynger/openaccess.py index 957e523..8284561 100644 --- a/sprynger/openaccess.py +++ b/sprynger/openaccess.py @@ -13,17 +13,87 @@ from typing import Optional, Union from sprynger.retrieve import Retrieve -from sprynger.utils.data_structures import Paragraph +from sprynger.utils.data_structures import Affiliation, Contributor, Date, Paragraph from sprynger.utils.parse import get_attr, get_text, make_int_if_possible -from sprynger.utils.parse_openaccess import get_paragraphs +from sprynger.utils.parse_openaccess import ( + affs_to_dict, + get_contributors, + get_affiliations, + get_date, + get_paragraphs, +) + class Article: """Auxiliary class to parse an article from a journal.""" + @property + def affiliations(self) -> list[Affiliation]: + """List of affiliations of the collaborators of the article. Each affiliation is represented + as a named tuple with the following fields: + `type`, `ref_nr`, `ror`, `grid`, `isni`, `division`, `name`, `city`, `country`. 
+ + Note: To match affiliations with contributors use the affiliation's `ref_nr` and the + contributor's `affiliations_ref_nr`. + """ + return get_affiliations(self._data) + + @property + def affiliations_dict(self) -> dict[str, Affiliation]: + """Auxiliary property to query the affiliations by their reference number.""" + return affs_to_dict(self.affiliations) + @property def article_type(self) -> Optional[str]: """Type of the article.""" return self._data.get('article-type') + @property + def contributors(self) -> list[Contributor]: + """List of contributors of the article. Each contributor is represented as a named tuple + with the following fields: + `type`, `nr`, `orcid`, `surname`, `given_name`, `email`, `affiliations_ref_nr`. + + Note: To match contributors with affiliations use the contributor's `affiliations_ref_nr` + and the affiliation's `ref_nr`. + """ + return get_contributors(self._article_meta) + + @property + def date_epub(self) -> Date: + """Electronic publication date of the article.""" + date_node = self._article_meta.find('.//pub-date[@publication-format="electronic"]') + return get_date(date_node) + + @property + def date_ppub(self) -> Date: + """Print publication date of the article.""" + date_node = self._article_meta.find('.//pub-date[@publication-format="print"]') + return get_date(date_node) + + @property + def date_registration(self) -> Date: + """Registration date of the article.""" + date_node = self._article_meta.find('.//history/date[@date-type="registration"]') + return get_date(date_node) + + @property + def date_received(self) -> Date: + """Date when article was received.""" + date_node = self._article_meta.find('.//history/date[@date-type="received"]') + return get_date(date_node) + + @property + def date_accepted(self) -> Date: + """Accepted date of the article.""" + date_node = self._article_meta.find('.//history/date[@date-type="accepted"]') + return get_date(date_node) + + @property + def date_online(self) -> Date: + """Online 
date of the article.""" + date_node = self._article_meta.find('.//history/date[@date-type="online"]') + return get_date(date_node) + @property def doi(self) -> Optional[str]: """DOI of the article.""" @@ -108,9 +178,35 @@ def __repr__(self) -> str: return f'Article {self.doi}' - class Chapter: """Auxiliary class to parse a chapter from a book.""" + @property + def affiliations(self) -> list[Affiliation]: + """List of affiliations of the collaborators of the chapter. Each affiliation is represented + as a named tuple with the following fields: + `type`, `ref_nr`, `ror`, `grid`, `isni`, `division`, `name`, `city`, `country`. + + Note: To match affiliations with contributors use the affiliation's `ref_nr` and the + contributor's `affiliations_ref_nr`. + """ + return get_affiliations(self._data) + + @property + def affiliations_dict(self) -> dict[str, Affiliation]: + """Auxiliary property to query the affiliations by their reference number.""" + return affs_to_dict(self.affiliations) + + @property + def contributors(self) -> list[Contributor]: + """List of contributors of the chapter. Each contributor is represented as a named tuple + with the following fields: + `type`, `nr`, `orcid`, `surname`, `given_name`, `email`, `affiliations_ref_nr`. + + Note: To match contributors with affiliations use the contributor's `affiliations_ref_nr` + and the affiliation's `ref_nr`. 
+ """ + return get_contributors(self._data) + @property def book_doi(self) -> Optional[str]: """DOI of the book.""" @@ -142,6 +238,30 @@ def chapter_nr(self) -> Optional[Union[int, str]]: chapter_nr = get_attr(self._chapter_meta, 'book-part-id', 'book-part-id-type', 'chapter') return make_int_if_possible(chapter_nr) + @property + def date_epub(self) -> Date: + """Electronic publication date of the chapter.""" + date_node = self._chapter_meta.find('.//pub-date[@publication-format="electronic"]') + return get_date(date_node) + + @property + def date_ppub(self) -> Date: + """Print publication date of the chapter.""" + date_node = self._chapter_meta.find('.//pub-date[@publication-format="print"]') + return get_date(date_node) + + @property + def date_registration(self) -> Date: + """Registration date of the chapter.""" + date_node = self._chapter_meta.find('.//pub-history/date[@date-type="registration"]') + return get_date(date_node) + + @property + def date_online(self) -> Date: + """Online date of the chapter.""" + date_node = self._chapter_meta.find('.//pub-history/date[@date-type="online"]') + return get_date(date_node) + @property def doi(self) -> Optional[str]: """DOI of the chapter.""" diff --git a/sprynger/tests/test_meta.py b/sprynger/tests/test_meta.py new file mode 100644 index 0000000..102f922 --- /dev/null +++ b/sprynger/tests/test_meta.py @@ -0,0 +1,87 @@ +"""Tests for the Meta class.""" +from sprynger import init +from sprynger import Meta +from sprynger.utils.data_structures import ( + MetadataCreator, + MetaDiscipline, + MetadataFacets, + MetaRecord, + MetaURL, +) + +init() + +article = Meta(doi='10.1007/s00394-024-03496-7') + +def test_results(): + """Test the results.""" + assert article.results.total == 1 + assert article.results.start == 1 + assert article.results.pageLength == 10 + assert article.results.recordsRetrieved == 1 + +def test_records(): + """Test the records.""" + expected_record = MetaRecord( + contentType='Article', + 
identifier='doi:10.1007/s00394-024-03496-7', + language='en', + urls=[ + MetaURL( + format='html', + platform='web', + value='http://link.springer.com/openurl/fulltext?id=doi:10.1007/s00394-024-03496-7', + ), + MetaURL( + format='pdf', + platform='web', + value='http://link.springer.com/openurl/pdf?id=doi:10.1007/s00394-024-03496-7', + ), + MetaURL( + format='', + platform='', + value='http://dx.doi.org/10.1007/s00394-024-03496-7', + ), + ], + title='Ultra-processed food intake in toddlerhood and mid-childhood in the UK: cross sectional and longitudinal perspectives', + creators=[ + MetadataCreator(creator='Conway, Rana E.', ORCID='0000-0003-0955-7107'), + MetadataCreator(creator='Heuchan, Gabriella N.', ORCID=None), + MetadataCreator(creator='Heggie, Lisa', ORCID='0000-0002-4846-2357'), + MetadataCreator(creator='Rauber, Fernanda', ORCID='0000-0001-9693-7954'), + MetadataCreator(creator='Lowry, Natalie', ORCID='0000-0002-9137-5005'), + MetadataCreator(creator='Hallen, Hannah', ORCID=None), + MetadataCreator(creator='Llewellyn, Clare H.', ORCID='0000-0002-0066-2827'), + ], + publicationName='European Journal of Nutrition', + openaccess=True, + doi='10.1007/s00394-024-03496-7', + publisher='Springer', + publicationDate='2024-10-04', + publicationType='Journal', + issn='1436-6207', + eIssn='1436-6215', + volume='', + number='', + issueType='', + topicalCollection='', + genre=['OriginalPaper', 'Original Contribution'], + startingPage=1, + endingPage=12, + journalId=394, + onlineDate='2024-10-04', + copyright='©2024 The Author(s)', + abstract='Purpose (i) Characterize ultra-processed food (UPF) intakes in toddlerhood and mid-childhood, including identifying principal UPF sub-groups and associations with nutrient profile; (ii) explore stability and change in UPF intake between toddlerhood and mid-childhood. Methods Data were from children in the UK Gemini twin cohort at 21 months ( n \u2009=\u20092,591) and 7 years ( n \u2009=\u2009592) of age. 
UPF intakes were estimated using diet diaries and Nova classification. Complex samples general linear or logistic regression models were used to explore associations between UPF intake, UPF sub-groups and nutrients, and changes in intake over time. Results The contribution of UPF to total energy was 46.9% (±\u200914.7) at 21 months and 59.4% (±\u200912.5) at 7 years. Principal UPF sub-groups were yogurts, higher-fiber breakfast cereals, and wholegrain breads in toddlerhood, and puddings and sweet cereal products and white breads in mid-childhood. At both ages, mean free sugar and sodium intakes exceeded recommended maximums and higher UPF consumption was associated with consuming more of each nutrient ( P \u2009<\u20090.001). UPF intake was negatively associated with fat, saturated fat and protein intake in toddlerhood, and fiber intake in mid-childhood ( P \u2009<\u20090.001). Being in the highest UPF intake quintile in toddlerhood was predictive of being in the highest quintile in mid-childhood (OR 9.40, 95%CI 3.94–22.46). Conclusions UPF accounted for nearly half of toddlers’ energy, increasing to 59% in mid-childhood. Higher UPF consumers had higher intakes of free sugar and sodium. UPF intake in toddlerhood was predictive of mid-childhood intake. 
Effective policies are needed to reduce UPF intakes in the early years.', + conferenceInfo=[], + keyword=['Ultra-processed foods', 'Diet quality', 'Toddlers', 'Children', 'UK'], + subjects=['Chemistry', 'Nutrition'], + disciplines=[MetaDiscipline(id='3524', term='Nutrition')], + ) + assert article.records[0] == expected_record + +def test_facets(): + """Test the facets.""" + assert len(article.facets) == 12 + + expected_first_facet = MetadataFacets(facet='subject', value='Chemistry', count='1') + assert article.facets[0] == expected_first_facet diff --git a/sprynger/tests/test_metadata.py b/sprynger/tests/test_metadata.py index 9c83620..6209228 100644 --- a/sprynger/tests/test_metadata.py +++ b/sprynger/tests/test_metadata.py @@ -1,6 +1,4 @@ -"""Tests for the metadata module.""" -import pytest - +"""Tests for the Meta class.""" from sprynger import init from sprynger import Metadata from sprynger.utils.data_structures import (MetadataCreator, diff --git a/sprynger/tests/test_openaccess.py b/sprynger/tests/test_openaccess.py index 94aa61c..b81bf63 100644 --- a/sprynger/tests/test_openaccess.py +++ b/sprynger/tests/test_openaccess.py @@ -1,9 +1,7 @@ """Tests for the OpenAccess class.""" -import pytest - from sprynger import init, OpenAccess from sprynger.openaccess import Article, Chapter -from sprynger.utils.data_structures import Paragraph +from sprynger.utils.data_structures import Affiliation, Contributor, Date, Paragraph init() @@ -17,6 +15,47 @@ article = OpenAccess(doi="10.1007/s40747-024-01577-y", refresh=30) +def test_article_affiliations(): + """Test the affiliations of the article.""" + expected = [ + Affiliation(type=None, ref_nr='Aff1', ror='https://ror.org/04z7qrj66', grid='grid.412518.b', isni='0000 0001 0008 0619', division='College of Information Engineering', name='Shanghai Maritime University', city='Shanghai', country='China'), + Affiliation(type=None, ref_nr='Aff2', ror='https://ror.org/05k2j8e48', grid='grid.495244.a', isni='0000 0004 1761 
5722', division='College of Artificial Intelligence', name='Jiangxi University of Technology', city='Jiangxi', country='China'), + Affiliation(type=None, ref_nr='Aff3', ror='https://ror.org/04z7qrj66', grid='grid.412518.b', isni='0000 0001 0008 0619', division='College of Merchant Marine', name='Shanghai Maritime University', city='Shanghai', country='China') + ] + for a in article: + assert a.affiliations == expected + + +def test_article_contributors(): + """Test the contributors of the article.""" + expected = [ + Contributor(type='author', nr='Au1', orcid='http://orcid.org/0000-0002-1606-3511', surname='Hu', given_name='Zhanhui', email=None, affiliations_ref_nr=['Aff1']), + Contributor(type='author', nr='Au2', orcid=None, surname='Liu', given_name='Guangzhong', email='gzhliu@shmtu.edu.cn', affiliations_ref_nr=['Aff1']), + Contributor(type='author', nr='Au3', orcid=None, surname='Li', given_name='Yanping', email=None, affiliations_ref_nr=['Aff2']), + Contributor(type='author', nr='Au4', orcid=None, surname='Zhuang', given_name='Siqing', email=None, affiliations_ref_nr=['Aff3']) + ] + for a in article: + assert a.contributors == expected + + +def test_article_dates(): + """Test the dates of the article.""" + expected_date_accepted = Date(year=2024, month=7, day=21) + expected_date_epub = Date(year=2024, month=9, day=9) + expected_date_online = Date(year=2024, month=9, day=9) + expected_date_ppub = Date(year=None, month=None, day=None) + expected_date_received = Date(year=2024, month=4, day=12) + expected_date_registration = Date(year=2024, month=7, day=24) + + for a in article: + assert a.date_accepted == expected_date_accepted + assert a.date_epub == expected_date_epub + assert a.date_online == expected_date_online + assert a.date_ppub == expected_date_ppub + assert a.date_received == expected_date_received + assert a.date_registration == expected_date_registration + + def test_article_meta(): """Test the article meta-data.""" for a in article: @@ -43,6 +82,48 
@@ def test_book_meta(): assert chapter.publisher_loc == "Cham" +def test_chapter_affiliations(): + """Test the affiliations of the chapter.""" + expected_affiliations = [ + Affiliation(type='book author', ref_nr='Aff1', ror='https://ror.org/00d7xrm67', grid='grid.410413.3', isni='0000 0001 2294 748X', division='Institute of Software Technology', name='Graz University of Technology', city='Graz', country='Austria'), + Affiliation(type='book author', ref_nr='Aff2', ror=None, grid='grid.426094.d', isni='0000 0004 0625 6437', division='Corporate Technology', name='Siemens (Austria)', city='Wien', country='Austria'), + Affiliation(type='book author', ref_nr='Aff3', ror='https://ror.org/03yxnpp24', grid='grid.9224.d', isni='0000 0001 2168 1229', division='ETS de Ingeniería Informática', name='University of Seville', city='Sevilla', country='Spain'), + Affiliation(type=None, ref_nr='Aff4', ror='https://ror.org/00d7xrm67', grid='grid.410413.3', isni='0000 0001 2294 748X', division='Institute of Software Technology', name='Graz University of Technology', city='Graz', country='Austria'), + Affiliation(type=None, ref_nr='Aff5', ror=None, grid='grid.426094.d', isni='0000 0004 0625 6437', division='Corporate Technology', name='Siemens (Austria)', city='Wien', country='Austria'), + Affiliation(type=None, ref_nr='Aff6', ror='https://ror.org/03yxnpp24', grid='grid.9224.d', isni='0000 0001 2168 1229', division='ETS de Ingeniería Informática', name='University of Seville', city='Sevilla', country='Spain') + ] + for one_chapter in chapter: + assert one_chapter.affiliations == expected_affiliations + + +def test_chapter_contributors(): + """Test the contributors of the chapter.""" + expected_contributors = [ + Contributor(type='author', nr=None, orcid=None, surname='Felfernig', given_name='Alexander', email='alexander.felfernig@ist.tugraz.at', affiliations_ref_nr=['Aff1']), + Contributor(type='author', nr=None, orcid=None, surname='Falkner', given_name='Andreas', 
email='andreas.a.falkner@siemens.com', affiliations_ref_nr=['Aff2']), + Contributor(type='author', nr=None, orcid=None, surname='Benavides', given_name='David', email='benavides@us.es', affiliations_ref_nr=['Aff3']), + Contributor(type='author', nr=None, orcid=None, surname='Felfernig', given_name='Alexander', email=None, affiliations_ref_nr=['Aff4']), + Contributor(type='author', nr=None, orcid=None, surname='Falkner', given_name='Andreas', email=None, affiliations_ref_nr=['Aff5']), + Contributor(type='author', nr=None, orcid=None, surname='Benavides', given_name='David', email=None, affiliations_ref_nr=['Aff6']) + ] + for one_chapter in chapter: + assert one_chapter.contributors == expected_contributors + + +def test_chapter_dates(): + """Test the dates of the chapter.""" + expected_date_epub = Date(year=2024, month=6, day=30) + expected_date_online = Date(year=2024, month=6, day=30) + expected_date_ppub = Date(year=None, month=None, day=None) + expected_date_registration = Date(year=2024, month=5, day=27) + + for one_chapter in chapter: + assert one_chapter.date_epub == expected_date_epub + assert one_chapter.date_online == expected_date_online + assert one_chapter.date_ppub == expected_date_ppub + assert one_chapter.date_registration == expected_date_registration + + def test_chapter_meta(): """Test the chapter meta data.""" for one_chapter in chapter: diff --git a/sprynger/utils/data_structures.py b/sprynger/utils/data_structures.py index b5eb310..e2c666b 100644 --- a/sprynger/utils/data_structures.py +++ b/sprynger/utils/data_structures.py @@ -32,5 +32,38 @@ def create_namedtuple(name: str, fields: list, defaults=None): ############################# # Open Access Paragraph -fields_openaccess_paragraphs = ['paragraph_id', 'section_id', 'section_title', 'text'] -Paragraph = create_namedtuple('Paragraph', fields_openaccess_paragraphs) +fields_oa_paragraphs = ['paragraph_id', 'section_id', 'section_title', 'text'] +Paragraph = create_namedtuple('Paragraph', 
fields_oa_paragraphs) + +fields_oa_contributor = ['type', 'nr', 'orcid', 'surname', 'given_name', 'email', 'affiliations_ref_nr'] +Contributor = create_namedtuple('Contributor', fields_oa_contributor) + +fields_oa_aff = ['type', 'ref_nr', 'ror', 'grid', 'isni', 'division', 'name', 'city', 'country'] +Affiliation = create_namedtuple('Affiliation', fields_oa_aff) + +fields_date = ['year', 'month', 'day'] +Date = create_namedtuple('Date', fields_date) + +############################# +# Meta # +############################# + +fields_meta_url = ['format', 'platform', 'value'] +MetaURL = create_namedtuple('MetaURL', fields_meta_url) + +fields_meta_discipline = ['id', 'term'] +MetaDiscipline = create_namedtuple('MetaDiscipline', fields_meta_discipline) + +fields_meta_record = [ + 'contentType', 'identifier', 'language', + 'urls', 'title', 'creators', + 'publicationName', 'openaccess', 'doi', + 'publisher', 'publicationDate', 'publicationType', + 'issn', 'eIssn', 'volume', + 'number', 'issueType', 'topicalCollection', + 'genre', 'startingPage', 'endingPage', + 'journalId', 'onlineDate', 'copyright', + 'abstract', 'conferenceInfo', 'keyword', + 'subjects', 'disciplines' +] +MetaRecord = create_namedtuple('MetaRecord', fields_meta_record) diff --git a/sprynger/utils/parse.py b/sprynger/utils/parse.py index 7dc93c8..146cc73 100644 --- a/sprynger/utils/parse.py +++ b/sprynger/utils/parse.py @@ -16,11 +16,12 @@ def get_attr(node: Optional[_Element], return None -def get_text(node: _Element, +def get_text(node: Optional[_Element], path: str) -> Optional[str]: """Get the text of an XML node.""" - if node.find(path) is not None: - return node.find(path).text + if node is not None: + if node.find(path) is not None: + return node.find(path).text return None diff --git a/sprynger/utils/parse_openaccess.py b/sprynger/utils/parse_openaccess.py index 080a193..6d5258e 100644 --- a/sprynger/utils/parse_openaccess.py +++ b/sprynger/utils/parse_openaccess.py @@ -1,6 +1,67 @@ """Module 
with auxiliary functions to parse OpenAccess documents.""" -from sprynger.utils.data_structures import Paragraph -from sprynger.utils.parse import get_text +from lxml.etree import _Element + +from sprynger.utils.data_structures import Affiliation, Contributor, Date, Paragraph +from sprynger.utils.parse import get_attr, get_text, make_int_if_possible + + +def affs_to_dict(affs) -> dict[str, Affiliation]: + """Auxiliary function to query the affiliations by their number.""" + return {aff.ref_nr: aff for aff in affs} + + +def get_affiliations(data: _Element) -> list[Affiliation]: + """Parse the affiliations of the document.""" + affiliations = [] + for contrib_group in data.findall('.//contrib-group'): + contribution_group = contrib_group.get('content-type') + for a in contrib_group.findall('.//aff'): + institution = a.find('.//institution-wrap') + new_aff = Affiliation( + type=contribution_group, + ref_nr=a.get('id'), + ror=get_attr(institution, 'institution-id', 'institution-id-type', 'ROR'), + grid=get_attr(institution, 'institution-id', 'institution-id-type', 'GRID'), + isni=get_attr(institution, 'institution-id', 'institution-id-type', 'ISNI'), + division=get_attr(institution, 'institution', 'content-type', 'org-division'), + name=get_attr(institution, 'institution', 'content-type', 'org-name'), + city=get_attr(a, 'addr-line', 'content-type', 'city'), + country=get_text(a, './/country') + ) + affiliations.append(new_aff) + return affiliations + + +def get_contributors(data: _Element) -> list[Contributor]: + """Parse the contributors of the document and match them with their affiliations.""" + contributors = [] + for c in data.findall('.//contrib'): + # Get affiliation + affs_nr = [] + for aff_ref in c.findall('.//xref[@ref-type="aff"]'): + aff_nr = aff_ref.get('rid') + affs_nr.append(aff_nr) + + # Get contributor data + new_contrib = Contributor( + type=c.get('contrib-type'), + nr=c.get('id'), + orcid=get_attr(c, 'contrib-id', 'contrib-id-type', 'orcid'), + 
surname=get_text(c, './/name/surname'), + given_name=get_text(c, './/name/given-names'), + email=get_text(c, './/email'), + affiliations_ref_nr=affs_nr + ) + contributors.append(new_contrib) + return contributors + + +def get_date(date_node: _Element) -> Date: + """Auxiliary function to extract date information from a date node.""" + return Date(day=make_int_if_possible(get_text(date_node, './/day')), + month=make_int_if_possible(get_text(date_node, './/month')), + year=make_int_if_possible(get_text(date_node, './/year'))) + def get_paragraphs(xml) -> list[Paragraph]: """Paragraphs of the OpenAccess document.