Skip to content

Commit a8003e5

Browse files
committed
Adding script to add orcids to externalcontacts
This script uses Neotoma publications to pull in author ORCIDs from OpenAlex, using article DOIs where available. The new ORCIDs are then added to the table `ndb.externalcontacts` with `extdatabaseid` `7`. The script can be run using `uv run get_orcids.py`.
1 parent deb91fe commit a8003e5

File tree

5 files changed

+257
-0
lines changed

5 files changed

+257
-0
lines changed

Proposals/orcids/.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.12

Proposals/orcids/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Adding ORCIDs to Neotoma
2+
3+
Adding ORCIDs takes several steps. First, we need to create a table that links each record in `ndb.contacts` to that person's identifier in an external database (such as ORCID).
4+
5+
```sql
6+
CREATE TABLE ndb.externalcontacts (
7+
contactid INT references ndb.contacts (contactid),
8+
identifier text,
9+
extdatabaseid INT references ndb.externaldatabases (extdatabaseid),
10+
UNIQUE(contactid, identifier, extdatabaseid));
11+
```
12+
13+
Once this table is created, we can search for ORCIDs and insert them into the database, tying them to publications and users.
14+
15+
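The script `get_orcids.py` retrieves publications with DOIs from the Neotoma API, looks up each publication's authors in OpenAlex, and inserts any ORCIDs it finds into `ndb.externalcontacts`. Run it with `uv run get_orcids.py`.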

Proposals/orcids/get_orcids.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# Look up ORCIDs for the authors of Neotoma publications (via OpenAlex, using each publication's DOI) and store them in `ndb.externalcontacts`.
2+
import psycopg2
3+
import dotenv
4+
import os
5+
import json
6+
import requests
7+
import re
8+
from pyalex import config, Works, Authors, Topics
9+
10+
# pyalex client settings.
# NOTE(review): with max_retries set to 0 the backoff factor and retry status
# codes below presumably never come into play; raise max_retries if retrying
# on 429/500/503 is actually wanted.
config.max_retries = 0
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]
config.email = "goring@wisc.edu"

# Pull variables from a local .env file into the process environment.
dotenv.load_dotenv()

# DBAUTH is parsed as JSON and expanded into psycopg2.connect() keyword
# arguments. Fail with a clear message when it is absent instead of letting
# json.loads(None) raise an opaque TypeError.
_dbauth_raw = os.getenv('DBAUTH')
if _dbauth_raw is None:
    raise RuntimeError(
        "Environment variable DBAUTH is not set; it must contain a JSON "
        "object of database connection parameters."
    )
dbauth = json.loads(_dbauth_raw)

# Shared connection used by every query in this script.
conn = psycopg2.connect(**dbauth, connect_timeout=5)
20+
21+
def get_publications(limit: int = 100000) -> list:
    """Fetch publications from the Neotoma API and keep those with a DOI.

    Args:
        limit: Value sent as the ``limit`` query parameter of the request.

    Returns:
        A list of ``{'doi': ..., 'publicationid': ...}`` dicts, one for each
        returned record whose ``publication.doi`` is not null. The list is
        empty when the API does not answer with HTTP 200 or when the payload
        holds no results, so callers can always iterate over the result.
    """
    url = 'https://api.neotomadb.org/v2.0/data/publications'
    # (connect, read) timeout so a stalled server cannot hang the script
    # forever; the read timeout is generous because the response is large.
    response = requests.get(url, params={'limit': limit}, timeout=(10, 300))
    if response.status_code != 200:
        print(f"Neotoma publications request failed with HTTP {response.status_code}.")
        return []

    payload = json.loads(response.content)
    # Tolerate a missing/null 'data' or 'result' key instead of raising
    # AttributeError on None.
    records = (payload.get('data') or {}).get('result') or []

    doi_set = []
    for record in records:
        publication = record.get('publication') or {}
        doi = publication.get('doi')
        if doi is not None:
            doi_set.append({
                'doi': doi,
                'publicationid': publication.get('publicationid', ''),
            })
    return doi_set
36+
37+
def check_authors(doi: str):
    """Look up a work in OpenAlex by DOI.

    The DOI is normalised first: surrounding whitespace and an existing
    resolver/``doi:`` prefix are stripped, so a value that is already stored
    as a URL is not prefixed a second time.

    Args:
        doi: The DOI, either bare (``10.xxxx/...``) or with a ``doi:`` /
            ``http(s)://(dx.)doi.org/`` prefix.

    Returns:
        Whatever ``Works()[<doi url>]`` yields (presumably the OpenAlex work
        record; the caller reads its ``'authorships'`` entry).

    Raises:
        Any exception raised by the pyalex lookup is propagated unchanged.
    """
    known_prefixes = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    )
    bare_doi = doi.strip()
    lowered = bare_doi.lower()
    for prefix in known_prefixes:
        if lowered.startswith(prefix):
            bare_doi = bare_doi[len(prefix):].strip()
            break

    full_doi = "https://doi.org/" + bare_doi
    return Works()[full_doi]
41+
42+
def get_pub_auth(pubid) -> list:
    """Return the author rows of a publication that have no ORCID stored yet.

    The anti-join is restricted to ``extdatabaseid = 7`` (the value this
    script inserts ORCIDs under), so a contact that only has identifiers from
    other external databases is still returned for ORCID lookup.

    Args:
        pubid: Value bound to ``publicationid`` in the query.

    Returns:
        A list of row tuples with all columns of ``ndb.publicationauthors``
        (``pa.*``), in table column order.
    """
    query = """
        SELECT pa.*
        FROM ndb.publicationauthors AS pa
        LEFT JOIN ndb.externalcontacts AS exct
            ON exct.contactid = pa.contactid
           AND exct.extdatabaseid = 7
        WHERE pa.publicationid = %(pubid)s AND exct.contactid IS NULL;"""
    with conn.cursor() as cur:
        cur.execute(query, {'pubid': pubid})
        result = cur.fetchall()
    return result
52+
53+
def _first_matching_authorship(name_fragment, authorships):
    """Return the first authorship whose raw author name contains name_fragment, else None."""
    # An empty or non-text fragment would otherwise match every author (or
    # raise), so treat it as "no match".
    if not isinstance(name_fragment, str) or not name_fragment:
        return None
    for authorship in authorships:
        raw_name = authorship.get('raw_author_name')
        # Plain substring test: the stored name is data, not a regex pattern,
        # so characters such as '(', '.', or '+' must be matched literally.
        if isinstance(raw_name, str) and name_fragment in raw_name:
            return authorship
    return None


def align_pubs(doi_set: dict) -> list:
    """Match a publication's Neotoma authors against its OpenAlex authorships.

    Args:
        doi_set: Dict with ``'doi'`` and ``'publicationid'`` keys.

    Returns:
        A list with one ``{'contactid', 'name', 'orcid'}`` dict per author row
        returned by ``get_pub_auth``. ``name`` and ``orcid`` are ``None`` when
        no OpenAlex authorship matched (``orcid`` is also ``None`` when the
        matched author has none). An empty list is returned when there are no
        author rows to resolve or when the OpenAlex lookup fails.
    """
    pub_auth = get_pub_auth(doi_set['publicationid'])
    if not pub_auth:
        return []
    try:
        alex_res = check_authors(doi_set['doi'])
    except Exception:
        # Deliberate best-effort: a DOI that OpenAlex cannot resolve (or a
        # transient API failure) skips this publication rather than aborting
        # the whole run.
        return []

    authorships = alex_res.get('authorships') or []
    author_set = []
    for row in pub_auth:
        # Rows come from `SELECT pa.*`, so columns are addressed by position.
        # NOTE(review): index 3 is presumably the author's family name and
        # index 6 the contactid -- confirm against ndb.publicationauthors.
        paper_auth = {'contactid': row[6],
                      'name': None,
                      'orcid': None}
        match = _first_matching_authorship(row[3], authorships)
        if match is not None:
            paper_auth['name'] = match.get('raw_author_name')
            paper_auth['orcid'] = (match.get('author') or {}).get('orcid')
        author_set.append(paper_auth)
    return author_set
75+
76+
def insert_orcid(author, extdatabaseid: int = 7):
    """Insert one author's ORCID into ``ndb.externalcontacts``.

    Args:
        author: Mapping with ``'contactid'`` and ``'orcid'`` keys (other keys
            are ignored).
        extdatabaseid: Value written to the ``extdatabaseid`` column;
            defaults to 7, the value this script uses for ORCID.

    Returns:
        None. Nothing is written when the mapping has no contact id or no
        ORCID, or when the same (contactid, identifier, extdatabaseid) row
        already exists.

    Raises:
        Any database error from the insert is re-raised after the transaction
        has been rolled back.
    """
    contactid = author.get('contactid')
    orcid = author.get('orcid')
    # Never store a row with a NULL contact or identifier: the UNIQUE
    # constraint does not deduplicate NULLs, so such rows would pile up.
    if contactid is None or not orcid:
        return None

    query = """
        INSERT INTO ndb.externalcontacts(contactid, identifier, extdatabaseid)
        VALUES (%(contactid)s, %(orcid)s, %(extdatabaseid)s)
        ON CONFLICT (contactid, identifier, extdatabaseid) DO NOTHING;"""
    params = {
        'contactid': contactid,
        'orcid': orcid,
        'extdatabaseid': extdatabaseid,
    }
    try:
        with conn.cursor() as cur:
            cur.execute(query, params)
        conn.commit()
    except Exception:
        # Without a rollback the shared connection stays in an aborted
        # transaction and every later query on it fails.
        conn.rollback()
        raise
    return None
85+
86+
# Driver: fetch every publication with a DOI, resolve its authors through
# OpenAlex, and store each ORCID that was found.
# `or []` keeps the loop safe when the publication fetch yields None.
all_pubs = get_publications(100000) or []

# NOTE(review): never read anywhere in this script; presumably a leftover.
find_authors = []

try:
    for publication in all_pubs:
        for candidate in align_pubs(publication):
            if candidate['orcid']:
                print(f"Found ORCID for {candidate['name']} through OpenAlex.")
                insert_orcid(candidate)
finally:
    # Release the database connection whether or not the run completed.
    conn.close()
95+

Proposals/orcids/pyproject.toml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[project]
2+
name = "orcids"
3+
version = "0.1.0"
4+
description = "Add ORCIDs for Neotoma contacts to ndb.externalcontacts using OpenAlex"
5+
readme = "README.md"
6+
requires-python = ">=3.12"
7+
dependencies = [
8+
"psycopg2>=2.9.10",
9+
"pyalex>=0.15.1",
10+
"python-dotenv>=1.0.1",
11+
"requests>=2.32.3",
12+
]

Proposals/orcids/uv.lock

Lines changed: 134 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)