|
| 1 | +# For each Neotoma publication that has a DOI, look up its authors in OpenAlex and record any ORCIDs found in `ndb.externalcontacts`. |
| 2 | +import psycopg2 |
| 3 | +import dotenv |
| 4 | +import os |
| 5 | +import json |
| 6 | +import requests |
| 7 | +import re |
| 8 | +from pyalex import config, Works, Authors, Topics |
| 9 | + |
# pyalex client settings for OpenAlex requests.
# NOTE(review): with max_retries set to 0 the backoff factor and retry codes
# presumably have no effect until max_retries is raised -- confirm intent.
config.max_retries = 0
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]
config.email = "goring@wisc.edu"

# Load environment variables (DBAUTH is read below) via python-dotenv.
dotenv.load_dotenv()

# DBAUTH must hold a JSON object; its keys are passed unchanged to
# psycopg2.connect() as keyword arguments. Fail with a clear message when it
# is absent instead of the opaque TypeError that json.loads(None) raises.
_dbauth_raw = os.getenv('DBAUTH')
if _dbauth_raw is None:
    raise RuntimeError(
        "The DBAUTH environment variable is not set; it must contain a JSON "
        "object of psycopg2 connection parameters."
    )
dbauth = json.loads(_dbauth_raw)

# One shared connection used by every database helper in this script.
conn = psycopg2.connect(**dbauth, connect_timeout=5)
| 20 | + |
def get_publications(limit: int = 100000, timeout: float = 120.0) -> "list | None":
    """Fetch the publications that carry a DOI from the Neotoma API.

    Args:
        limit: Maximum number of publication records to request.
        timeout: Seconds to wait for the API before giving up, so that a
            stalled connection cannot hang the script indefinitely.

    Returns:
        A list of ``{'doi': ..., 'publicationid': ...}`` dicts, one for each
        publication with a non-blank DOI, or ``None`` when the API answers
        with anything other than HTTP 200.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://api.neotomadb.org/v2.0/data/publications'
    response = requests.get(url, params={'limit': limit}, timeout=timeout)
    if response.status_code != 200:
        return None
    payload = response.json()
    # Tolerate a missing or null 'data'/'result' level rather than raising
    # AttributeError on the chained lookups.
    records = (payload.get('data') or {}).get('result') or []
    doi_set = []
    for record in records:
        publication = record.get('publication') or {}
        doi = publication.get('doi')
        # Skip missing *and* blank DOIs: an empty string is not None, but it
        # cannot be resolved downstream either.
        if isinstance(doi, str) and doi.strip():
            doi_set.append({
                'doi': doi.strip(),
                'publicationid': publication.get('publicationid', ''),
            })
    return doi_set
| 36 | + |
def check_authors(doi: str):
    """Look up a work in OpenAlex by its DOI.

    Args:
        doi: The DOI, either bare (``10.xxxx/...``) or already carrying a
            ``doi:`` or ``http(s)://(dx.)doi.org/`` prefix.

    Returns:
        Whatever ``Works()[...]`` returns for the DOI URL (the OpenAlex work
        record).

    Raises:
        Any exception raised by the pyalex lookup, e.g. when the DOI is not
        known to OpenAlex.
    """
    # Strip an existing resolver/scheme prefix so the URL built below never
    # becomes "https://doi.org/https://doi.org/...".
    bare_doi = re.sub(
        r'^(?:https?://(?:dx\.)?doi\.org/|doi:)\s*',
        '',
        doi.strip(),
        flags=re.IGNORECASE,
    )
    full_doi = "https://doi.org/" + bare_doi
    return Works()[full_doi]
| 41 | + |
def get_pub_auth(pubid) -> list:
    """Return a publication's author rows whose contact has no external ID.

    Selects every column of ``ndb.publicationauthors`` for ``pubid`` and keeps
    only the rows for which ``ndb.externalcontacts`` holds no entry with the
    same ``contactid`` (LEFT JOIN ... IS NULL anti-join).

    Args:
        pubid: The publication identifier to filter on.

    Returns:
        The rows returned by ``cursor.fetchall()``.
    """
    sql = """
    SELECT pa.*
    FROM ndb.publicationauthors AS pa
    LEFT JOIN ndb.externalcontacts AS exct ON exct.contactid = pa.contactid
    WHERE publicationid = %(pubid)s AND exct.contactid IS NULL;"""
    params = {'pubid': pubid}
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        # Rows are fully fetched before the cursor is closed on exit.
        return cursor.fetchall()
| 52 | + |
def align_pubs(doi_set: dict) -> list:
    """Match a publication's database authors against OpenAlex authorships.

    Args:
        doi_set: A dict with the keys ``'publicationid'`` and ``'doi'``.

    Returns:
        A list with one ``{'contactid', 'name', 'orcid'}`` dict per author row
        returned by ``get_pub_auth``. ``name`` and ``orcid`` stay ``None``
        when no OpenAlex authorship matches. The list is empty when the
        publication has no eligible authors or the OpenAlex lookup fails.
    """
    pub_auth = get_pub_auth(doi_set['publicationid'])
    if not pub_auth:
        return []
    try:
        alex_res = check_authors(doi_set['doi'])
    except Exception:
        # Deliberate best effort: a DOI OpenAlex cannot resolve (or any
        # transport error) simply yields no matches for this publication.
        return []
    authorships = alex_res.get('authorships') or []
    author_set = []
    for row in pub_auth:
        # Rows come from `SELECT pa.*`, so columns are addressed by position.
        # NOTE(review): index 3 is presumably the author's family name and
        # index 6 the contact id -- confirm against ndb.publicationauthors.
        name_key = row[3]
        paper_auth = {'contactid': row[6],
                      'name': None,
                      'orcid': None}
        # Use a literal substring test: the database value is a name, not a
        # regular expression, so metacharacters must not be interpreted. An
        # empty or missing value is skipped because it would match everyone.
        if isinstance(name_key, str) and name_key:
            # NOTE(review): the first authorship containing the name wins;
            # short or shared names can therefore match the wrong author.
            match = next(
                (a for a in authorships
                 if name_key in (a.get('raw_author_name') or '')),
                None,
            )
            if match is not None:
                paper_auth['name'] = match.get('raw_author_name')
                paper_auth['orcid'] = (match.get('author') or {}).get('orcid')
        author_set.append(paper_auth)
    return author_set
| 75 | + |
def insert_orcid(author):
    """Record an author's ORCID in ``ndb.externalcontacts``.

    Args:
        author: A mapping with at least the keys ``'contactid'`` and
            ``'orcid'``; further keys are ignored by the query.

    Returns:
        None. Nothing is written when ``author['orcid']`` is empty, and an
        existing (contactid, identifier, extdatabaseid) row is left untouched.

    Raises:
        KeyError: If ``author`` has no ``'orcid'`` key.
        psycopg2.Error: If the insert fails; the transaction is rolled back
            first so the shared connection stays usable.
    """
    if not author['orcid']:
        return None
    # NOTE(review): extdatabaseid 7 is presumably the ORCID entry of the
    # external-databases lookup table -- confirm.
    query = """
    INSERT INTO ndb.externalcontacts(contactid, identifier, extdatabaseid)
    VALUES (%(contactid)s, %(orcid)s, 7)
    ON CONFLICT (contactid, identifier, extdatabaseid) DO NOTHING;"""
    # `with conn` commits on success and rolls back on any exception, so a
    # failed insert cannot leave the connection in an aborted transaction.
    with conn:
        with conn.cursor() as cur:
            cur.execute(query, author)
    return None
| 85 | + |
# Every Neotoma publication that has a DOI.
all_pubs = get_publications(100000)
if all_pubs is None:
    # get_publications() returns None when the API does not answer with
    # HTTP 200; stop with a clear message instead of a TypeError in the loop.
    raise SystemExit("Unable to retrieve publications from the Neotoma API.")

find_authors = []  # Not used by the loop below; kept as a module-level name.
for pub in all_pubs:
    new_outcome = align_pubs(pub)
    for author in new_outcome:
        # Only authors for whom OpenAlex supplied an ORCID are written back.
        if author['orcid']:
            print(f"Found ORCID for {author['name']} through OpenAlex.")
            insert_orcid(author)
| 95 | + |
0 commit comments