Updating publication data including bibtex texts

SimonGoring · SimonGoring · commit 146cbc5689bc · 2024-10-22T13:09:02.000-07:00
This has been a process that is intended to be iterative.

The scripts added provide tools to:

1. Pull in publication data that does not currently have a DOI and resolve it against CrossRef.
2. Take updated and validated information from the first script (following evaluation by a user) and update the tables in Neotoma.
diff --git a/Proposals/publications/post_doi_bibtex.py b/Proposals/publications/post_doi_bibtex.py
@@ -0,0 +1,56 @@
+'''
+This script will read in the modified output.csv file and post new DOIs to Neotoma.
+'''
+
+# We'll read in the `output.csv` file, and then for each row, update the DOI and bibtex columns.
+import psycopg2
+import dotenv
+import os
+import json
+import csv
+import requests
+import urllib
+
+dotenv.load_dotenv()
+
+dbauth = json.loads(os.getenv('DBAUTH'))
+
+conn = psycopg2.connect(**dbauth, connect_timeout=5)
+
+QUERY = """INSERT INTO ndb.publications(publicationid, doi, bibtex)
+           VALUES (%(publicationid)s, %(doi)s, %(bibtex)s)
+           ON CONFLICT (publicationid)
+           DO UPDATE SET    doi = EXCLUDED.doi,
+                         bibtex = EXCLUDED.bibtex;"""
+
+
+def return_bibtex(doi_string:str):
+    url = 'https://doi.org/' + urllib.request.quote(doi_string)
+    header = {
+        'Accept': 'application/x-bibtex',
+        'User-Agent': 'Neotoma Publication Augmenter; mailto:goring@wisc.edu'
+    }
+    try:
+        response = requests.get(url, headers=header, timeout = 10)
+    except requests.exceptions.ReadTimeout as e:
+        return None
+    if response.status_code == 200:
+        return response.text.strip()
+    else:
+        return None
+
+
+with open('output.csv', 'r') as newdois:
+    reader = csv.DictReader(newdois)
+    for i in reader:
+        bibtex = return_bibtex(i.get('doi'))
+        if bibtex is not None:
+            with conn.cursor() as cur:
+                cur.execute(QUERY, {'doi': i.get('doi'),
+                                    'bibtex': bibtex,
+                                    'publicationid': int(i.get('publicationid'))})
+                conn.commit()
+                cur.close()
+            print(f'Added bibtex and DOI for publication {i.get('publicationid')}.')
+        else:
+            print(f'Could not resolve bibtex/ DOI for publication {i.get('publicationid')}: {i.get('doi')}.')
diff --git a/Proposals/publications/update_doi_bibtex.py b/Proposals/publications/update_doi_bibtex.py
@@ -40,13 +40,10 @@ def clean_doi(doi_string:str):
 
 def break_citation(citation:str):
     """_Break Citation String Apart_
-
     Args:
         citation (str): _A citation string from the Neotoma Database._
-
     Raises:
         Exception: _A ValueError exception if the object could not be parsed._
-
     Returns:
         _dict_: _A dict representation of the anystyle output._
     """    
@@ -65,19 +62,28 @@ def return_bibtex(doi_string:str):
         'Accept': 'application/x-bibtex',
         'User-Agent': 'Neotoma Publication Augmenter; mailto:goring@wisc.edu'
     }
-    response = requests.get(url, headers=header)
-    return response.text.strip()
+    try:
+        response = requests.get(url, headers=header, timeout = 10)
+    except requests.exceptions.ReadTimeout as e:
+        return None
+    if response.status_code == 200:
+        return response.text.strip()
+    else:
+        return None
 
 
 def check_crossref(cite_object:str):
     url = 'https://api.crossref.org/works'
-    url_call = requests.get(url,
+    try:
+        url_call = requests.get(url,
                             headers = {'Accept': 'application/json',
                                        'User-Agent': 'Neotoma Publication Augmenter; mailto:goring@wisc.edu'},
                             params = {'rows':1,
                                     'mailto':'goring@wisc.edu',
                                     'select':'DOI,title,container-title,published',
-                                    'query':f'query.title={cite_object}'})
+                                    'query':f'query.title={cite_object}'}, timeout = 10)
+    except requests.exceptions.ReadTimeout as e:
+        return None
     if url_call.status_code == 200:
         cross_ref = json.loads(url_call.content)
         if cross_ref.get('message').get('total-results') > 0:
@@ -90,14 +96,19 @@ def check_crossref(cite_object:str):
 
 def call_publications():
     """_Get Publications from Neotoma_
-
     Returns:
         _dict_: _A dictionary of Neotoma Publications_
     """    
-    result = requests.get("https://api.neotomadb.org/v2.0/data/publications?limit=100000")
+    try:
+        result = requests.get("https://api.neotomadb.org/v2.0/data/publications?limit=100000", timeout = 10)
+    except requests.exceptions.ReadTimeout as e:
+        return None
     if result.status_code == 200:
         pubs = json.loads(result.content).get('data').get('result')
-    return pubs
+        return pubs
+    else:
+        return None
+
 
 db_data = [i.get('publication') for i in call_publications()]
 
@@ -115,7 +126,11 @@ def call_publications():
             else:
                 print('DOI match:')
                 bibtex = return_bibtex(outcome)
-                i['bibtex'] = i.get('bibtex', '') + bibtex
+                if bibtex is None:
+                    print(f'Issue with DOI {outcome}')
+                    i['notes'] = (i.get('notes', '') or '') + f' CrossRef DOI does not exists; '
+                else:
+                    i['bibtex'] = bibtex
         except TypeError as e:
             print('DOI present but not of the correct type.')
     else:
@@ -136,7 +151,6 @@ def call_publications():
             i['bibtex'] = i.get('bibtex', '') + bibtex
         else:
             print('No new match.')
-    sleep(2)
 
 
 with open('output.csv', 'w') as file:
@@ -145,3 +159,4 @@ def call_publications():
     for i in db_data:
         row = {j: i.get(j) for j in ['publicationid', 'citation', 'doi', 'notes', 'newdoi', 'json', 'bibtex']}
         writer.writerow(row)
+
diff --git a/Proposals/publications/update_publications.qmd b/Proposals/publications/update_publications.qmd
@@ -36,6 +36,34 @@ For publications the primary concern is that DOIs are unique. This is the nature
 ALTER TABLE ndb.publications ADD CONSTRAINT publications_doi_unique UNIQUE(doi);
 ```
 
+### Ensuring Proper Metadata
+
+The CrossRef API provides a source of publisher "validated" metadata. We put validated in quotes because the validation is up to the publisher. We have found instances in the past where publishers provide incomplete metadata, or alternate formatting for text strings (for example, failing to capitalize specific epithets). Regardless, the CrossRef metadata form the basis for updating and improving our internal metadata.
+
+In the script `add_bibtex.py` in this folder we see the function `check_crossref()`, which queries CrossRef using the title string for a paper. The `works` endpoint of the CrossRef API can be queried either by DOI or with a `title` query. Here we have pulled the title from the citation string and passed it into the `cite_object` variable:
+
+```py
+def check_crossref(cite_object:str):
+    url = 'https://api.crossref.org/works'
+    url_call = requests.get(url,
+                            headers = {'Accept': 'application/json',
+                                       'User-Agent': 'Neotoma Publication Augmenter; mailto:goring@wisc.edu'},
+                            params = {'rows':1,
+                                    'mailto':'goring@wisc.edu',
+                                    'select':'DOI,title,container-title,published',
+                                    'query':f'query.title={cite_object}'})
+    if url_call.status_code == 200:
+        cross_ref = json.loads(url_call.content)
+        if cross_ref.get('message').get('total-results') > 0:
+            return cross_ref.get('message').get('items', '')[0]
+        else:
+            return None
+    else:
+        return None
+```
+
+
+
 ### Adding a BibTeX column to Neotoma
 
 We want to add a column to the `ndb.publications` table to ensure that the BibTex column can be supported. Although BibTex looks like JSON, it is not. A BibTex entry looks like this:
@@ -59,3 +87,12 @@ Within the BibTeX entry, each field has the format `field = {text},` so we can b
 ```regex
 ^@[article|book|booklet|conference|inbook|incollection|inproceedings|manual|mastersthesis|misc|phdthesis|proceedings|techreport|unpublished]{(.*={.*},)*}$
 ```
+
+The problem is that, ultimately, BibTeX is very hard to parse with regex because of the nested brackets and the varying formats of different fields. For now we will handle the processing of the BibTeX at the application layer.
+
+The SQL to add the column then is simply:
+
+```sql
+ALTER TABLE ndb.publications ADD COLUMN IF NOT EXISTS bibtex TEXT;
+```
+