Skip to content

Commit 146cbc5

Browse files
committed
Updating publication data including bibtex texts
This has been a process that is intended to be iterative. The scripts added provide tools to: 1. Pull in publication data that does not currently have a DOI and resolve it against CrossRef. 2. Take updated and validated information from the first script (following evaluation by a user) and update the tables in Neotoma.
1 parent 6ec9993 commit 146cbc5

File tree

3 files changed

+120
-12
lines changed

3 files changed

+120
-12
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
'''
2+
This script will read in the modified output.csv file and post new DOIs (and their BibTeX entries) to Neotoma.
3+
'''
4+
5+
# We'll read in the `output.csv` file, and then for each row, update the DOI and BibTeX columns.
6+
import psycopg2
7+
import dotenv
8+
import os
9+
import json
10+
import csv
11+
import requests
12+
import urllib
13+
14+
# Load variables from a local .env file into the process environment.
dotenv.load_dotenv()

# DBAUTH must hold a JSON object; its keys are passed straight through to
# psycopg2.connect() as keyword arguments. Fail early with a readable message
# instead of the opaque TypeError json.loads(None) would raise.
_dbauth_raw = os.getenv('DBAUTH')
if _dbauth_raw is None:
    raise RuntimeError('The DBAUTH environment variable is not set; '
                       'it must contain the JSON connection settings.')
dbauth = json.loads(_dbauth_raw)

# Give up after five seconds rather than hanging on an unreachable server.
conn = psycopg2.connect(**dbauth, connect_timeout=5)

# Upsert keyed on publicationid: when the publication already exists only its
# doi and bibtex columns are overwritten.
QUERY = """INSERT INTO ndb.publications(publicationid, doi, bibtex)
           VALUES (%(publicationid)s, %(doi)s, %(bibtex)s)
           ON CONFLICT (publicationid)
           DO UPDATE SET doi = EXCLUDED.doi,
                         bibtex = EXCLUDED.bibtex;"""
25+
26+
27+
def return_bibtex(doi_string:str):
    """_Resolve a DOI to its BibTeX entry_

    Requests `https://doi.org/<doi>` with an `application/x-bibtex` Accept
    header and returns the body of the response.

    Args:
        doi_string (str): _The DOI to resolve._

    Returns:
        _str_: _The stripped BibTeX text, or None when the DOI is missing or
        blank, the request fails, or the response status is not 200._
    """
    # Rows in the CSV may carry no DOI at all; there is nothing to look up.
    if not isinstance(doi_string, str) or not doi_string.strip():
        return None

    # Imported locally so the function does not depend on `urllib.parse`
    # having been loaded as a side effect of some other import.
    from urllib.parse import quote

    url = 'https://doi.org/' + quote(doi_string.strip())
    header = {
        'Accept': 'application/x-bibtex',
        'User-Agent': 'Neotoma Publication Augmenter; mailto:goring@wisc.edu'
    }
    try:
        response = requests.get(url, headers=header, timeout=10)
    except requests.exceptions.RequestException:
        # Covers read *and* connect timeouts, DNS failures and dropped
        # connections, so one bad lookup cannot abort the whole batch.
        return None
    if response.status_code != 200:
        return None
    return response.text.strip()
41+
42+
43+
# Post each reviewed DOI, and the BibTeX it resolves to, back to Neotoma.
# The csv module asks for newline='' so that quoted fields containing line
# breaks (citations, BibTeX) are parsed correctly.
try:
    with open('output.csv', 'r', newline='') as newdois:
        reader = csv.DictReader(newdois)
        for i in reader:
            pubid = i.get('publicationid')
            doi = i.get('doi')

            # A row without a DOI cannot be resolved; report it like any
            # other failed lookup.
            bibtex = return_bibtex(doi) if doi else None
            if bibtex is None:
                print(f'Could not resolve bibtex/ DOI for publication {pubid}: {doi}.')
                continue

            try:
                params = {'doi': doi,
                          'bibtex': bibtex,
                          'publicationid': int(pubid)}
            except (TypeError, ValueError):
                print(f'Skipping row with invalid publicationid {pubid!r}.')
                continue

            try:
                # The cursor context manager closes the cursor on exit.
                with conn.cursor() as cur:
                    cur.execute(QUERY, params)
                conn.commit()
            except psycopg2.Error as err:
                # e.g. the DOI is already attached to another publication.
                # Roll back so the connection stays usable for later rows.
                conn.rollback()
                print(f'Could not update publication {pubid}: {err}')
                continue

            print(f'Added bibtex and DOI for publication {pubid}.')
finally:
    conn.close()

Proposals/publications/add_bibtex.py renamed to Proposals/publications/update_doi_bibtex.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,10 @@ def clean_doi(doi_string:str):
4040

4141
def break_citation(citation:str):
4242
"""_Break Citation String Apart_
43-
4443
Args:
4544
citation (str): _A citation string from the Neotoma Database._
46-
4745
Raises:
4846
Exception: _A ValueError exception if the object could not be parsed._
49-
5047
Returns:
5148
_dict_: _A dict representation of the anystyle output._
5249
"""
@@ -65,19 +62,28 @@ def return_bibtex(doi_string:str):
6562
'Accept': 'application/x-bibtex',
6663
'User-Agent': 'Neotoma Publication Augmenter; mailto:goring@wisc.edu'
6764
}
68-
response = requests.get(url, headers=header)
69-
return response.text.strip()
65+
try:
66+
response = requests.get(url, headers=header, timeout = 10)
67+
except requests.exceptions.ReadTimeout as e:
68+
return None
69+
if response.status_code == 200:
70+
return response.text.strip()
71+
else:
72+
return None
7073

7174

7275
def check_crossref(cite_object:str):
7376
url = 'https://api.crossref.org/works'
74-
url_call = requests.get(url,
77+
try:
78+
url_call = requests.get(url,
7579
headers = {'Accept': 'application/json',
7680
'User-Agent': 'Neotoma Publication Augmenter; mailto:goring@wisc.edu'},
7781
params = {'rows':1,
7882
'mailto':'goring@wisc.edu',
7983
'select':'DOI,title,container-title,published',
80-
'query':f'query.title={cite_object}'})
84+
'query':f'query.title={cite_object}'}, timeout = 10)
85+
except requests.exceptions.ReadTimeout as e:
86+
return None
8187
if url_call.status_code == 200:
8288
cross_ref = json.loads(url_call.content)
8389
if cross_ref.get('message').get('total-results') > 0:
@@ -90,14 +96,19 @@ def check_crossref(cite_object:str):
9096

9197
def call_publications():
9298
"""_Get Publications from Neotoma_
93-
9499
Returns:
95100
_dict_: _A dictionary of Neotoma Publications_
96101
"""
97-
result = requests.get("https://api.neotomadb.org/v2.0/data/publications?limit=100000")
102+
try:
103+
result = requests.get("https://api.neotomadb.org/v2.0/data/publications?limit=100000", timeout = 10)
104+
except requests.exceptions.ReadTimeout as e:
105+
return None
98106
if result.status_code == 200:
99107
pubs = json.loads(result.content).get('data').get('result')
100-
return pubs
108+
return pubs
109+
else:
110+
return None
111+
101112

102113
db_data = [i.get('publication') for i in call_publications()]
103114

@@ -115,7 +126,11 @@ def call_publications():
115126
else:
116127
print('DOI match:')
117128
bibtex = return_bibtex(outcome)
118-
i['bibtex'] = i.get('bibtex', '') + bibtex
129+
if bibtex is None:
130+
print(f'Issue with DOI {outcome}')
131+
i['notes'] = (i.get('notes', '') or '') + f' CrossRef DOI does not exists; '
132+
else:
133+
i['bibtex'] = bibtex
119134
except TypeError as e:
120135
print('DOI present but not of the correct type.')
121136
else:
@@ -136,7 +151,6 @@ def call_publications():
136151
i['bibtex'] = i.get('bibtex', '') + bibtex
137152
else:
138153
print('No new match.')
139-
sleep(2)
140154

141155

142156
with open('output.csv', 'w') as file:
@@ -145,3 +159,4 @@ def call_publications():
145159
for i in db_data:
146160
row = {j: i.get(j) for j in ['publicationid', 'citation', 'doi', 'notes', 'newdoi', 'json', 'bibtex']}
147161
writer.writerow(row)
162+

Proposals/publications/update_publications.qmd

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,34 @@ For publications the primary concern is that DOIs are unique. This is the nature
3636
ALTER TABLE ndb.publications ADD CONSTRAINT publications_doi_unique UNIQUE(doi);
3737
```
3838

39+
### Ensuring Proper Metadata
40+
41+
The CrossRef API provides a source of publisher "validated" metadata. We put validated in quotes because the validation is up to the publisher. We have found instances in the past where publishers provide incomplete metadata, or alternate formatting for text strings (for example, failing to capitalize specific epithets). Regardless, the CrossRef metadata form the basis for updating and improving our internal metadata.
42+
43+
In the script `update_doi_bibtex.py` (formerly `add_bibtex.py`) in this folder we see the function `check_crossref()`, which queries CrossRef using the title string for a paper. The `works` endpoint of the CrossRef API can be called either with a DOI or with a `title` query. Here we have pulled the title from the citation string and passed it in as the `cite_object` argument:
44+
45+
```py
46+
def check_crossref(cite_object:str):
47+
url = 'https://api.crossref.org/works'
48+
url_call = requests.get(url,
49+
headers = {'Accept': 'application/json',
50+
'User-Agent': 'Neotoma Publication Augmenter; mailto:goring@wisc.edu'},
51+
params = {'rows':1,
52+
'mailto':'goring@wisc.edu',
53+
'select':'DOI,title,container-title,published',
54+
'query':f'query.title={cite_object}'})
55+
if url_call.status_code == 200:
56+
cross_ref = json.loads(url_call.content)
57+
if cross_ref.get('message').get('total-results') > 0:
58+
return cross_ref.get('message').get('items', '')[0]
59+
else:
60+
return None
61+
else:
62+
return None
63+
```
64+
65+
66+
3967
### Adding a BibTeX column to Neotoma
4068

4169
We want to add a column to the `ndb.publications` table to ensure that the BibTex column can be supported. Although BibTex looks like JSON, it is not. A BibTex entry looks like this:
@@ -59,3 +87,12 @@ Within the BibTeX entry, each field has the format `field = {text},` so we can b
5987
```regex
6088
^@(article|book|booklet|conference|inbook|incollection|inproceedings|manual|mastersthesis|misc|phdthesis|proceedings|techreport|unpublished){(.*={.*},)*}$
6189
```
90+
91+
The problem is that ultimately, bibtex is very hard to parse with Regex because of the nested brackets and various formats for different fields. For now we will put the processing of the Bibtex at the application layer.
92+
93+
The SQL to add the column then is simply:
94+
95+
```sql
96+
ALTER TABLE ndb.publications ADD COLUMN IF NOT EXISTS bibtex TEXT;
97+
```
98+

0 commit comments

Comments
 (0)