Skip to content

Commit 68fe784

Browse files
committed
Working through scripts to add new publications from DOIs
This proposal section works through the process of adding and uploading records to Neotoma through publication data stored online in CrossRef.
1 parent 7eb735d commit 68fe784

File tree

7 files changed

+124
-21
lines changed

7 files changed

+124
-21
lines changed

Proposals/publications/README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Resolving Publications in Neotoma
2+
3+
This project is intended to help add DOIs to existing publications, support metadata enrichment from manual citation editing, and support the addition of records to Neotoma from external sources. This workflow links the [Neotoma API](https://api.neotomadb.org) to the [CrossRef Works API](https://api.crossref.org/works), helping to validate and augment information in Neotoma.
4+
5+
## Updating Existing Neotoma Records
6+
7+
### Finding new DOIs
8+
9+
For existing publications that do not include DOIs, we can scan the Neotoma Publications database from the command line:
10+
11+
```bash
12+
uv run src/find_potential_dois.py --limit 100 --skip 100 --output ./data/offset100.csv
13+
```
14+
15+
This will write a CSV file (saved to the path given by `--output`) with the Neotoma `publicationid`, the current `citation`, the `doi` stored in Neotoma (generally empty), and then columns for the `newdoi` obtained from a CrossRef search and the corresponding `bibtex` citation.
16+
17+
This output file can be manually edited and subsequently used as the source for data upload.
18+
19+
![A screenshot of the edited csv file, with the "good" DOIs remaining.](./assets/editedoffset100.png)
20+
21+
### Submitting New DOIs for Existing Records
22+
23+
If we have a CSV file with existing publication IDs and "new" DOIs, we can then upload the data from the command line:
24+
25+
```bash
26+
uv run src/post_doi_bibtex.py --input ./data/offset100.csv --doi newdoi --commit True
27+
```
28+
29+
The `--commit` flag lets us test a run without writing to the database, so that we don't accidentally upload incorrect data. If it is set to `True`, we will see a complete run:
30+
31+
![Running the script with commit set to True](./assets/script_output.png)
32+
33+
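Because the option is parsed with `type=bool`, any non-empty value (even `--commit False`) is treated as true, so omit the flag entirely for a test run. In that case the upload will end with the statement: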
34+
35+
```
36+
The --commit flag was set to False, rolling back operation.
37+
```
38+
39+
## Inserting New Publications from DOIs
40+

Proposals/publications/src/find_potential_dois.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@
1212

1313
parser = argparse.ArgumentParser()
1414
parser.add_argument('--output', '-O', help="A valid output filename.", type= str, default = 'output.csv')
15-
parser.add_argument('--limit', '-l', help="How many Neotoma publicaitons to process?", type= int, default= 100)
15+
parser.add_argument('--limit', '-L', help="How many Neotoma publications to process?", type= int, default= 100)
16+
parser.add_argument('--skip', '-S', help="How many Neotoma publications to skip?", type= int, default= 0)
1617

1718
args = parser.parse_args()
1819

1920
csv.field_size_limit(sys.maxsize)
2021

21-
db_data = [j for j in [i.get('publication') for i in call_publications(limit = args.limit)] if j.get('doi') is None]
22+
ndb_pubs = call_publications(limit = args.limit, offset = args.skip)
23+
db_data = [j for j in [i.get('publication') for i in ndb_pubs] if j.get('doi') is None]
2224

2325
# For each Neotoma publication record without a DOI:
2426
for i in db_data:
@@ -56,7 +58,7 @@
5658
i['newdoi'] = outcome.get('DOI')
5759
i['json'] = json.dumps(outcome)
5860
bibtex = return_bibtex(outcome.get('DOI'))
59-
i['bibtex'] = i.get('bibtex', '') + bibtex
61+
i['bibtex'] = i.get('bibtex', '') + (bibtex or '')
6062
else:
6163
print('No new match.')
6264

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,44 @@
1+
from publications import return_bibtex, parse_raw, add_citation
2+
import argparse
3+
import psycopg2
4+
import json
5+
import dotenv
6+
import os
17

8+
parser = argparse.ArgumentParser()
9+
parser.add_argument('--input', '-I', help="A valid output filename.", type= str, default = 'output.csv')
10+
parser.add_argument('--doi', '-d', help="Which column contains the DOI for upload?", type=str)
11+
parser.add_argument('--commit', '-c', help="Should we commit the data to the database?", type=bool, default= False)
12+
13+
args = parser.parse_args()
14+
15+
dotenv.load_dotenv()
16+
17+
dbauth = json.loads(os.getenv('DBAUTH'))
18+
19+
conn = psycopg2.connect(**dbauth, connect_timeout=5)
20+
21+
QUERY = """
22+
INSERT INTO ndb.publications(citation, doi, bibtex)
23+
VALUES(%(citation)s, %(doi)s, %(bibtex)s);"""
24+
25+
contents = parse_raw(args.input)
26+
27+
for i in [i for i in contents if i is not None]:
28+
bibtex = return_bibtex(i)
29+
citation = add_citation(bibtex)
30+
if bibtex is not None and citation is not None:
31+
with conn.cursor() as cur:
32+
cur.execute(QUERY, {'doi': i,
33+
'bibtex': bibtex,
34+
'citation': citation})
35+
if args.commit:
36+
print(f'Committing the following citation:\n{citation}.')
37+
conn.commit()
38+
cur.close()
39+
40+
if args.commit is True:
41+
conn.commit()
42+
else:
43+
conn.rollback()
44+
print("The --commit flag was set to False, rolling back operation.")

Proposals/publications/src/post_doi_bibtex.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,15 @@
88
import os
99
import json
1010
import csv
11-
import sys
1211
from publications import return_bibtex
12+
import argparse
13+
14+
parser = argparse.ArgumentParser()
15+
parser.add_argument('--input', '-I', help="A valid output filename.", type= str, default = 'output.csv')
16+
parser.add_argument('--doi', '-d', help="Which column contains the DOI for upload?", type=str, default = 'newdoi')
17+
parser.add_argument('--commit', '-c', help="Should we commit the data to the database?", type=bool, default= False)
18+
19+
args = parser.parse_args()
1320

1421
dotenv.load_dotenv()
1522

@@ -23,19 +30,23 @@
2330
DO UPDATE SET doi = EXCLUDED.doi,
2431
bibtex = EXCLUDED.bibtex;"""
2532

26-
with open(sys.argv[1], 'r') as newdois:
33+
with open(args.input, 'r') as newdois:
2734
reader = csv.DictReader(newdois)
28-
if sys.argv[2] not in reader.fieldnames:
29-
raise KeyError(f'The value {sys.argv[2]} is not a column heading in {sys.argv[1]}')
35+
if args.doi not in reader.fieldnames:
36+
raise KeyError(f'The value {args.doi} is not a column heading in {args.input}')
3037
for i in reader:
31-
bibtex = return_bibtex(i.get(sys.argv[2]))
38+
bibtex = return_bibtex(i.get(args.doi))
3239
if bibtex is not None:
3340
with conn.cursor() as cur:
34-
cur.execute(QUERY, {'doi': i.get(sys.argv[2]),
41+
cur.execute(QUERY, {'doi': i.get(args.doi),
3542
'bibtex': bibtex,
3643
'publicationid': int(i.get('publicationid'))})
37-
conn.commit()
44+
if args.commit:
45+
conn.commit()
3846
cur.close()
3947
print(f'Added bibtex and DOI for publication {i.get('publicationid')}.')
4048
else:
41-
print(f'Could not resolve bibtex/ DOI for publication {i.get('publicationid')}: {i.get(sys.argv[2])}.')
49+
print(f'Could not resolve bibtex/DOI for publication {i.get('publicationid')}: {i.get(args.doi)}.')
50+
if not args.commit:
51+
print("The --commit flag was set to False, rolling back operation.")
52+
conn.rollback()

Proposals/publications/src/publications/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@
44
from .clean_doi import clean_doi as clean_doi
55
from .parse_raw import parse_raw as parse_raw
66
from .return_bibtex import return_bibtex as return_bibtex
7+
from .add_citation import add_citation as add_citation

Proposals/publications/src/publications/add_citation.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
from pybtex.database import parse_string
33

44
def add_citation(bibtex:str) -> str:
5-
"""_Return an APA citation from Bibtex_
5+
"""_Return an APA citation from Bibtex._
66
77
Args:
8-
bibtex (str): _A properly formatted Bibtex string_
8+
bibtex (str): _A properly formatted Bibtex string._
99
1010
Returns:
11-
str: _An APA formatted citation string generated from the bibtex field._
11+
str: _An APA formatted citation string generated from the Bibtex field._
1212
>>> bibtex = "@article{Davis_1979, title={Wetland Succession, Fire and the Pollen Record: A Midwestern Example}, volume={102}, ISSN={0003-0031}, url={http://dx.doi.org/10.2307/2425069}, DOI={10.2307/2425069}, number={1}, journal={American Midland Naturalist}, publisher={JSTOR}, author={Davis, Anthony M.}, year={1979}, month=jul, pages={86} }"
1313
>>> add_citation(bibtex)
1414
'Davis, A. M. (1979 , July). Wetland succession, fire and the pollen record: a midwestern example. American Midland Naturalist, 102(1), 86. URL: http://dx.doi.org/10.2307/2425069, doi:10.2307/2425069'
@@ -22,8 +22,11 @@ def add_citation(bibtex:str) -> str:
2222
"""
2323
aa = parse_string(bibtex, "bibtex")
2424
APA = find_plugin('pybtex.style.formatting', 'apa')()
25-
formattedBib = APA.format_bibliography(aa)
26-
if len(formattedBib.entries) == 0:
27-
raise ValueError(f"The passed Bibtex string:\n\n{bibtex}\n\nis not valid Bibtex.")
28-
else:
29-
return [entry.text.render_as('text') for entry in formattedBib][0]
25+
try:
26+
formattedBib = APA.format_bibliography(aa)
27+
if len(formattedBib.entries) == 0:
28+
raise ValueError(f"The passed Bibtex string:\n\n{bibtex}\n\nis not valid Bibtex.")
29+
else:
30+
return [entry.text.render_as('text') for entry in formattedBib][0]
31+
except Exception as e:
32+

Proposals/publications/src/publications/call_publications.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@
22
from requests.exceptions import ReadTimeout
33
from json import loads
44

5-
def call_publications(limit:int = 100):
5+
def call_publications(limit:int = 100, offset:int = 0):
66
"""_Get Publications from Neotoma_
77
Returns:
88
_dict_: _A dictionary of Neotoma Publications_
99
"""
1010
try:
11-
result = get(f"https://api.neotomadb.org/v2.0/data/publications?limit={limit}", timeout = 10)
11+
result = get("https://api.neotomadb.org/v2.0/data/publications",
12+
params = {"limit": limit,
13+
"offset": offset},
14+
timeout = 10)
1215
except ReadTimeout as e:
1316
print(e)
1417
return None

0 commit comments

Comments
 (0)