-
Notifications
You must be signed in to change notification settings - Fork 0
/
searchHathiTrustByOCLC.py
52 lines (43 loc) · 1.89 KB
/
searchHathiTrustByOCLC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
import requests
import argparse
# Looks up the OCLC numbers listed in a CSV against HathiTrust and writes the
# metadata of any matches to a new CSV.
# The input CSV is taken from -f/--file; when that is absent (or empty) the
# user is prompted for it instead.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-f',
    '--file',
    help='Enter filename with csv.',
)
args = parser.parse_args()
filename = args.file or input('Enter filename (including \'.csv\'): ')
# From the Hathi API documentation: https://www.hathitrust.org/bib_api
# HTTPS is used so the lookups are encrypted in transit.
hURL_brief = 'https://catalog.hathitrust.org/api/volumes/brief/oclc/'

# Reads CSV as DataFrame. The "oclc_id" column is read as text (not parsed as
# a number) because the identifiers are later concatenated into a URL.
df = pd.read_csv(filename, dtype={'oclc_id': str})
df.dropna(subset=['oclc_id'], inplace=True)  # Drop blank values.

# Strip surrounding whitespace BEFORE de-duplicating; otherwise " 123" and
# "123" count as two different identifiers and the same record is fetched
# (and written out) twice. Cells that are empty after stripping are dropped.
cleaned_ids = df['oclc_id'].str.strip()
cleaned_ids = cleaned_ids[cleaned_ids != '']
oclc_identifiers = list(cleaned_ids.unique())
# Loops through oclc_identifiers, queries the HathiTrust brief-record endpoint
# once per identifier, and collects one flat dict per matching record in
# all_results. Every key is prefixed with "HT_"; list values are joined
# with "|" so each one fits in a single CSV cell.
REQUEST_TIMEOUT = 30  # Seconds; without a timeout a stalled connection hangs the run.
all_results = []
# One Session reuses the underlying connection across the repeated requests.
with requests.Session() as session:
    for index, identifier in enumerate(oclc_identifiers):
        print(index, identifier)
        identifier = identifier.strip()
        if not identifier:
            # A whitespace-only value would build a malformed ".../oclc/.json" URL.
            continue
        search_url = hURL_brief+identifier+'.json'
        try:
            response = session.get(search_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            h_response = response.json()
        except (requests.exceptions.RequestException, ValueError) as error:
            # Report and move on: the results are only written out after the
            # loop, so letting one failed lookup raise would discard
            # everything gathered so far. ValueError covers a body that is
            # not valid JSON.
            print('Skipping', identifier, '-', error)
            continue
        records = h_response.get('records') if isinstance(h_response, dict) else None
        if not records:
            # No matches for this identifier.
            continue
        # If matches are found, adds HathiTrust metadata to all_results.
        for record_values in records.values():
            result = {}
            for k, v in record_values.items():
                if isinstance(v, list):
                    # map(str, ...) keeps the join from raising on non-string items.
                    v = '|'.join(map(str, v))
                result['HT_'+k] = v
            all_results.append(result)
# Builds the results table from the list of per-record dicts in all_results,
# renaming three of the HathiTrust columns on the way.
column_names = {
    'HT_oclcs': 'HT_oclc',
    'HT_titles': 'HT_title',
    'HT_recordURL': 'HT_link',
}
df_results = pd.DataFrame(all_results).rename(columns=column_names)
# Writes the table to "hathiTrustResults.csv" without the index column.
df_results.to_csv('hathiTrustResults.csv', index=False)