Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ames/harvesters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@
from .caltechauthors import clean_link
from .caltechauthors import extract_filename_from_link
from .caltechauthors import is_file_present
from .caltechauthors import get_series_records
from .caltechauthors import get_series_records
from .caltechauthors import generate_data_citation_csv
118 changes: 118 additions & 0 deletions ames/harvesters/caltechauthors.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,121 @@ def get_records_from_date(date="2023-08-25", test=False):
hits += response["hits"]["hits"]

return hits

def doi2url(doi, timeout=30):
    """Resolve a DOI to the URL registered for it in the DOI handle system.

    Values that do not start with ``"10."`` are returned unchanged. Otherwise
    the doi.org handle API is queried and the first ``URL`` entry is returned.
    When that URL is a ``data.caltech.edu/records/<digits>`` link, the link is
    fetched once more and the URL reached after redirects is returned instead.

    Args:
        doi: DOI string (for example ``"10.22002/abc12-xyz34"``) or any other
            identifier/URL.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The resolved URL, or ``doi`` itself when it is not a DOI or cannot be
        resolved (non-200 answer, network failure, malformed response).
    """
    if not doi.startswith("10."):
        return doi

    req_url = f"https://doi.org/api/handles/{doi}"
    try:
        resp = requests.get(req_url, allow_redirects=True, timeout=timeout)
        if resp.status_code != 200:
            return doi
        values = resp.json().get("values", [])
    except (requests.RequestException, ValueError):
        # Network problem or a body that is not JSON: fall back to the input,
        # exactly as for a non-200 answer.
        return doi

    for value in values:
        if value.get("type") != "URL":
            continue
        resolved_url = (value.get("data") or {}).get("value")
        if not resolved_url:
            # Malformed URL entry; look at the next handle value instead.
            continue
        if "data.caltech.edu/records/" in resolved_url:
            caltechdata_id = resolved_url.split("/records/")[-1]
            if caltechdata_id.isdigit():
                # All-digit record ids are requested once more so that the
                # URL reached after redirects is the one returned.
                try:
                    final_resp = requests.get(
                        resolved_url, allow_redirects=True, timeout=timeout
                    )
                    resolved_url = final_resp.url
                except requests.RequestException:
                    # Keep the URL registered with the handle system.
                    pass
        return resolved_url
    return doi

def fetch_metadata(record_id, timeout=30):
    """Fetch one CaltechAUTHORS record as parsed JSON.

    Args:
        record_id: Identifier of the record on authors.library.caltech.edu.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an error status, or the body is not valid JSON.
    """
    url = f"https://authors.library.caltech.edu/api/records/{record_id}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError):
        # Only request/decoding failures are swallowed; KeyboardInterrupt,
        # SystemExit and programming errors propagate.
        return None

def search_resource_type(obj):
    """Depth-first search of nested dicts/lists for a ``resource_type`` id.

    A dict entry whose key is ``'resource_type'`` and whose value is a dict
    containing ``'id'`` ends the search immediately with that id. Every other
    value is searched recursively, and the first truthy result wins.

    Returns:
        The id that was found, or ``None`` when nothing matched.
    """
    if isinstance(obj, dict):
        for key, value in obj.items():
            is_match = (
                key == 'resource_type'
                and isinstance(value, dict)
                and 'id' in value
            )
            if is_match:
                return value['id']
            found = search_resource_type(value)
            if found:
                return found
        return None

    if isinstance(obj, list):
        # First truthy hit among the elements, in order; None otherwise.
        return next(
            (found for found in map(search_resource_type, obj) if found),
            None,
        )

    return None

def fetch_resource_type(data):
    """Return the resource type id found in ``data``, or ``'N/A'`` if none."""
    resource_type_id = search_resource_type(data)
    if resource_type_id:
        return resource_type_id
    return 'N/A'

def search_records(prefix, size=1000, timeout=30):
    """Search CaltechAUTHORS for records whose related identifiers use a DOI prefix.

    The query is a range search on ``metadata.related_identifiers.identifier``
    from ``"<prefix>/0"`` to ``"<prefix>/z"``. Only a single page of at most
    ``size`` results is requested; further pages are not followed.

    Args:
        prefix: DOI prefix without trailing slash, e.g. ``"10.22002"``.
        size: Maximum number of hits requested.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The decoded JSON search response, or ``None`` when the request fails,
        the status is not 200, or the body is not valid JSON.
    """
    base_url = "https://authors.library.caltech.edu/api/records"
    query = (
        "metadata.related_identifiers.identifier:"
        f'["{prefix}/0" TO "{prefix}/z"]'
    )
    try:
        # Passing the query via ``params`` lets requests do the URL encoding
        # of quotes, spaces and brackets.
        response = requests.get(
            base_url, params={"q": query, "size": size}, timeout=timeout
        )
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as err:
        print(f"Error searching records for prefix {prefix}: {err}")
    return None

def extract_data_citations(hits):
    """Build data-citation rows for CaltechAUTHORS search hits.

    For every hit the full record is fetched, its related DOI identifiers with
    one of the prefixes ``10.22002/``, ``10.14291/`` or ``10.25989/`` are
    resolved, and each one that resolves to a ``data.caltech.edu/records/``
    URL produces one row. ``Cross_Link`` is ``"Yes"`` when the CaltechDATA
    record lists the CaltechAUTHORS DOI among its related identifiers and
    ``"No"`` otherwise.

    Args:
        hits: Iterable of search-hit dicts; each must contain ``"id"``.

    Returns:
        List of dicts with the keys ``CaltechAUTHORS_ID``,
        ``CaltechAUTHORS_DOI``, ``Related_DOI``, ``CaltechDATA_ID``,
        ``Cross_Link`` and ``resource_type``. Hits whose metadata cannot be
        fetched, and DOIs whose CaltechDATA record cannot be retrieved at all
        (network failure or non-JSON body), are skipped.
    """
    data_doi_prefixes = ("10.22002/", "10.14291/", "10.25989/")
    citations = []

    for hit in hits:
        record_id = hit["id"]
        metadata = fetch_metadata(record_id)
        if not metadata:
            continue

        # ``or {}`` / ``or []`` guard against keys that are present but null.
        doi_pid = (metadata.get("pids") or {}).get("doi") or {}
        caltechauthors_doi = doi_pid.get("identifier", "")
        resource_type = fetch_resource_type(metadata)

        related = (metadata.get("metadata") or {}).get("related_identifiers") or []
        related_dois = [
            identifier["identifier"]
            for identifier in related
            if identifier.get("scheme") == "doi"
            and (identifier.get("identifier") or "").startswith(data_doi_prefixes)
        ]

        for doi in related_dois:
            caltechdata_url = doi2url(doi)
            if "data.caltech.edu/records/" not in caltechdata_url:
                continue
            caltechdata_id = caltechdata_url.split("/records/")[-1]

            try:
                caltechdata_metadata = requests.get(
                    f"https://data.caltech.edu/api/records/{caltechdata_id}",
                    timeout=30,
                ).json()
            except (requests.RequestException, ValueError) as err:
                # Without the record the cross-link cannot be checked; skip
                # this DOI instead of aborting the whole harvest.
                print(f"Error fetching CaltechDATA record {caltechdata_id}: {err}")
                continue
            if not isinstance(caltechdata_metadata, dict):
                caltechdata_metadata = {}

            data_related = (
                (caltechdata_metadata.get("metadata") or {}).get("related_identifiers")
                or []
            )
            linked_back = any(
                identifier.get("identifier") == caltechauthors_doi
                for identifier in data_related
            )

            citations.append({
                "CaltechAUTHORS_ID": record_id,
                "CaltechAUTHORS_DOI": caltechauthors_doi,
                "Related_DOI": doi,
                "CaltechDATA_ID": caltechdata_id,
                "Cross_Link": "Yes" if linked_back else "No",
                "resource_type": resource_type,
            })
    return citations

def generate_data_citation_csv(output_file="data_citations_with_type.csv"):
    """Harvest data citations from CaltechAUTHORS and write them to a CSV file.

    Records are searched once per DOI prefix (``10.22002``, ``10.14291``,
    ``10.25989``). A record that matches several prefixes is processed only
    once: ``extract_data_citations`` already reports the related DOIs of all
    three prefixes for each record, so processing it per prefix would write
    duplicate rows.

    Args:
        output_file: Path of the CSV file to write.
    """
    prefixes = ["10.22002", "10.14291", "10.25989"]
    fieldnames = [
        "CaltechAUTHORS_ID",
        "CaltechAUTHORS_DOI",
        "Related_DOI",
        "CaltechDATA_ID",
        "Cross_Link",
        "resource_type",
    ]

    # Keyed by record id so that each record is kept once, in first-seen order.
    unique_hits = {}
    for prefix in prefixes:
        results = search_records(prefix)
        if not results or "hits" not in results:
            continue
        for hit in results["hits"].get("hits", []):
            unique_hits.setdefault(hit["id"], hit)

    all_citations = extract_data_citations(list(unique_hits.values()))

    with open(output_file, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_citations)

    print(f"Saved {len(all_citations)} citations to {output_file}")
1 change: 1 addition & 0 deletions ames/matchers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@
from .caltechauthors import get_record_metadata
from .caltechauthors import update_related_identifiers
from .caltechauthors import save_metadata_to_file
from .caltechauthors import add_related_identifiers_from_csv
87 changes: 87 additions & 0 deletions ames/matchers/caltechauthors.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,90 @@ def move_doi(record, token, test=False):
publish=True,
authors=True,
)


def _link_caltechauthors_record(base_url, headers, row, timeout):
    """Add the DOI and landing-page links from one CSV row to one record."""
    record_id = row['Test_ID']
    doi = row['CaltechAUTHORS_DOI']
    caltech_author_id = row['CaltechAUTHORS_ID']
    resource_type = row['resource_type']

    print(f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}")
    print(f"Using resource_type: {resource_type}")

    record_url = f"{base_url}/api/records/{record_id}"
    draft_url = f"{record_url}/draft"

    # Make sure the published record exists before touching drafts.
    response = requests.get(record_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"Error fetching record {record_id}: {response.status_code}")
        return

    # Reuse an existing draft, otherwise create one.
    draft_response = requests.get(draft_url, headers=headers, timeout=timeout)
    if draft_response.status_code == 200:
        record_data = draft_response.json()
    else:
        draft_create_response = requests.post(
            draft_url, headers=headers, timeout=timeout
        )
        if draft_create_response.status_code != 201:
            print(f"Error creating draft: {draft_create_response.status_code}")
            return
        record_data = draft_create_response.json()

    metadata = record_data.setdefault("metadata", {})
    related_identifiers = metadata.get("related_identifiers", []) or []
    existing = {ri.get("identifier") for ri in related_identifiers}
    author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}"

    if doi not in existing:
        related_identifiers.append({
            "relation_type": {"id": "issupplementedby"},
            "identifier": doi,
            "scheme": "doi",
            "resource_type": {"id": resource_type},
        })
        print(f"Adding DOI: {doi}")
    else:
        print("DOI already exists")

    if author_url not in existing:
        related_identifiers.append({
            "relation_type": {"id": "isreferencedby"},
            "identifier": author_url,
            "scheme": "url",
            "resource_type": {"id": resource_type},
        })
        print(f"Adding CaltechAUTHORS link: {author_url}")
    else:
        print("CaltechAUTHORS link already exists")

    metadata["related_identifiers"] = related_identifiers

    update_response = requests.put(
        draft_url, headers=headers, json=record_data, timeout=timeout
    )
    if update_response.status_code != 200:
        print(f"Error updating draft: {update_response.status_code}")
        return

    publish_response = requests.post(
        f"{draft_url}/actions/publish", headers=headers, timeout=timeout
    )
    if publish_response.status_code != 202:
        print(f"Error publishing record {record_id}: {publish_response.status_code}")
        return

    print(f"Successfully updated and published {record_id}")


def add_related_identifiers_from_csv(csv_path, test=False, token=None, timeout=30):
    """Read a CSV file and add related identifiers to each listed record.

    For every row the record named in ``Test_ID`` is opened as a draft, a DOI
    identifier (``CaltechAUTHORS_DOI``) and a CaltechAUTHORS landing-page URL
    (built from ``CaltechAUTHORS_ID``) are appended unless already present,
    and the draft is saved and published. Failures on one row are printed and
    processing continues with the next row.

    NOTE(review): the record id is read from a ``Test_ID`` column, while
    ``generate_data_citation_csv`` in the harvesters module names its
    CaltechDATA column ``CaltechDATA_ID`` - confirm which file this expects.

    Args:
        csv_path: CSV file with the columns ``Test_ID``,
            ``CaltechAUTHORS_DOI``, ``CaltechAUTHORS_ID`` and
            ``resource_type``.
        test: Use the ``data.caltechlibrary.dev`` host instead of the
            production host.
        token: API bearer token. When omitted, a module-level ``token``
            variable is used if one exists.
        timeout: Seconds to wait for each HTTP request.

    Raises:
        ValueError: If no API token is available.
    """
    if token is None:
        # The body used a bare ``token`` name before this parameter existed;
        # keep honouring a module-level definition if there is one.
        token = globals().get("token")
    if not token:
        raise ValueError("An API token is required to update records")

    base_url = (
        "https://data.caltechlibrary.dev"
        if test
        else "https://data.caltechlibrary.caltech.edu"
    )
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-type": "application/json",
    }

    # newline="" lets the csv module handle embedded line breaks itself.
    with open(csv_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            try:
                _link_caltechauthors_record(base_url, headers, row, timeout)
            except requests.RequestException as err:
                # A network failure on one record must not abort the batch.
                print(f"Request failed for record {row.get('Test_ID')}: {err}")

    print("All records processed.")