caltechlibrary · tmorrell · Apr 14, 2025 · Mar 31, 2025 · Mar 31, 2025 · Apr 14, 2025
diff --git a/ames/harvesters/__init__.py b/ames/harvesters/__init__.py
@@ -22,4 +22,5 @@
 from .caltechauthors import clean_link
 from .caltechauthors import extract_filename_from_link
 from .caltechauthors import is_file_present
-from .caltechauthors import get_series_records
+from .caltechauthors import get_series_records
+from .caltechauthors import generate_data_citation_csv
diff --git a/ames/harvesters/caltechauthors.py b/ames/harvesters/caltechauthors.py
@@ -312,3 +312,121 @@ def get_records_from_date(date="2023-08-25", test=False):
         hits += response["hits"]["hits"]
 
     return hits
+
+def doi2url(doi, timeout=30):
+    """Resolve a DOI to the URL registered for it in the doi.org handle API.
+
+    Args:
+        doi: A DOI string. Anything that does not start with ``"10."`` is
+            returned unchanged without any network request.
+        timeout: Seconds to wait for each HTTP request.
+
+    Returns:
+        The first ``URL``-typed value found in the handle record. When that
+        URL is a ``data.caltech.edu/records/<digits>`` link, the redirects are
+        followed and the final URL is returned instead. If the handle lookup
+        fails, is malformed, or has no URL value, the input ``doi`` is
+        returned unchanged.
+    """
+    if not doi.startswith("10."):
+        return doi
+
+    try:
+        resp = requests.get(
+            f"https://doi.org/api/handles/{doi}",
+            allow_redirects=True,
+            timeout=timeout,
+        )
+        if resp.status_code != 200:
+            return doi
+        values = resp.json().get("values", [])
+    except (requests.RequestException, ValueError):
+        # Network failure or a non-JSON body: behave like any other failed
+        # lookup and hand the DOI back to the caller.
+        return doi
+
+    for value in values:
+        if value.get("type") != "URL":
+            continue
+        data = value.get("data")
+        resolved_url = data.get("value") if isinstance(data, dict) else None
+        if not resolved_url:
+            continue
+        if "data.caltech.edu/records/" in resolved_url:
+            caltechdata_id = resolved_url.split("/records/")[-1]
+            # NOTE(review): all-digit ids are presumably legacy record numbers
+            # that redirect to the current record id - confirm.
+            if caltechdata_id.isdigit():
+                try:
+                    final_resp = requests.get(
+                        resolved_url, allow_redirects=True, timeout=timeout
+                    )
+                    resolved_url = final_resp.url
+                except requests.RequestException:
+                    # Keep the URL registered with the handle if the redirect
+                    # cannot be followed.
+                    pass
+        return resolved_url
+    return doi
+
+def fetch_metadata(record_id, timeout=30):
+    """Fetch the JSON for one CaltechAUTHORS record.
+
+    Args:
+        record_id: Record identifier appended to
+            ``https://authors.library.caltech.edu/api/records/``.
+        timeout: Seconds to wait for the HTTP request.
+
+    Returns:
+        The decoded JSON body, or ``None`` when the request fails, the server
+        answers with an error status, or the body is not valid JSON.
+    """
+    url = f"https://authors.library.caltech.edu/api/records/{record_id}"
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        return response.json()
+    except (requests.RequestException, ValueError):
+        # Only swallow HTTP/JSON failures; a bare ``except`` would also hide
+        # KeyboardInterrupt and genuine programming errors.
+        return None
+
+def search_resource_type(obj):
+    """Depth-first search of nested dicts/lists for a resource type id.
+
+    A dict entry whose key is ``'resource_type'`` and whose value is a dict
+    containing ``'id'`` ends the search immediately with that id. Otherwise
+    each value (or list item) is searched in order and the first truthy
+    result is returned. Returns ``None`` when nothing is found or ``obj`` is
+    neither a dict nor a list.
+    """
+    if isinstance(obj, dict):
+        for key, value in obj.items():
+            is_match = (
+                key == 'resource_type'
+                and isinstance(value, dict)
+                and 'id' in value
+            )
+            if is_match:
+                return value['id']
+            nested = search_resource_type(value)
+            if nested:
+                return nested
+        return None
+    if isinstance(obj, list):
+        # Lazily evaluate each item and stop at the first truthy hit.
+        return next(filter(None, map(search_resource_type, obj)), None)
+    return None
+
+def fetch_resource_type(data):
+    """Return the resource type id found in ``data``, or ``'N/A'`` if none.
+
+    Delegates the lookup to ``search_resource_type``; any falsy result is
+    replaced by the ``'N/A'`` placeholder.
+    """
+    found = search_resource_type(data)
+    return found if found else 'N/A'
+
+def search_records(prefix):
+    """Search CaltechAUTHORS for records with a related identifier under ``prefix``.
+
+    The query is a range over ``metadata.related_identifiers.identifier``
+    from ``"<prefix>/0"`` to ``"<prefix>/z"`` and asks for a single page of
+    up to 1000 results (``size=1000``).
+
+    Returns:
+        The decoded JSON response on HTTP 200, otherwise ``None``.
+    """
+    endpoint = "https://authors.library.caltech.edu/api/records"
+    response = requests.get(
+        endpoint
+        + f'?q=metadata.related_identifiers.identifier:["{prefix}/0" TO "{prefix}/z"]&size=1000'
+    )
+    if response.status_code != 200:
+        return None
+    return response.json()
+
+def extract_data_citations(hits):
+    """Build citation rows linking CaltechAUTHORS records to CaltechDATA records.
+
+    For every hit, the full CaltechAUTHORS record is fetched and its related
+    DOI identifiers starting with ``10.22002/``, ``10.14291/`` or
+    ``10.25989/`` are resolved with ``doi2url``. Each DOI that resolves to a
+    ``data.caltech.edu/records/`` URL produces one row; the row records
+    whether the CaltechDATA record already lists the CaltechAUTHORS DOI among
+    its own related identifiers.
+
+    Args:
+        hits: Iterable of search hits; each must have an ``"id"`` key.
+
+    Returns:
+        A list of dicts with the keys ``CaltechAUTHORS_ID``,
+        ``CaltechAUTHORS_DOI``, ``Related_DOI``, ``CaltechDATA_ID``,
+        ``Cross_Link`` (``"Yes"``/``"No"``) and ``resource_type``. Hits whose
+        metadata cannot be fetched, and DOIs whose CaltechDATA record cannot
+        be fetched, are skipped.
+    """
+    # str.startswith accepts a tuple, so the prefixes are built only once.
+    data_prefixes = ("10.22002/", "10.14291/", "10.25989/")
+    citations = []
+
+    for hit in hits:
+        record_id = hit["id"]
+        metadata = fetch_metadata(record_id)
+        if not metadata:
+            continue
+
+        caltechauthors_doi = (
+            metadata.get("pids", {}).get("doi", {}).get("identifier", "")
+        )
+        resource_type = fetch_resource_type(metadata)
+
+        related_dois = []
+        for identifier in metadata.get("metadata", {}).get("related_identifiers", []):
+            if identifier.get("scheme") != "doi":
+                continue
+            # Tolerate entries with a missing or null identifier value.
+            doi = identifier.get("identifier") or ""
+            if doi.startswith(data_prefixes):
+                related_dois.append(doi)
+
+        for doi in related_dois:
+            caltechdata_url = doi2url(doi)
+            if "data.caltech.edu/records/" not in caltechdata_url:
+                continue
+            caltechdata_id = caltechdata_url.split("/records/")[-1]
+
+            try:
+                response = requests.get(
+                    f"https://data.caltech.edu/api/records/{caltechdata_id}"
+                )
+                response.raise_for_status()
+                caltechdata_metadata = response.json()
+            except (requests.RequestException, ValueError) as err:
+                # Without the record we cannot tell whether a cross link
+                # exists; reporting "No" would be misleading, so skip the row.
+                print(f"Could not fetch CaltechDATA record {caltechdata_id}: {err}")
+                continue
+
+            # DOI names are case-insensitive, so compare them case-folded.
+            wanted = caltechauthors_doi.lower()
+            linked = bool(wanted) and any(
+                (related.get("identifier") or "").lower() == wanted
+                for related in caltechdata_metadata.get("metadata", {}).get(
+                    "related_identifiers", []
+                )
+            )
+
+            citations.append({
+                "CaltechAUTHORS_ID": record_id,
+                "CaltechAUTHORS_DOI": caltechauthors_doi,
+                "Related_DOI": doi,
+                "CaltechDATA_ID": caltechdata_id,
+                "Cross_Link": "Yes" if linked else "No",
+                "resource_type": resource_type,
+            })
+    return citations
+
+def generate_data_citation_csv():
+    """Write ``data_citations_with_type.csv`` listing data citations.
+
+    Searches CaltechAUTHORS once per DOI prefix (``10.22002``, ``10.14291``,
+    ``10.25989``), extracts the data citations from the matching records and
+    writes them, with a header row, to ``data_citations_with_type.csv`` in
+    the current working directory. A summary line is printed at the end.
+    """
+    # Imported locally so the function does not depend on a module-level
+    # ``import csv`` being present.
+    import csv
+
+    prefixes = ["10.22002", "10.14291", "10.25989"]
+    fieldnames = [
+        "CaltechAUTHORS_ID",
+        "CaltechAUTHORS_DOI",
+        "Related_DOI",
+        "CaltechDATA_ID",
+        "Cross_Link",
+        "resource_type",
+    ]
+
+    # A record with related DOIs under several prefixes is returned by several
+    # searches, and extract_data_citations already reports every prefix for a
+    # record, so keep each record only once to avoid duplicate rows.
+    unique_hits = {}
+    for prefix in prefixes:
+        results = search_records(prefix)
+        if not results or "hits" not in results:
+            continue
+        for hit in results["hits"].get("hits", []):
+            unique_hits.setdefault(hit["id"], hit)
+
+    all_citations = extract_data_citations(list(unique_hits.values()))
+
+    output_file = "data_citations_with_type.csv"
+    with open(output_file, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
+        writer.writeheader()
+        writer.writerows(all_citations)
+
+    print(f"Saved {len(all_citations)} citations to {output_file}")
diff --git a/ames/matchers/__init__.py b/ames/matchers/__init__.py
@@ -22,3 +22,4 @@
 from .caltechauthors import get_record_metadata
 from .caltechauthors import update_related_identifiers
 from .caltechauthors import save_metadata_to_file
+from .caltechauthors import add_related_identifiers_from_csv
diff --git a/ames/matchers/caltechauthors.py b/ames/matchers/caltechauthors.py
@@ -312,3 +312,90 @@ def move_doi(record, token, test=False):
             publish=True,
             authors=True,
         )
+
+
+def add_related_identifiers_from_csv(csv_path, test=False, token=None):
+    """Add CaltechAUTHORS related identifiers to CaltechDATA records from a CSV.
+
+    Each CSV row must provide the columns ``Test_ID`` (the CaltechDATA record
+    to edit), ``CaltechAUTHORS_DOI``, ``CaltechAUTHORS_ID`` and
+    ``resource_type``. For every row the record's draft is loaded (or
+    created), two related identifiers are appended if they are not already
+    present, and the draft is saved and published:
+
+    * the CaltechAUTHORS DOI with relation ``issupplementedby``;
+    * the CaltechAUTHORS record URL with relation ``isreferencedby``.
+
+    Rows that fail at any HTTP step are reported with ``print`` and skipped.
+
+    Args:
+        csv_path: Path of the CSV file to read.
+        test: When true, use ``https://data.caltechlibrary.dev`` instead of
+            the production host.
+        token: API bearer token. If omitted, a module-level ``token`` is used
+            when one exists.
+
+    Raises:
+        ValueError: If no token is available.
+    """
+    # Imported locally so the function does not depend on a module-level
+    # ``import csv`` being present.
+    import csv
+
+    if token is None:
+        # The previous body read a bare ``token`` name that was never passed
+        # in; honour a module-level value so existing setups keep working.
+        token = globals().get("token")
+    if not token:
+        raise ValueError(
+            "An API token is required: pass token=... to add_related_identifiers_from_csv"
+        )
+
+    # Production host matches the data.caltech.edu API used by the harvester.
+    base_url = "https://data.caltechlibrary.dev" if test else "https://data.caltech.edu"
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "Content-type": "application/json",
+    }
+
+    with open(csv_path, "r", newline="") as csvfile:
+        for row in csv.DictReader(csvfile):
+            record_id = row['Test_ID']
+            doi = row['CaltechAUTHORS_DOI']
+            caltech_author_id = row['CaltechAUTHORS_ID']
+            resource_type = row['resource_type']
+            record_url = f"{base_url}/api/records/{record_id}"
+
+            print(f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}")
+            print(f"Using resource_type: {resource_type}")
+
+            # Confirm the published record exists before touching drafts.
+            response = requests.get(record_url, headers=headers)
+            if response.status_code != 200:
+                print(f"Error fetching record {record_id}: {response.status_code}")
+                continue
+
+            # Reuse an open draft if there is one, otherwise create it.
+            draft_response = requests.get(f"{record_url}/draft", headers=headers)
+            if draft_response.status_code == 200:
+                record_data = draft_response.json()
+            else:
+                draft_create_response = requests.post(f"{record_url}/draft", headers=headers)
+                if draft_create_response.status_code != 201:
+                    print(f"Error creating draft: {draft_create_response.status_code}")
+                    continue
+                record_data = draft_create_response.json()
+
+            metadata = record_data.setdefault("metadata", {})
+            related_identifiers = metadata.get("related_identifiers") or []
+            author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}"
+
+            additions = (
+                ("DOI", doi, "doi", "issupplementedby"),
+                ("CaltechAUTHORS link", author_url, "url", "isreferencedby"),
+            )
+            for label, identifier, scheme, relation in additions:
+                if any(ri.get("identifier") == identifier for ri in related_identifiers):
+                    print(f"{label} already exists")
+                    continue
+                related_identifiers.append({
+                    "relation_type": {"id": relation},
+                    "identifier": identifier,
+                    "scheme": scheme,
+                    "resource_type": {"id": resource_type},
+                })
+                print(f"Adding {label}: {identifier}")
+
+            metadata["related_identifiers"] = related_identifiers
+
+            update_response = requests.put(
+                f"{record_url}/draft", headers=headers, json=record_data
+            )
+            if update_response.status_code != 200:
+                print(f"Error updating draft: {update_response.status_code}")
+                continue
+
+            publish_response = requests.post(
+                f"{record_url}/draft/actions/publish", headers=headers
+            )
+            if publish_response.status_code != 202:
+                print(f"Error publishing record {record_id}: {publish_response.status_code}")
+                continue
+
+            print(f"Successfully updated and published {record_id}")
+
+    print("All records processed.")