Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions sparcur_internal/bioluc_upload/bioluc_imagemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import math
import base64
import pathlib
import boto3 # sigh
import requests
from config import Config
import json

# NOTE(review): opened in append mode at import time, never closed, and
# apparently never written to in this module -- confirm before removing.
log_file = open('progress_log.txt', 'a')

# NOTE(review): this module-level list appears unused; main() defines a
# local of the same name -- confirm before removing.
bp_list = []


def get_biolucida_token():
    """Authenticate against the Biolucida API.

    Posts the configured username/password to the /authenticate endpoint.

    Returns:
        The access token string on success, or None when the request is
        rejected or the server reports a non-success status.
    """
    url_bl_auth = f"{Config.BIOLUCIDA_ENDPOINT}/authenticate"
    response = requests.post(url_bl_auth,
                             data=dict(
                                 username=Config.BIOLUCIDA_USERNAME,
                                 password=Config.BIOLUCIDA_PASSWORD,
                                 token=''),
                             timeout=30)  # fix: no timeout could hang forever
    if response.status_code == requests.codes.ok:
        content = response.json()
        if content['status'] == 'success':
            return content['token']
    return None

def map_id(item, token, dataset_id, discover_id):
    """Register one Biolucida-image -> Pennsieve-package mapping.

    Calls the Biolucida /imagemap/add endpoint linking item['img_id']
    (the Biolucida image) to item['package_id'] (the Pennsieve package)
    under the given dataset uuid and discover id.

    Returns:
        item, unchanged.
    """
    print(item, token)
    url_bl_imagemap = f"{Config.BIOLUCIDA_ENDPOINT}/imagemap/add"
    resp = requests.post(url_bl_imagemap,
                         data=dict(
                             imageId=item['img_id'],
                             sourceId=item['package_id'],
                             blackfynn_datasetId=dataset_id,
                             discover_datasetId=discover_id
                         ),
                         headers=dict(token=token),
                         timeout=30)  # fix: no timeout could hang forever
    print(resp)
    if resp.status_code == requests.codes.ok:
        content = resp.json()
        print(content)
    else:
        # fix: failures were silently dropped; surface them so reruns are possible
        print(f"imagemap/add failed for image {item['img_id']}: HTTP {resp.status_code}")

    return item


def main():
    """Map every successfully uploaded image listed in output_with_id.json.

    Requires DATASET_UUID and DISCOVER_ID to be configured; skips entries
    that did not upload successfully or have no Biolucida image id yet.
    """
    dataset_id = Config.DATASET_UUID
    discover_id = Config.DISCOVER_ID
    if not (dataset_id and discover_id):
        print("Missing dataset uuid or discover id or both.")
        return
    try:
        f = open('output_with_id.json', 'rb')
    except OSError:
        print("No input file")
        return
    with f:
        token = get_biolucida_token()
        if token is None:
            # fix: a failed login previously sent token=None to every request
            print("Biolucida authentication failed.")
            return
        data = json.load(f)
        for item in data:
            # only entries that uploaded successfully and have an image id
            if item['status'] == 'successful' and item.get('img_id'):
                map_id(item, token, dataset_id, discover_id)

if __name__ == "__main__":
    main()
16 changes: 16 additions & 0 deletions sparcur_internal/bioluc_upload/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os

class Config(object):
    """Runtime configuration sourced from environment variables.

    Every attribute falls back to a development default when the
    corresponding environment variable is unset.
    """

    _get = os.environ.get  # shorthand, resolved while the class body executes

    PENNSIEVE_API_HOST = _get("PENNSIEVE_API_HOST", "https://api.pennsieve.io")
    PENNSIEVE_API_SECRET = _get("PENNSIEVE_API_SECRET", "local-secret-key")
    PENNSIEVE_API_TOKEN = _get("PENNSIEVE_API_TOKEN", "local-api-key")
    # env var name intentionally differs from the attribute; no default -> None
    PENNSIEVE_ORGANIZATION_ID = _get("PENNSIEVE_ORGANIZATION")
    BIOLUCIDA_ENDPOINT = _get("BIOLUCIDA_ENDPOINT", "https://sparc.biolucida.net/api/v1")
    BIOLUCIDA_USERNAME = _get("BIOLUCIDA_USERNAME", "major-user")
    BIOLUCIDA_PASSWORD = _get("BIOLUCIDA_PASSWORD", "local-password")
    TEST_DATASET_ID = _get("TEST_DATASET_ID", "")
    TEST_PACKAGE_ID = _get("TEST_PACKAGE_ID", "")
    SPARC_API = _get("SPARC_API", "https://api.sparc.science/")
    DATASET_UUID = _get("DATASET_UUID", "")
    DISCOVER_ID = _get("DISCOVER_ID", "")
    COLLECTION_ID = _get("COLLECTION_ID", "")
69 changes: 69 additions & 0 deletions sparcur_internal/bioluc_upload/get_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import math
import base64
import pathlib
import boto3 # sigh
import requests
from config import Config
import json

# NOTE(review): opened in append mode at import time, never closed, and
# apparently never written to in this module -- confirm before removing.
log_file = open('progress_log.txt', 'a')

# NOTE(review): this module-level list appears unused; main() defines a
# local of the same name -- confirm before removing.
bp_list = []


def get_biolucida_token():
    """Authenticate against the Biolucida API.

    Posts the configured username/password to the /authenticate endpoint.

    Returns:
        The access token string on success, or None when the request is
        rejected or the server reports a non-success status.
    """
    url_bl_auth = f"{Config.BIOLUCIDA_ENDPOINT}/authenticate"
    response = requests.post(url_bl_auth,
                             data=dict(
                                 username=Config.BIOLUCIDA_USERNAME,
                                 password=Config.BIOLUCIDA_PASSWORD,
                                 token=''),
                             timeout=30)  # fix: no timeout could hang forever
    if response.status_code == requests.codes.ok:
        content = response.json()
        if content['status'] == 'success':
            return content['token']
    return None

def get_biolucida_id(item, token, collection_id):
    """Look up the Biolucida image id for one uploaded file.

    Queries /image/colandbasename with a collection id (the explicit
    collection_id argument, falling back to item['collection_id']) and
    the file's basename. On success, item is updated in place with
    'img_id' and 'collection_id'.

    Returns:
        item, possibly augmented with 'img_id' and 'collection_id'.
    """
    print("original data:", item, token)
    col_id = collection_id
    if not col_id:
        # assumes item already carries a 'collection_id' when none is
        # configured -- TODO confirm against the upload script's output
        col_id = item['collection_id']
    url_bl_colandbasename = f"{Config.BIOLUCIDA_ENDPOINT}/image/colandbasename"
    resp = requests.post(url_bl_colandbasename,
                         data=dict(
                             col_id=col_id,
                             basename=item['basename'],
                         ),
                         headers=dict(token=token),
                         timeout=30)  # fix: no timeout could hang forever
    print(resp)
    if resp.status_code == requests.codes.ok:
        content = resp.json()
        print(content)
        if content['status'] == 'success' and 'image_id' in content:
            item['img_id'] = content['image_id']
            item['collection_id'] = col_id
    return item


def main():
    """Collect Biolucida image ids for every successfully uploaded file.

    Reads input.json, queries Biolucida for each successful entry, and
    writes the augmented records to output_with_id.json.
    """
    collection_id = Config.COLLECTION_ID
    bp_list = []
    try:
        f = open('input.json', 'rb')
    except OSError:
        # fix: the old code fell through after this message and overwrote
        # output_with_id.json with an empty list
        print("No input file")
        return
    with f:
        token = get_biolucida_token()
        if token is None:
            # fix: a failed login previously sent token=None to every request
            print("Biolucida authentication failed.")
            return
        data = json.load(f)
        for item in data:
            if item['status'] == 'successful':
                bp_list.append(get_biolucida_id(item, token, collection_id))

    with open('output_with_id.json', 'w') as f:
        json.dump(bp_list, f)

if __name__ == "__main__":
    main()
41 changes: 41 additions & 0 deletions sparcur_internal/bioluc_upload/instructions.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
Before running the script, please make sure the following environment variables are ready:

PENNSIEVE_API_SECRET
PENNSIEVE_API_TOKEN
PENNSIEVE_ORGANIZATION
BIOLUCIDA_USERNAME
BIOLUCIDA_PASSWORD
DATASET_UUID

Note: The script does not create a new collection; you will need to move the images to a collection and then make the collection public if necessary, but please beware that moving an image to a new collection changes the image ids.
It may be possible to upload an image directly into a collection but I have not tried that yet.

First run the penn_bioluc.py script:
1. For the dataset of interest, get the dataset UUID and set it with the DATASET_UUID environment variable.

Steps 2 to 7 are detailed in the penn_bioluc.py script:

2. Using the dataset UUID, get metadata and path metadata information from SciCrunch? -
"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/curation-export.json" and "https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/path-metadata.json". Information required are pennsieve dataset id, published id, package id, filename and filesize.

3. Authenticate to Pennsieve API server with curator access. From this server, we get the s3 URL for downloading/streaming the file.

4. Authenticate to the Biolucida server and get the access token for further API calls

5. Initiate the Biolucida upload with /upload/init, passing in the filename, filesize, chunk_size and token as parameters

6. Request data of the file in chunks and send them to Biolucida using the /upload/continue endpoint

7. After the last chunk has been sent, finalise the Biolucida upload by calling /upload/finish

8. Timeout may occur and in that case, wait for the process to finish running and then copy and rename output.json to input.json and run the penn_bioluc.py script again.
Based on my experience, the script may need to be rerun multiple times.

9. Once the script runs successfully, copy and rename output.json to input.json then run get_id.py, a new file called output_with_id.json will be created.


Note: If the images have been moved to a collection, all the image ids will be changed in the new collection, in that case, please run get_id.py with the environment variable COLLECTION_ID set which will allow the image_ids to be fetched from the dataset.

10. The output_with_id.json should contain the basename, collection id, pennsieve package id and biolucida image id for each of the uploaded images. It may take some time for the biolucida server to process all the files and the biolucida image id may be missing for some entries in the json file, in that case please rerun step 9 again after a few hours.

11. Once all image ids have been collected, with the output_with_id.json file in the directory; set the environment variable DISCOVER_ID then run bioluc_imagemap.py to map the images on biolucida server.
Loading