Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions sparcur_internal/bioluc_upload/bioluc_imagemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import math
import base64
import pathlib
import boto3 # sigh
import requests
from config import Config
import json

# NOTE(review): opened in append mode at import time, never closed, and
# apparently never written to in this module -- confirm before removing.
log_file = open('progress_log.txt', 'a')

# NOTE(review): this module-level list appears unused; main() defines a
# local of the same name -- confirm before removing.
bp_list = []


def get_biolucida_token():
    """Authenticate against the Biolucida API.

    Posts the configured username/password to the /authenticate endpoint.

    Returns:
        The access token string on success, or None when the request is
        rejected or the server reports a non-success status.
    """
    url_bl_auth = f"{Config.BIOLUCIDA_ENDPOINT}/authenticate"
    response = requests.post(url_bl_auth,
                             data=dict(
                                 username=Config.BIOLUCIDA_USERNAME,
                                 password=Config.BIOLUCIDA_PASSWORD,
                                 token=''),
                             timeout=30)  # fix: no timeout could hang forever
    if response.status_code == requests.codes.ok:
        content = response.json()
        if content['status'] == 'success':
            return content['token']
    return None

def map_id(item, token, dataset_id, discover_id):
    """Register one Biolucida-image -> Pennsieve-package mapping.

    Calls the Biolucida /imagemap/add endpoint linking item['img_id']
    (the Biolucida image) to item['package_id'] (the Pennsieve package)
    under the given dataset uuid and discover id.

    Returns:
        item, unchanged.
    """
    print(item, token)
    url_bl_imagemap = f"{Config.BIOLUCIDA_ENDPOINT}/imagemap/add"
    resp = requests.post(url_bl_imagemap,
                         data=dict(
                             imageId=item['img_id'],
                             sourceId=item['package_id'],
                             blackfynn_datasetId=dataset_id,
                             discover_datasetId=discover_id
                         ),
                         headers=dict(token=token),
                         timeout=30)  # fix: no timeout could hang forever
    print(resp)
    if resp.status_code == requests.codes.ok:
        content = resp.json()
        print(content)
    else:
        # fix: failures were silently dropped; surface them so reruns are possible
        print(f"imagemap/add failed for image {item['img_id']}: HTTP {resp.status_code}")

    return item


def main():
    """Map every successfully uploaded image listed in output_with_id.json.

    Requires DATASET_UUID and DISCOVER_ID to be configured; skips entries
    that did not upload successfully or have no Biolucida image id yet.
    """
    dataset_id = Config.DATASET_UUID
    discover_id = Config.DISCOVER_ID
    if not (dataset_id and discover_id):
        print("Missing dataset uuid or discover id or both.")
        return
    try:
        f = open('output_with_id.json', 'rb')
    except OSError:
        print("No input file")
        return
    with f:
        token = get_biolucida_token()
        if token is None:
            # fix: a failed login previously sent token=None to every request
            print("Biolucida authentication failed.")
            return
        data = json.load(f)
        for item in data:
            # only entries that uploaded successfully and have an image id
            if item['status'] == 'successful' and item.get('img_id'):
                map_id(item, token, dataset_id, discover_id)

if __name__ == "__main__":
    main()
16 changes: 16 additions & 0 deletions sparcur_internal/bioluc_upload/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os

class Config(object):
    """Runtime configuration sourced from environment variables.

    Every attribute falls back to a development default when the
    corresponding environment variable is unset.
    """

    _get = os.environ.get  # shorthand, resolved while the class body executes

    PENNSIEVE_API_HOST = _get("PENNSIEVE_API_HOST", "https://api.pennsieve.io")
    PENNSIEVE_API_SECRET = _get("PENNSIEVE_API_SECRET", "local-secret-key")
    PENNSIEVE_API_TOKEN = _get("PENNSIEVE_API_TOKEN", "local-api-key")
    # env var name intentionally differs from the attribute; no default -> None
    PENNSIEVE_ORGANIZATION_ID = _get("PENNSIEVE_ORGANIZATION")
    BIOLUCIDA_ENDPOINT = _get("BIOLUCIDA_ENDPOINT", "https://sparc.biolucida.net/api/v1")
    BIOLUCIDA_USERNAME = _get("BIOLUCIDA_USERNAME", "major-user")
    BIOLUCIDA_PASSWORD = _get("BIOLUCIDA_PASSWORD", "local-password")
    TEST_DATASET_ID = _get("TEST_DATASET_ID", "")
    TEST_PACKAGE_ID = _get("TEST_PACKAGE_ID", "")
    SPARC_API = _get("SPARC_API", "https://api.sparc.science/")
    DATASET_UUID = _get("DATASET_UUID", "")
    DISCOVER_ID = _get("DISCOVER_ID", "")
    COLLECTION_ID = _get("COLLECTION_ID", "")
69 changes: 69 additions & 0 deletions sparcur_internal/bioluc_upload/get_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import math
import base64
import pathlib
import boto3 # sigh
import requests
from config import Config
import json

# NOTE(review): opened in append mode at import time, never closed, and
# apparently never written to in this module -- confirm before removing.
log_file = open('progress_log.txt', 'a')

# NOTE(review): this module-level list appears unused; main() defines a
# local of the same name -- confirm before removing.
bp_list = []


def get_biolucida_token():
    """Authenticate against the Biolucida API.

    Posts the configured username/password to the /authenticate endpoint.

    Returns:
        The access token string on success, or None when the request is
        rejected or the server reports a non-success status.
    """
    url_bl_auth = f"{Config.BIOLUCIDA_ENDPOINT}/authenticate"
    response = requests.post(url_bl_auth,
                             data=dict(
                                 username=Config.BIOLUCIDA_USERNAME,
                                 password=Config.BIOLUCIDA_PASSWORD,
                                 token=''),
                             timeout=30)  # fix: no timeout could hang forever
    if response.status_code == requests.codes.ok:
        content = response.json()
        if content['status'] == 'success':
            return content['token']
    return None

def get_biolucida_id(item, token, collection_id):
    """Look up the Biolucida image id for one uploaded file.

    Queries /image/colandbasename with a collection id (the explicit
    collection_id argument, falling back to item['collection_id']) and
    the file's basename. On success, item is updated in place with
    'img_id' and 'collection_id'.

    Returns:
        item, possibly augmented with 'img_id' and 'collection_id'.
    """
    print("original data:", item, token)
    col_id = collection_id
    if not col_id:
        # assumes item already carries a 'collection_id' when none is
        # configured -- TODO confirm against the upload script's output
        col_id = item['collection_id']
    url_bl_colandbasename = f"{Config.BIOLUCIDA_ENDPOINT}/image/colandbasename"
    resp = requests.post(url_bl_colandbasename,
                         data=dict(
                             col_id=col_id,
                             basename=item['basename'],
                         ),
                         headers=dict(token=token),
                         timeout=30)  # fix: no timeout could hang forever
    print(resp)
    if resp.status_code == requests.codes.ok:
        content = resp.json()
        print(content)
        if content['status'] == 'success' and 'image_id' in content:
            item['img_id'] = content['image_id']
            item['collection_id'] = col_id
    return item


def main():
    """Collect Biolucida image ids for every successfully uploaded file.

    Reads input.json, queries Biolucida for each successful entry, and
    writes the augmented records to output_with_id.json.
    """
    collection_id = Config.COLLECTION_ID
    bp_list = []
    try:
        f = open('input.json', 'rb')
    except OSError:
        # fix: the old code fell through after this message and overwrote
        # output_with_id.json with an empty list
        print("No input file")
        return
    with f:
        token = get_biolucida_token()
        if token is None:
            # fix: a failed login previously sent token=None to every request
            print("Biolucida authentication failed.")
            return
        data = json.load(f)
        for item in data:
            if item['status'] == 'successful':
                bp_list.append(get_biolucida_id(item, token, collection_id))

    with open('output_with_id.json', 'w') as f:
        json.dump(bp_list, f)

if __name__ == "__main__":
    main()
41 changes: 41 additions & 0 deletions sparcur_internal/bioluc_upload/instructions.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
Before running the script, please make sure the following environment variables are ready:

PENNSIEVE_API_SECRET
PENNSIEVE_API_TOKEN
PENNSIEVE_ORGANIZATION
BIOLUCIDA_USERNAME
BIOLUCIDA_PASSWORD
DATASET_UUID

Note: The script does not create a new collection; you will need to move the images to a collection and then make the collection public if necessary, but please beware that moving an image to a new collection changes the image ids.
It may be possible to upload an image directly into a collection but I have not tried that yet.

First run the penn_bioluc.py script:
1. For the dataset of interest, get the dataset UUID and set it with the DATASET_UUID environment variable.

Steps 2 to 7 are detailed in the penn_bioluc.py script:

2. Using the dataset UUID, get metadata and path metadata information from SciCrunch? -
"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/curation-export.json" and "https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/path-metadata.json". Information required are pennsieve dataset id, published id, package id, filename and filesize.

3. Authenticate to Pennsieve API server with curator access. From this server, we get the s3 URL for downloading/streaming the file.

4. Authenticate to the Biolucida server and get the access token for further API calls

5. Initiate the Biolucida upload with /upload/init, passing in the filename, filesize, chunk_size and token as parameters

6. Request data of the file in chunks and send them to Biolucida using the /upload/continue endpoint

7. After the last chunk has been sent, finalise the Biolucida upload by calling /upload/finish

8. Timeout may occur and in that case, wait for the process to finish running and then copy and rename output.json to input.json and run the penn_bioluc.py script again.
Based on my experience, the script may need to be rerun multiple times.

9. Once the script runs successfully, copy and rename output.json to input.json then run get_id.py, a new file called output_with_id.json will be created.


Note: If the images have been moved to a collection, all the image ids will be changed in the new collection, in that case, please run get_id.py with the environment variable COLLECTION_ID set which will allow the image_ids to be fetched from the dataset.

10. The output_with_id.json should contain the basename, collection id, pennsieve package id and biolucida image id for each of the uploaded images. It may take some time for the biolucida server to process all the files and the biolucida image id may be missing for some entries in the json file, in that case please rerun step 9 again after a few hours.

11. Once all image ids have been collected, with the output_with_id.json file in the directory; set the environment variable DISCOVER_ID then run bioluc_imagemap.py to map the images on biolucida server.
Loading