Skip to content

Commit

Permalink
Merge pull request #215 from SciCatProject/use-ds-datablock-endpoint
Browse files Browse the repository at this point in the history
Use datasets/pid/origdatablocks endpoint in upload
  • Loading branch information
jl-wynen authored May 24, 2024
2 parents b1dd441 + 045b0c9 commit 1b554e0
Show file tree
Hide file tree
Showing 10 changed files with 52 additions and 79 deletions.
20 changes: 14 additions & 6 deletions src/scitacean/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,8 @@ def upload_new_dataset_now(self, dataset: Dataset) -> Dataset:

with_new_pid = dataset.replace(_read_only={"pid": finalized_model.pid})
finalized_orig_datablocks = self._upload_orig_datablocks(
with_new_pid.make_datablock_upload_models().orig_datablocks
with_new_pid.make_datablock_upload_models().orig_datablocks,
with_new_pid.pid, # type: ignore[arg-type]
)
finalized_attachments = self._upload_attachments_for_dataset(
with_new_pid.make_attachment_upload_models(),
Expand Down Expand Up @@ -333,20 +334,22 @@ def upload_new_sample_now(self, sample: model.Sample) -> model.Sample:
return model.Sample.from_download_model(finalized_model)

def _upload_orig_datablocks(
self, orig_datablocks: list[model.UploadOrigDatablock] | None
self,
orig_datablocks: list[model.UploadOrigDatablock] | None,
dataset_id: PID,
) -> list[model.DownloadOrigDatablock]:
if not orig_datablocks:
return []

try:
return [
self.scicat.create_orig_datablock(orig_datablock)
self.scicat.create_orig_datablock(orig_datablock, dataset_id=dataset_id)
for orig_datablock in orig_datablocks
]
except ScicatCommError as exc:
raise RuntimeError(
"Failed to upload original datablocks for SciCat dataset "
f"{orig_datablocks[0].datasetId}:"
f"{dataset_id}:"
f"\n{exc.args}\nThe dataset and data files were successfully uploaded "
"but are not linked with each other. Please fix the dataset manually!"
) from exc
Expand Down Expand Up @@ -862,7 +865,10 @@ def create_dataset_model(
)

def create_orig_datablock(
self, dblock: model.UploadOrigDatablock
self,
dblock: model.UploadOrigDatablock,
*,
dataset_id: PID,
) -> model.DownloadOrigDatablock:
"""Create a new orig datablock in SciCat.
Expand All @@ -878,6 +884,8 @@ def create_orig_datablock(
----------
dblock:
Model of the orig datablock to create.
dataset_id:
PID of the dataset that this datablock belongs to.
Raises
------
Expand All @@ -887,7 +895,7 @@ def create_orig_datablock(
"""
uploaded = self._call_endpoint(
cmd="post",
url="origdatablocks",
url=f"datasets/{quote_plus(str(dataset_id))}/origdatablocks",
data=dblock,
operation="create_orig_datablock",
)
Expand Down
28 changes: 1 addition & 27 deletions src/scitacean/datablock.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,9 @@
import dataclasses
from collections.abc import Iterable, Iterator
from datetime import datetime
from typing import TYPE_CHECKING

from .file import File
from .model import DownloadOrigDatablock, UploadOrigDatablock
from .pid import PID

if TYPE_CHECKING:
from .dataset import Dataset

# TODO Datablock

Expand All @@ -32,13 +27,10 @@ class OrigDatablock:

_files: list[File] = dataclasses.field(init=False)
checksum_algorithm: str | None = None
instrument_group: str | None = None
owner_group: str | None = None
init_files: dataclasses.InitVar[Iterable[File] | None] = None
_access_groups: list[str] | None = None
_created_at: datetime | None = None
_created_by: str | None = None
_dataset_id: PID | None = None
_id: str | None = None
_is_published: bool | None = None
_updated_at: datetime | None = None
Expand Down Expand Up @@ -67,12 +59,9 @@ def from_download_model(
dblock = orig_datablock_model
return OrigDatablock(
checksum_algorithm=dblock.chkAlg,
owner_group=dblock.ownerGroup,
instrument_group=dblock.instrumentGroup,
_access_groups=dblock.accessGroups,
_created_at=dblock.createdAt,
_created_by=dblock.createdBy,
_dataset_id=orig_datablock_model.datasetId,
_id=orig_datablock_model.id,
_is_published=orig_datablock_model.isPublished,
_updated_at=dblock.updatedAt,
Expand Down Expand Up @@ -118,11 +107,6 @@ def updated_by(self) -> str | None:
"""User who last updated this datablock."""
return self._updated_by

@property
def dataset_id(self) -> PID | None:
"""PID of the dataset this datablock belongs to."""
return self._dataset_id

@property
def datablock_id(self) -> str | None:
"""ID of this datablock."""
Expand All @@ -146,26 +130,16 @@ def add_files(self, *files: File) -> None:
for f in files
)

def make_upload_model(self, dataset: Dataset) -> UploadOrigDatablock:
def make_upload_model(self) -> UploadOrigDatablock:
"""Build a new pydantic model to upload this datablock.
Parameters
----------
dataset:
The dataset that this orig datablock belongs to.
Returns
-------
:
A new model for this orig datablock.
"""
owner_group = self.owner_group or dataset.owner_group
return UploadOrigDatablock(
chkAlg=self.checksum_algorithm,
size=self.size,
dataFileList=[file.make_model(for_archive=False) for file in self.files],
datasetId=dataset.pid, # type: ignore[arg-type]
ownerGroup=owner_group,
accessGroups=self.access_groups or dataset.access_groups,
instrumentGroup=self.instrument_group or dataset.instrument_group,
)
4 changes: 2 additions & 2 deletions src/scitacean/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ def add_orig_datablock(self, *, checksum_algorithm: str | None) -> OrigDatablock
The newly added datablock.
"""
dblock = OrigDatablock(
checksum_algorithm=checksum_algorithm, _dataset_id=self.pid
checksum_algorithm=checksum_algorithm,
)
self._orig_datablocks.append(dblock)
return dblock
Expand Down Expand Up @@ -470,7 +470,7 @@ def make_datablock_upload_models(self) -> DatablockUploadModels:
return DatablockUploadModels(orig_datablocks=None)
return DatablockUploadModels(
orig_datablocks=[
dblock.make_upload_model(self) for dblock in self._orig_datablocks
dblock.make_upload_model() for dblock in self._orig_datablocks
]
)

Expand Down
4 changes: 0 additions & 4 deletions src/scitacean/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,12 +340,8 @@ def upload_model_type(cls) -> type[UploadOrigDatablock]:

class UploadOrigDatablock(BaseModel):
dataFileList: list[UploadDataFile]
datasetId: PID
size: NonNegativeInt
accessGroups: list[str] | None = None
chkAlg: str | None = None
instrumentGroup: str | None = None
ownerGroup: str | None = None

@classmethod
def download_model_type(cls) -> type[DownloadOrigDatablock]:
Expand Down
42 changes: 11 additions & 31 deletions src/scitacean/testing/backend/seed.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,9 @@
datasetName="My darkest magic yet",
description="Doing some dark shit",
isPublished=False,
numberOfFiles=2,
numberOfFilesArchived=0,
owner="PLACEHOLDER",
ownerEmail="PLACE@HOLD.ER",
size=619,
sourceFolder=RemotePath("/hex/data/123"),
type=DatasetType.RAW,
principalInvestigator="Ponder Stibbons",
Expand All @@ -68,11 +66,9 @@
datasetName="Reprocessed dark magic",
description="Making it even darker",
isPublished=True,
numberOfFiles=1,
numberOfFilesArchived=0,
owner="PLACEHOLDER",
ownerEmail="PLACE@HOLD.ER",
size=464,
sourceFolder=RemotePath("/hex/data/dd"),
type=DatasetType.DERIVED,
investigator="Ponder Stibbons",
Expand All @@ -92,11 +88,9 @@
datasetName="Shoe counter",
description="Got all these shoes!",
isPublished=True,
numberOfFiles=1,
numberOfFilesArchived=0,
owner="PLACEHOLDER",
ownerEmail="PLACE@HOLD.ER",
size=64,
sourceFolder=RemotePath("/hex/secret/stuff"),
type=DatasetType.RAW,
principalInvestigator="Mustrum Ridcully",
Expand Down Expand Up @@ -133,10 +127,7 @@
_ORIG_DATABLOCKS: dict[str, list[UploadOrigDatablock]] = {
"raw": [
UploadOrigDatablock(
datasetId=PID(pid="PLACEHOLDER"),
ownerGroup="PLACEHOLDER",
size=619,
accessGroups=["uu", "faculty"],
chkAlg="md5",
dataFileList=[
UploadDataFile(
Expand All @@ -162,10 +153,7 @@
],
"derived": [
UploadOrigDatablock(
datasetId=PID(pid="PLACEHOLDER"),
ownerGroup="PLACEHOLDER",
size=464,
accessGroups=["uu", "faculty"],
chkAlg="sha256",
dataFileList=[
UploadDataFile(
Expand All @@ -182,10 +170,7 @@
],
"public": [
UploadOrigDatablock(
datasetId=PID(pid="PLACEHOLDER"),
ownerGroup="PLACEHOLDER",
size=64,
accessGroups=["uu"],
chkAlg="md5",
dataFileList=[
UploadDataFile(
Expand Down Expand Up @@ -239,15 +224,6 @@ def _apply_config_dataset(
return dset


def _apply_config_orig_datablock(
dblock: UploadOrigDatablock, dset: DownloadDataset, user: SciCatUser
) -> UploadOrigDatablock:
dblock = deepcopy(dblock)
dblock.ownerGroup = user.group
dblock.datasetId = dset.pid # type: ignore[assignment]
return dblock


def _apply_config_attachment(
attachment: UploadAttachment, user: SciCatUser
) -> UploadAttachment:
Expand Down Expand Up @@ -282,20 +258,24 @@ def seed_database(*, client: Client, scicat_access: SciCatAccess) -> None:
}
INITIAL_DATASETS.update(download_datasets)

upload_orig_datablocks = {
download_orig_datablocks = {
key: [
_apply_config_orig_datablock(
dblock, download_datasets[key], scicat_access.user
client.scicat.create_orig_datablock(
dblock,
dataset_id=download_datasets[key].pid, # type: ignore[arg-type]
)
for dblock in dblocks
]
for key, dblocks in _ORIG_DATABLOCKS.items()
}
download_orig_datablocks = {
key: [client.scicat.create_orig_datablock(dblock) for dblock in dblocks]
for key, dblocks in upload_orig_datablocks.items()
}
INITIAL_ORIG_DATABLOCKS.update(download_orig_datablocks)
for key, dblocks in INITIAL_ORIG_DATABLOCKS.items():
# Need to set these after uploading the datablocks to
# make sure that the database has the correct values.
INITIAL_DATASETS[key].numberOfFiles = sum(
len(dblock.dataFileList or ()) for dblock in dblocks
)
INITIAL_DATASETS[key].size = sum(dblock.size or 0 for dblock in dblocks)

upload_attachments = {
key: [
Expand Down
4 changes: 2 additions & 2 deletions src/scitacean/testing/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,9 @@ def create_dataset_model(

@_conditionally_disabled
def create_orig_datablock(
self, dblock: model.UploadOrigDatablock
self, dblock: model.UploadOrigDatablock, *, dataset_id: PID
) -> model.DownloadOrigDatablock:
"""Create a new orig datablock in SciCat."""
dataset_id = dblock.datasetId
if (dset := self.main.datasets.get(dataset_id)) is None:
raise ScicatCommError(f"No dataset with id {dataset_id}")
ingested = _process_orig_datablock(dblock, dset)
Expand Down Expand Up @@ -357,6 +356,7 @@ def _process_orig_datablock(
createdAt=created_at,
updatedBy="fake",
updatedAt=created_at,
datasetId=dset.pid,
**fields,
)
return processed
Expand Down
5 changes: 1 addition & 4 deletions tests/client/datablock_client_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ def orig_datablock(scicat_access):
path="data.nxs", size=9235, time=parse_date("2023-08-18T13:52:33.000Z")
)
],
datasetId="PLACEHOLDER",
ownerGroup=scicat_access.user.group,
)


Expand All @@ -64,8 +62,7 @@ def test_get_orig_datablock(scicat_client, key):

def test_create_first_orig_datablock(scicat_client, derived_dataset, orig_datablock):
uploaded = scicat_client.create_dataset_model(derived_dataset)
orig_datablock.datasetId = uploaded.pid
scicat_client.create_orig_datablock(orig_datablock)
scicat_client.create_orig_datablock(orig_datablock, dataset_id=uploaded.pid)
downloaded = scicat_client.get_orig_datablocks(uploaded.pid)
assert len(downloaded) == 1
downloaded = downloaded[0]
Expand Down
21 changes: 20 additions & 1 deletion tests/client/dataset_client_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pytest
from dateutil.parser import parse as parse_date

from scitacean import PID, Client, RemotePath, ScicatCommError
from scitacean import PID, Client, Dataset, RemotePath, ScicatCommError
from scitacean.client import ScicatClient
from scitacean.model import (
DatasetType,
Expand Down Expand Up @@ -141,3 +141,22 @@ def test_get_broken_dataset_strict_validation(real_client, require_scicat_backen
dset = INITIAL_DATASETS["partially-broken"]
with pytest.raises(pydantic.ValidationError):
real_client.get_dataset(dset.pid, strict_validation=True)


def test_dataset_with_orig_datablock_roundtrip(client):
ds = Dataset.from_download_models(
INITIAL_DATASETS["raw"], INITIAL_ORIG_DATABLOCKS["raw"], []
).as_new()
# Unset fields that a raw dataset should not have but where initialized
# in the download model.
ds.input_datasets = None
ds.used_software = None

# We don't need a file transfer because all files are on remote.
finalized = client.upload_new_dataset_now(ds)
downloaded = client.get_dataset(finalized.pid)

assert downloaded.name == ds.name
assert downloaded.owner == ds.owner
assert downloaded.size == ds.size
assert downloaded.number_of_files == ds.number_of_files
1 change: 0 additions & 1 deletion tests/dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,6 @@ def test_make_scicat_models_datablock_with_one_file(dataset):

block = blocks[0]
assert block.size == 6163
assert block.datasetId == dataset.pid
assert block.dataFileList == [model.UploadDataFile(**file_model.model_dump())]


Expand Down
2 changes: 1 addition & 1 deletion tools/model-generation/spec/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def user_dset_fields(self, manual: bool | None = None) -> list[DatasetField]:

_SCHEMA_GROUPS = {
"Attachment": ("CreateAttachmentDto", "Attachment"),
"OrigDatablock": ("CreateOrigDatablockDto", "OrigDatablock"),
"OrigDatablock": ("CreateDatasetOrigDatablockDto", "OrigDatablock"),
"Datablock": ("CreateDatasetDatablockDto", "Datablock"),
"Lifecycle": (None, "LifecycleClass"),
"Technique": ("TechniqueClass", "TechniqueClass"),
Expand Down

0 comments on commit 1b554e0

Please sign in to comment.