Commit 8dad464

Merge pull request galaxyproject#16955 from mvdbeek/unify_extra_files_path_persistence
Move and re-use persist_extra_files
mvdbeek authored Nov 1, 2023
2 parents af386d2 + dbca442 commit 8dad464
Showing 4 changed files with 47 additions and 55 deletions.
lib/galaxy/job_execution/output_collect.py (21 changes: 9 additions & 12 deletions)
@@ -16,6 +16,7 @@
 from sqlalchemy.orm.scoping import ScopedSession
 
 from galaxy.model import (
+    DatasetInstance,
     HistoryDatasetAssociation,
     HistoryDatasetCollectionAssociation,
     Job,
@@ -40,7 +41,10 @@
     SessionlessModelPersistenceContext,
     UNSET,
 )
-from galaxy.objectstore import ObjectStore
+from galaxy.objectstore import (
+    ObjectStore,
+    persist_extra_files,
+)
 from galaxy.tool_util.parser.output_collection_def import (
     DEFAULT_DATASET_COLLECTOR_DESCRIPTION,
     INPUT_DBKEY_TOKEN,
@@ -723,9 +727,11 @@ def default_exit_code_file(files_dir, id_tag):
     return os.path.join(files_dir, f"galaxy_{id_tag}.ec")
 
 
-def collect_extra_files(object_store, dataset, job_working_directory):
+def collect_extra_files(object_store: ObjectStore, dataset: "DatasetInstance", job_working_directory: str) -> None:
     # TODO: should this use compute_environment to determine the extra files path ?
+    assert dataset.dataset
     file_name = dataset.dataset.extra_files_path_name_from(object_store)
+    assert file_name
     output_location = "outputs"
     temp_file_path = os.path.join(job_working_directory, output_location, file_name)
     if not os.path.exists(temp_file_path):
@@ -740,16 +746,7 @@ def collect_extra_files(object_store, dataset, job_working_directory):
         # automatically creates them. However, empty directories will
         # not be created in the object store at all, which might be a
         # problem.
-        for root, _dirs, files in os.walk(temp_file_path):
-            for f in files:
-                object_store.update_from_file(
-                    dataset.dataset,
-                    extra_dir=os.path.normpath(os.path.join(file_name, os.path.relpath(root, temp_file_path))),
-                    alt_name=f,
-                    file_name=os.path.join(root, f),
-                    create=True,
-                    preserve_symlinks=True,
-                )
+        persist_extra_files(object_store=object_store, src_extra_files_path=temp_file_path, primary_data=dataset)
     except Exception as e:
         log.debug("Error in collect_associated_files: %s", unicodify(e))
 
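For orientation, the path arithmetic the replaced loop performed, and which persist_extra_files reproduces, can be captured in a standalone sketch. The helper name and the dataset_42_files directory name below are illustrative assumptions, not part of this change:

```python
import os


def expected_object_store_entries(job_working_directory: str, file_name: str):
    """Mirror the relpath/normpath arithmetic above: yield (extra_dir, alt_name)
    pairs for every file staged under <job_working_directory>/outputs/<file_name>."""
    temp_file_path = os.path.join(job_working_directory, "outputs", file_name)
    for root, _dirs, files in os.walk(temp_file_path):
        # "." relpath at the top level normalizes away, leaving just file_name
        extra_dir = os.path.normpath(os.path.join(file_name, os.path.relpath(root, temp_file_path)))
        for f in files:
            yield extra_dir, f


# e.g. outputs/dataset_42_files/sub/report.html -> ("dataset_42_files/sub", "report.html")
```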
lib/galaxy/model/store/__init__.py (24 changes: 3 additions & 21 deletions)
@@ -68,6 +68,7 @@
 from galaxy.objectstore import (
     BaseObjectStore,
     ObjectStore,
+    persist_extra_files,
 )
 from galaxy.schema.bco import (
     BioComputeObjectCore,
@@ -101,10 +102,7 @@
 )
 from galaxy.util.bunch import Bunch
 from galaxy.util.compression_utils import CompressedFile
-from galaxy.util.path import (
-    safe_walk,
-    StrPath,
-)
+from galaxy.util.path import StrPath
 from ._bco_convert_utils import (
     bco_workflow_version,
     SoftwarePrerequisiteTracker,
@@ -639,24 +637,8 @@ def handle_dataset_object_edit(dataset_instance, dataset_attrs):
                 dataset_extra_files_path = dataset_attrs.get("extra_files_path", None)
                 if dataset_extra_files_path:
                     assert file_source_root
-                    dir_name = dataset_instance.dataset.extra_files_path_name
                     dataset_extra_files_path = os.path.join(file_source_root, dataset_extra_files_path)
-                    for root, _dirs, files in safe_walk(dataset_extra_files_path):
-                        extra_dir = os.path.join(
-                            dir_name, root.replace(dataset_extra_files_path, "", 1).lstrip(os.path.sep)
-                        )
-                        extra_dir = os.path.normpath(extra_dir)
-                        for extra_file in files:
-                            source = os.path.join(root, extra_file)
-                            if not in_directory(source, file_source_root):
-                                raise MalformedContents(f"Invalid dataset path: {source}")
-                            self.object_store.update_from_file(
-                                dataset_instance.dataset,
-                                extra_dir=extra_dir,
-                                alt_name=extra_file,
-                                file_name=source,
-                                create=True,
-                            )
+                    persist_extra_files(self.object_store, dataset_extra_files_path, dataset_instance)
                 # Don't trust serialized file size
                 dataset_instance.dataset.file_size = None
                 dataset_instance.dataset.set_total_size()  # update the filesize record in the database
lib/galaxy/model/store/discover.py (27 changes: 5 additions & 22 deletions)
@@ -26,7 +26,10 @@
 from galaxy.exceptions import RequestParameterInvalidException
 from galaxy.model.dataset_collections import builder
 from galaxy.model.tags import GalaxySessionlessTagHandler
-from galaxy.objectstore import ObjectStore
+from galaxy.objectstore import (
+    ObjectStore,
+    persist_extra_files,
+)
 from galaxy.util import (
     chunk_iterable,
     ExecutionTimer,
@@ -409,7 +412,7 @@ def update_object_store_with_datasets(self, datasets, paths, extra_files, output
 
                 self.object_store.update_from_file(dataset.dataset, file_name=path, create=True)
                 if extra_file:
-                    persist_extra_files(self.object_store, extra_files, dataset)
+                    persist_extra_files(self.object_store, extra_file, dataset)
                 dataset.set_size()
             else:
                 dataset.set_size(no_extra_files=True)
@@ -667,26 +670,6 @@ def get_implicit_collection_jobs_association_id(self):
         """No-op, no job context."""
 
 
-def persist_extra_files(object_store, src_extra_files_path, primary_data):
-    if src_extra_files_path and os.path.exists(src_extra_files_path):
-        primary_data.dataset.create_extra_files_path()
-        target_extra_files_path = primary_data.extra_files_path
-        for root, _dirs, files in os.walk(src_extra_files_path):
-            extra_dir = os.path.join(
-                target_extra_files_path, root.replace(src_extra_files_path, "", 1).lstrip(os.path.sep)
-            )
-            extra_dir = os.path.normpath(extra_dir)
-            for f in files:
-                object_store.update_from_file(
-                    primary_data.dataset,
-                    extra_dir=extra_dir,
-                    alt_name=f,
-                    file_name=os.path.join(root, f),
-                    create=True,
-                    preserve_symlinks=True,
-                )
-
-
 def persist_target_to_export_store(target_dict, export_store, object_store, work_directory):
     replace_request_syntax_sugar(target_dict)
     model_persistence_context = SessionlessModelPersistenceContext(object_store, export_store, work_directory)
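One behavioral fix rides along in the update_object_store_with_datasets hunk above: the call now passes the per-dataset extra_file rather than the whole extra_files argument. A toy sketch of the assumed pairing (values here are hypothetical, chosen only to show why the singular form is the right one):

```python
# Hypothetical parallel inputs; the diff's signature suggests datasets, paths,
# and extra_files line up element-for-element.
datasets = ["hda_1", "hda_2"]
paths = ["/job/outputs/out1.dat", "/job/outputs/out2.dat"]
extra_files = ["/job/outputs/dataset_1_files", None]

for dataset, path, extra_file in zip(datasets, paths, extra_files):
    if extra_file:
        # The old call passed `extra_files` (the whole list) here; the fix passes
        # this dataset's own directory: persist_extra_files(object_store, extra_file, dataset)
        print(f"{dataset}: persist extras from {extra_file}")
```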
lib/galaxy/objectstore/__init__.py (30 changes: 30 additions & 0 deletions)
@@ -20,26 +20,30 @@
     Optional,
     Tuple,
     Type,
+    TYPE_CHECKING,
 )
 
 import yaml
 from pydantic import BaseModel
 
 from galaxy.exceptions import (
+    MalformedContents,
     ObjectInvalid,
     ObjectNotFound,
 )
 from galaxy.util import (
     asbool,
     directory_hash_id,
     force_symlink,
+    in_directory,
     parse_xml,
     umask_fix_perms,
 )
 from galaxy.util.bunch import Bunch
 from galaxy.util.path import (
     safe_makedirs,
     safe_relpath,
+    safe_walk,
 )
 from galaxy.util.sleeper import Sleeper
 from .badges import (
@@ -50,6 +54,9 @@
 )
 from .caching import CacheTarget
 
+if TYPE_CHECKING:
+    from galaxy.model import DatasetInstance
+
 NO_SESSION_ERROR_MESSAGE = (
     "Attempted to 'create' object store entity in configuration with no database session present."
 )
@@ -217,6 +224,7 @@ def update_from_file(
         obj_dir=False,
         file_name=None,
         create=False,
+        preserve_symlinks=False,
     ):
         """
         Inform the store that the file associated with `obj.id` has been updated.
@@ -1577,3 +1585,25 @@ def set_dataset_object_store_id(self, dataset, require_shareable=True):
         except ObjectInvalid:
             raise Exception("Unable to create output dataset: object store is full")
         self.object_store_id = dataset.object_store_id  # these will be the same thing after the first output
+
+
+def persist_extra_files(object_store: ObjectStore, src_extra_files_path: str, primary_data: "DatasetInstance") -> None:
+    if os.path.exists(src_extra_files_path):
+        assert primary_data.dataset
+        extra_files_path_name = primary_data.dataset.extra_files_path_name_from(object_store)
+        assert extra_files_path_name
+        for root, _dirs, files in safe_walk(src_extra_files_path):
+            extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path))
+            extra_dir = os.path.normpath(extra_dir)
+            for f in files:
+                if not in_directory(f, src_extra_files_path):
+                    # Unclear if this can ever happen if we use safe_walk ... probably not ?
+                    raise MalformedContents(f"Invalid dataset path: {f}")
+                object_store.update_from_file(
+                    primary_data.dataset,
+                    extra_dir=extra_dir,
+                    alt_name=f,
+                    file_name=os.path.join(root, f),
+                    create=True,
+                    preserve_symlinks=True,
+                )
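Taken together, the three call sites now funnel through the single helper defined above. A minimal sketch of a caller, assuming an object store and a dataset instance obtained from an existing Galaxy context (stage_dataset_extras is an illustrative name, not part of the diff):

```python
from galaxy.objectstore import persist_extra_files


def stage_dataset_extras(object_store, dataset_instance, staged_dir: str) -> None:
    """Register every file under staged_dir as extra files of the dataset.
    persist_extra_files is a no-op when staged_dir does not exist."""
    persist_extra_files(object_store, staged_dir, dataset_instance)
    dataset_instance.set_size()  # refresh the recorded size, as the discover.py call site does
```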