Skip to content

ENH: Attempt to use hard links for data sink. #1161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 5, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
Next release
============

* ENH: Attempt to use hard links for data sink.
(https://github.com/nipy/nipype/pull/1161)
* FIX: Updates to SGE Plugins
(https://github.com/nipy/nipype/pull/1129)
* ENH: Add ants JointFusion() node with testing
(https://github.com/nipy/nipype/pull/1160)
* ENH: Add --float option for antsRegistration calls
(https://github.com/nipy/nipype/pull/1159)
* ENH: Added interface to simulate DWIs using the multi-tensor model
(https://github.com/nipy/nipype/pull/1085)
* ENH: New interface for FSL fslcpgeom utility (https://github.com/nipy/nipype/pull/1152)
Expand Down
11 changes: 11 additions & 0 deletions doc/users/config_file.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,17 @@ Execution
other nodes) will never be deleted independent of this parameter. (possible
values: ``true`` and ``false``; default value: ``true``)

*try_hard_link_datasink*
When the DataSink is used to produce an organized output file outside
of nipype's internal cache structure, a file system hard link will be
attempted first. A hard link allows multiple file paths to point to the
same physical storage location on disk if the conditions allow. By
referring to the same physical file on disk (instead of copying files
byte-by-byte) we can avoid unnecessary data duplication. If hard links
are not supported for the source or destination paths specified, then
a standard byte-by-byte copy is used. (possible values: ``true`` and
``false``; default value: ``true``)

*use_relative_paths*
Should the paths stored in results (and used to look for inputs)
be relative or absolute. Relative paths allow moving the whole
Expand Down
21 changes: 14 additions & 7 deletions nipype/interfaces/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
from nipype.utils.misc import human_order_sorted
from nipype.external import six

from ..utils.misc import str2bool
from .. import config

try:
import pyxnat
except:
Expand All @@ -53,7 +56,7 @@
iflogger = logging.getLogger('interface')


def copytree(src, dst):
def copytree(src, dst, use_hardlink=False):
"""Recursively copy a directory tree using
nipype.utils.filemanip.copyfile()

Expand All @@ -75,9 +78,10 @@ def copytree(src, dst):
dstname = os.path.join(dst, name)
try:
if os.path.isdir(srcname):
copytree(srcname, dstname)
copytree(srcname, dstname, use_hardlink)
else:
copyfile(srcname, dstname, True, hashmethod='content')
copyfile(srcname, dstname, True, hashmethod='content',
use_hardlink=use_hardlink)
except (IOError, os.error), why:
errors.append((srcname, dstname, str(why)))
# catch the Error from the recursive copytree so that we can
Expand Down Expand Up @@ -245,8 +249,8 @@ def __init__(self, infields=None, force_run=True, **kwargs):
self._always_run = True

def _get_dst(self, src):
## If path is directory with trailing os.path.sep,
## then remove that for a more robust behavior
# If path is directory with trailing os.path.sep,
# then remove that for a more robust behavior
src = src.rstrip(os.path.sep)
path, fname = os.path.split(src)
if self.inputs.parameterization:
Expand Down Expand Up @@ -306,6 +310,8 @@ def _list_outputs(self):
pass
else:
raise(inst)
use_hardlink = str2bool(config.get('execution',
'try_hard_link_datasink') )
for key, files in self.inputs._outputs.items():
if not isdefined(files):
continue
Expand Down Expand Up @@ -338,7 +344,8 @@ def _list_outputs(self):
else:
raise(inst)
iflogger.debug("copyfile: %s %s" % (src, dst))
copyfile(src, dst, copy=True, hashmethod='content')
copyfile(src, dst, copy=True, hashmethod='content',
use_hardlink=use_hardlink)
out_files.append(dst)
elif os.path.isdir(src):
dst = self._get_dst(os.path.join(src, ''))
Expand All @@ -364,7 +371,7 @@ def _list_outputs(self):
return outputs


class DataGrabberInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): # InterfaceInputSpec):
class DataGrabberInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec):
base_directory = Directory(exists=True,
desc='Path to the base directory consisting of subject data.')
raise_on_empty = traits.Bool(True, usedefault=True,
Expand Down
1 change: 1 addition & 0 deletions nipype/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
plugin = Linear
remove_node_directories = false
remove_unnecessary_outputs = true
try_hard_link_datasink = true
single_thread_matlab = true
stop_on_first_crash = false
stop_on_first_rerun = false
Expand Down
33 changes: 30 additions & 3 deletions nipype/utils/filemanip.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,29 @@ class FileNotFoundError(Exception):
pass


def nipype_hardlink_wrapper(raw_src, raw_dst):
    """Hard link ``raw_src`` to ``raw_dst``, falling back to a plain copy.

    The intent is to avoid unnecessary duplication of large files when
    using a DataSink.  Hard links are not supported on all file systems
    or OS environments, and will not succeed if the source and the
    destination are not on the same file system.  If the hard link
    cannot be created, a standard byte-by-byte copy is performed instead.

    Parameters
    ----------
    raw_src : str
        Path of the existing file to link from.
    raw_dst : str
        Path to create.  Anything already present at this path
        (including a dangling symbolic link) is removed first, unless it
        normalizes to the same path as ``raw_src``.

    Raises
    ------
    IOError, OSError, shutil.Error
        Whatever ``shutil.copyfile`` raises when the fallback copy fails
        as well (for example when the source does not exist, or when
        source and destination are the same file).
    """
    src = os.path.normpath(raw_src)
    dst = os.path.normpath(raw_dst)
    # lexists (rather than exists) also reports dangling symlinks; those
    # would otherwise make os.link fail and the fallback copy would write
    # through the stale link instead of replacing it.
    if src != dst and os.path.lexists(dst):
        os.unlink(dst)  # First remove destination
    try:
        os.link(src, dst)  # Reference same inode to avoid duplication
    except (OSError, AttributeError, NotImplementedError):
        # OSError: link refused (cross-device, unsupported file system,
        # permissions, destination exists, ...).
        # AttributeError / NotImplementedError: os.link is unavailable on
        # this platform.
        # Deliberately not a bare except, so KeyboardInterrupt and
        # SystemExit propagate instead of triggering a silent copy.
        shutil.copyfile(src, dst)  # Fall back to traditional copy


def split_filename(fname):
"""Split a filename into parts: path, base filename and extension.

Expand Down Expand Up @@ -173,7 +196,7 @@ def hash_timestamp(afile):


def copyfile(originalfile, newfile, copy=False, create_new=False,
hashmethod=None):
hashmethod=None, use_hardlink=False):
"""Copy or symlink ``originalfile`` to ``newfile``.

Parameters
Expand Down Expand Up @@ -241,8 +264,12 @@ def copyfile(originalfile, newfile, copy=False, create_new=False,
orighash = hash_infile(originalfile)
if (newhash is None) or (newhash != orighash):
try:
fmlogger.debug("Copying File: %s->%s" % (newfile, originalfile))
shutil.copyfile(originalfile, newfile)
fmlogger.debug("Copying File: %s->%s" %
(newfile, originalfile))
if use_hardlink:
nipype_hardlink_wrapper(originalfile, newfile)
else:
shutil.copyfile(originalfile, newfile)
except shutil.Error, e:
fmlogger.warn(e.message)
else:
Expand Down