Skip to content

Commit a71ee4c

Browse files
committed
ENH: Attempt to use hard links for data sink.
In many cases on Unix, the data sink is on the same physical drive as the internal nipype cache. In that case, we can use a hard link to save both the time necessary to duplicate the data and the space necessary to hold the same data at two different inodes. This allows removal of the cache directory without modifying the results directory. In large analyses, this optimization can save several terabytes of storage consumption.
1 parent 67f2a59 commit a71ee4c

File tree

5 files changed

+64
-10
lines changed

5 files changed

+64
-10
lines changed

CHANGES

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
Next release
22
============
33

4+
* ENH: Attempt to use hard links for data sink.
5+
(https://github.com/nipy/nipype/pull/1161)
6+
* FIX: Updates to SGE Plugins
7+
(https://github.com/nipy/nipype/pull/1129)
8+
* ENH: Add ants JointFusion() node with testing
9+
(https://github.com/nipy/nipype/pull/1160)
10+
* ENH: Add --float option for antsRegistration calls
11+
(https://github.com/nipy/nipype/pull/1159)
412
* ENH: Added interface to simulate DWIs using the multi-tensor model
513
(https://github.com/nipy/nipype/pull/1085)
614
* ENH: New interface for FSL fslcpgeom utility (https://github.com/nipy/nipype/pull/1152)

doc/users/config_file.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,17 @@ Execution
8484
other nodes) will never be deleted independent of this parameter. (possible
8585
values: ``true`` and ``false``; default value: ``true``)
8686

87+
*try_hard_link_datasink*
88+
When the DataSink is used to produce an organized output file outside
89+
of nipype's internal cache structure, a file system hard link will be
90+
attempted first. A hard link allows multiple file paths to point to the
91+
same physical storage location on disk if the conditions allow. By
92+
referring to the same physical file on disk (instead of copying files
93+
byte-by-byte) we can avoid unnecessary data duplication. If hard links
94+
are not supported for the source or destination paths specified, then
95+
a standard byte-by-byte copy is used. (possible values: ``true`` and
96+
``false``; default value: ``true``)
97+
8798
*use_relative_paths*
8899
Should the paths stored in results (and used to look for inputs)
89100
be relative or absolute. Relative paths allow moving the whole

nipype/interfaces/io.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
from nipype.utils.misc import human_order_sorted
3333
from nipype.external import six
3434

35+
from ..utils.misc import str2bool
36+
from .. import config
37+
3538
try:
3639
import pyxnat
3740
except:
@@ -53,7 +56,7 @@
5356
iflogger = logging.getLogger('interface')
5457

5558

56-
def copytree(src, dst):
59+
def copytree(src, dst, use_hardlink=False):
5760
"""Recursively copy a directory tree using
5861
nipype.utils.filemanip.copyfile()
5962
@@ -75,9 +78,10 @@ def copytree(src, dst):
7578
dstname = os.path.join(dst, name)
7679
try:
7780
if os.path.isdir(srcname):
78-
copytree(srcname, dstname)
81+
copytree(srcname, dstname, use_hardlink)
7982
else:
80-
copyfile(srcname, dstname, True, hashmethod='content')
83+
copyfile(srcname, dstname, True, hashmethod='content',
84+
use_hardlink=use_hardlink)
8185
except (IOError, os.error), why:
8286
errors.append((srcname, dstname, str(why)))
8387
# catch the Error from the recursive copytree so that we can
@@ -245,8 +249,8 @@ def __init__(self, infields=None, force_run=True, **kwargs):
245249
self._always_run = True
246250

247251
def _get_dst(self, src):
248-
## If path is directory with trailing os.path.sep,
249-
## then remove that for a more robust behavior
252+
# If path is directory with trailing os.path.sep,
253+
# then remove that for a more robust behavior
250254
src = src.rstrip(os.path.sep)
251255
path, fname = os.path.split(src)
252256
if self.inputs.parameterization:
@@ -306,6 +310,8 @@ def _list_outputs(self):
306310
pass
307311
else:
308312
raise(inst)
313+
use_hardlink = str2bool(config.get('execution',
314+
'try_hard_link_datasink') )
309315
for key, files in self.inputs._outputs.items():
310316
if not isdefined(files):
311317
continue
@@ -338,7 +344,8 @@ def _list_outputs(self):
338344
else:
339345
raise(inst)
340346
iflogger.debug("copyfile: %s %s" % (src, dst))
341-
copyfile(src, dst, copy=True, hashmethod='content')
347+
copyfile(src, dst, copy=True, hashmethod='content',
348+
use_hardlink=use_hardlink)
342349
out_files.append(dst)
343350
elif os.path.isdir(src):
344351
dst = self._get_dst(os.path.join(src, ''))
@@ -364,7 +371,7 @@ def _list_outputs(self):
364371
return outputs
365372

366373

367-
class DataGrabberInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec): # InterfaceInputSpec):
374+
class DataGrabberInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec):
368375
base_directory = Directory(exists=True,
369376
desc='Path to the base directory consisting of subject data.')
370377
raise_on_empty = traits.Bool(True, usedefault=True,

nipype/utils/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
plugin = Linear
4444
remove_node_directories = false
4545
remove_unnecessary_outputs = true
46+
try_hard_link_datasink = true
4647
single_thread_matlab = true
4748
stop_on_first_crash = false
4849
stop_on_first_rerun = false

nipype/utils/filemanip.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,29 @@ class FileNotFoundError(Exception):
2828
pass
2929

3030

31+
def nipype_hardlink_wrapper(raw_src, raw_dst):
    """Attempt to use a hard link instead of a file copy.

    The intent is to avoid unnecessary duplication of large files when
    using a DataSink. Hard links are not supported on all file systems
    or OS environments, and will not succeed if the source and
    destination are not on the same physical partition. If the hard
    link fails, fall back to a standard copy.

    Parameters
    ----------
    raw_src : str
        Path of the existing file to link from.
    raw_dst : str
        Path to create. If it already exists and differs from the
        normalized source path, it is removed first.

    Raises
    ------
    Whatever ``shutil.copyfile`` raises if the fallback copy also fails
    (e.g. the source does not exist).
    """
    src = os.path.normpath(raw_src)
    dst = os.path.normpath(raw_dst)

    # os.link refuses to overwrite, so clear a pre-existing destination.
    if src != dst and os.path.exists(dst):
        os.unlink(dst)

    try:
        os.link(src, dst)  # Reference the same inode to avoid duplication
    except (OSError, AttributeError):
        # OSError: the OS or file system refused the link (cross-device,
        # unsupported file system, permissions, ...).
        # AttributeError: os.link is not available on this platform.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate
        # instead of being silently converted into a copy.
        shutil.copyfile(src, dst)  # Fall back to traditional copy
52+
53+
3154
def split_filename(fname):
3255
"""Split a filename into parts: path, base filename and extension.
3356
@@ -173,7 +196,7 @@ def hash_timestamp(afile):
173196

174197

175198
def copyfile(originalfile, newfile, copy=False, create_new=False,
176-
hashmethod=None):
199+
hashmethod=None, use_hardlink=False):
177200
"""Copy or symlink ``originalfile`` to ``newfile``.
178201
179202
Parameters
@@ -241,8 +264,12 @@ def copyfile(originalfile, newfile, copy=False, create_new=False,
241264
orighash = hash_infile(originalfile)
242265
if (newhash is None) or (newhash != orighash):
243266
try:
244-
fmlogger.debug("Copying File: %s->%s" % (newfile, originalfile))
245-
shutil.copyfile(originalfile, newfile)
267+
fmlogger.debug("Copying File: %s->%s" %
268+
(newfile, originalfile))
269+
if use_hardlink:
270+
nipype_hardlink_wrapper(originalfile, newfile)
271+
else:
272+
shutil.copyfile(originalfile, newfile)
246273
except shutil.Error, e:
247274
fmlogger.warn(e.message)
248275
else:

0 commit comments

Comments
 (0)