nipy · yarikoptic · Sep 14, 2018 · Sep 14, 2018 · Sep 21, 2018 · Sep 21, 2018
diff --git a/appveyor.yml b/appveyor.yml
@@ -19,6 +19,9 @@ install:
   # the parent CMD process).
   - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH%
 
+  # Update install environment
+  - pip install --upgrade pip setuptools
+
   # Install the dependencies of the project.
   - pip install numpy scipy matplotlib nose h5py mock hypothesis pydicom
   - pip install .

diff --git a/nibabel/cmdline/diff.py b/nibabel/cmdline/diff.py
@@ -39,6 +39,21 @@ def get_opt_parser():
         Option("-H", "--header-fields",
                dest="header_fields", default='all',
                help="Header fields (comma separated) to be printed as well (if present)"),
+
+        Option("--ma", "--data-max-abs-diff",
+               dest="data_max_abs_diff",
+               type=float,
+               default=0.0,
+               help="Maximal absolute difference in data between files to tolerate."),
+
+        Option("--mr", "--data-max-rel-diff",
+               dest="data_max_rel_diff",
+               type=float,
+               default=0.0,
+               help="Maximal relative difference in data between files to tolerate."
+                    " If --data-max-abs-diff is also specified, only the data points "
+                    " with absolute difference greater than that value would be "
+                    " considered for relative difference check."),
     ])
 
     return p
@@ -101,8 +116,8 @@ def get_headers_diff(file_headers, names=None):
     return difference
 
 
-def get_data_diff(files):
-    """Get difference between md5 values
+def get_data_hash_diff(files):
+    """Get difference between md5 values of data
 
         Parameters
         ----------
@@ -115,7 +130,7 @@ def get_data_diff(files):
         """
 
     md5sums = [
-        hashlib.md5(np.ascontiguousarray(nib.load(f).get_data(), dtype=np.float32)).hexdigest()
+        hashlib.md5(np.ascontiguousarray(nib.load(f).get_fdata())).hexdigest()
         for f in files
     ]
 
@@ -125,6 +140,84 @@ def get_data_diff(files):
     return md5sums
 
 
+def get_data_diff(files, max_abs=0, max_rel=0):
+    """Compare data arrays pairwise and report their numeric differences
+
+    Parameters
+    ----------
+    files: list of (str or ndarray)
+      Arrays are used as is; anything else is loaded with `nib.load`
+    max_abs: float, optional
+      Maximal absolute difference to tolerate.
+    max_rel: float, optional
+      Maximal relative (`abs(d1 - d2) / mean(abs(d1), abs(d2))`) difference
+      to tolerate.  If `max_abs` is also given, data points whose absolute
+      difference does not exceed it are excluded from the relative check.
+
+    Returns
+    -------
+    diffs: OrderedDict
+        Keyed 'DATA(diff N:)' (N is 1-based) for each file which differs
+        from, or mismatches in shape, at least one of the files after it.
+        Each value is a list with one entry per file: None for files not
+        compared (itself and the preceding ones) or within tolerance,
+        {'CMP': 'incompat'} if shapes differ, or otherwise an OrderedDict
+        with 'abs' and 'rel' holding the maximal absolute and relative
+        differences found for that pair.
+    """
+
+    # every array stays in RAM at once: each pair needs both operands loaded
+    data = [f if isinstance(f, np.ndarray) else nib.load(f).get_fdata()
+            for f in files]
+    diffs = OrderedDict()
+    for i, d1 in enumerate(data[:-1]):
+        # None placeholders for this file itself and the ones preceding it
+        diffs1 = [None] * (i + 1)
+
+        for j, d2 in enumerate(data[i + 1:], i + 1):
+
+            if d1.shape == d2.shape:
+                abs_diff = np.abs(d1 - d2)
+                mean_abs = (np.abs(d1) + np.abs(d2)) * 0.5
+                candidates = np.logical_or(mean_abs != 0, abs_diff != 0)
+
+                if max_abs:
+                    candidates[abs_diff <= max_abs] = False
+
+                max_abs_diff = np.max(abs_diff)
+                if np.any(candidates):
+                    rel_diff = abs_diff[candidates] / mean_abs[candidates]
+                    if max_rel:
+                        sub_thr = rel_diff <= max_rel
+                        # rel_diff holds only the selected points, so map the
+                        # sub-threshold ones back to their full-array indices
+                        candidates[
+                            tuple((indexes[sub_thr] for indexes in np.where(candidates)))
+                        ] = False
+                    max_rel_diff = np.max(rel_diff)
+                else:
+                    max_rel_diff = 0
+
+                if np.any(candidates):
+
+                    diff_rec = OrderedDict()  # so that abs goes before relative
+
+                    diff_rec['abs'] = max_abs_diff
+                    diff_rec['rel'] = max_rel_diff
+                    diffs1.append(diff_rec)
+                else:
+                    diffs1.append(None)
+
+            else:
+                diffs1.append({'CMP': "incompat"})
+
+        if any(diffs1):
+
+            diffs['DATA(diff %d:)' % (i + 1)] = diffs1
+
+    return diffs
+
+
 def display_diff(files, diff):
     """Format header differences into a nice string
 
@@ -140,21 +233,27 @@ def display_diff(files, diff):
     """
     output = ""
     field_width = "{:<15}"
+    filename_width = "{:<53}"
     value_width = "{:<55}"
 
     output += "These files are different.\n"
-    output += field_width.format('Field')
+    output += field_width.format('Field/File')
 
-    for f in files:
-        output += value_width.format(os.path.basename(f))
+    for i, f in enumerate(files, 1):
+        output += "%d:%s" % (i, filename_width.format(os.path.basename(f)))
 
     output += "\n"
 
     for key, value in diff.items():
         output += field_width.format(key)
 
         for item in value:
-            item_str = str(item)
+            if isinstance(item, dict):
+                item_str = ', '.join('%s: %s' % i for i in item.items())
+            elif item is None:
+                item_str = '-'
+            else:
+                item_str = str(item)
             # Value might start/end with some invisible spacing characters so we
             # would "condition" it on both ends a bit
             item_str = re.sub('^[ \t]+', '<', item_str)
@@ -169,8 +268,37 @@ def display_diff(files, diff):
     return output
 
 
+def diff(files, header_fields='all', data_max_abs_diff=None, data_max_rel_diff=None):
+    assert len(files) >= 2, "Please enter at least two files"
+
+    file_headers = [nib.load(f).header for f in files]
+
+    # 'all' is a sentinel requesting every field of the first file's header
+    if header_fields == 'all':
+        # TODO: header fields might vary across file types, thus prior sensing would be needed
+        header_fields = file_headers[0].keys()
+    else:
+        header_fields = header_fields.split(',')
+
+    diff = get_headers_diff(file_headers, header_fields)
+
+    data_md5_diffs = get_data_hash_diff(files)
+    if data_md5_diffs:
+        # md5 check reported a difference: get numeric details; an empty result
+        # (all within tolerance) means no data difference gets reported
+        data_diffs = get_data_diff(files,
+                                   max_abs=data_max_abs_diff,
+                                   max_rel=data_max_rel_diff)
+        if data_diffs:
+            diff['DATA(md5)'] = data_md5_diffs
+            diff.update(data_diffs)
+
+    return diff
+
+
 def main(args=None, out=None):
     """Getting the show on the road"""
+
     out = out or sys.stdout
     parser = get_opt_parser()
     (opts, files) = parser.parse_args(args)
@@ -181,27 +309,16 @@ def main(args=None, out=None):
         # suppress nibabel format-compliance warnings
         nib.imageglobals.logger.level = 50
 
-    assert len(files) >= 2, "Please enter at least two files"
-
-    file_headers = [nib.load(f).header for f in files]
-
-    # signals "all fields"
-    if opts.header_fields == 'all':
-        # TODO: header fields might vary across file types, thus prior sensing would be needed
-        header_fields = file_headers[0].keys()
-    else:
-        header_fields = opts.header_fields.split(',')
-
-    diff = get_headers_diff(file_headers, header_fields)
-    data_diff = get_data_diff(files)
-
-    if data_diff:
-        diff['DATA(md5)'] = data_diff
+    files_diff = diff(
+        files,
+        header_fields=opts.header_fields,
+        data_max_abs_diff=opts.data_max_abs_diff,
+        data_max_rel_diff=opts.data_max_rel_diff
+    )
 
-    if diff:
-        out.write(display_diff(files, diff))
+    if files_diff:
+        out.write(display_diff(files, files_diff))
         raise SystemExit(1)
-
     else:
         out.write("These files are identical.\n")
         raise SystemExit(0)
diff --git a/nibabel/cmdline/tests/test_utils.py b/nibabel/cmdline/tests/test_utils.py
@@ -11,7 +11,7 @@
 import nibabel as nib
 import numpy as np
 from nibabel.cmdline.utils import *
-from nibabel.cmdline.diff import get_headers_diff, display_diff, main, get_data_diff
+from nibabel.cmdline.diff import get_headers_diff, display_diff, main, get_data_hash_diff, get_data_diff
 from os.path import (join as pjoin)
 from nibabel.testing import data_path
 from collections import OrderedDict
@@ -96,9 +96,9 @@ def test_display_diff():
         ("bitpix", [np.array(8).astype(dtype="uint8"), np.array(16).astype(dtype="uint8")])
     ])
 
-    expected_output = "These files are different.\n" + "Field          hellokitty.nii.gz" \
-                                                       "                                      " \
-                                                       "privettovarish.nii.gz                                  \n" \
+    expected_output = "These files are different.\n" + "Field/File     1:hellokitty.nii.gz" \
+                                                       "                                    " \
+                                                       "2:privettovarish.nii.gz                                \n" \
                                                        "datatype       " \
                                                        "2                                                      " \
                                                        "4                                                      \n" \
@@ -114,7 +114,37 @@ def test_get_data_diff():
     #  testing for identical files specifically as md5 may vary by computer
     test_names = [pjoin(data_path, f)
                   for f in ('standard.nii.gz', 'standard.nii.gz')]
-    assert_equal(get_data_diff(test_names), [])
+    assert_equal(get_data_hash_diff(test_names), [])
+
+    #  testing get_data_diff with default and custom abs/rel tolerances
+    test_array = np.arange(16).reshape(4, 4)
+    test_array_2 = np.arange(1, 17).reshape(4, 4)
+    test_array_3 = np.arange(2, 18).reshape(4, 4)
+    test_array_4 = np.arange(100).reshape(10, 10)
+    test_array_5 = np.arange(64).reshape(8, 8)
+
+    # same shape, 2 files
+    assert_equal(get_data_diff([test_array, test_array_2]),
+                 OrderedDict([('DATA(diff 1:)', [None, OrderedDict([('abs', 1), ('rel', 2.0)])])]))
+
+    # same shape, 3 files
+    assert_equal(get_data_diff([test_array, test_array_2, test_array_3]),
+                 OrderedDict([('DATA(diff 1:)', [None, OrderedDict([('abs', 1), ('rel', 2.0)]),
+                                                 OrderedDict([('abs', 2), ('rel', 2.0)])]),
+                              ('DATA(diff 2:)', [None, None,
+                                                 OrderedDict([('abs', 1), ('rel', 0.66666666666666663)])])]))
+
+    # same shape, 2 files, modified maximum abs/rel
+    assert_equal(get_data_diff([test_array, test_array_2], max_abs=2, max_rel=2), OrderedDict())
+
+    # different shape, 2 files
+    assert_equal(get_data_diff([test_array_2, test_array_4]),
+                 OrderedDict([('DATA(diff 1:)', [None, {'CMP': 'incompat'}])]))
+
+    # different shape, 3 files
+    assert_equal(get_data_diff([test_array_4, test_array_5, test_array_2]),
+                 OrderedDict([('DATA(diff 1:)', [None, {'CMP': 'incompat'}, {'CMP': 'incompat'}]),
+                              ('DATA(diff 2:)', [None, None, {'CMP': 'incompat'}])]))
 
 
 def test_main():

diff --git a/nibabel/tests/test_scripts.py b/nibabel/tests/test_scripts.py
@@ -72,10 +72,10 @@ def check_nib_diff_examples():
     fnames = [pjoin(DATA_PATH, f)
                for f in ('standard.nii.gz', 'example4d.nii.gz')]
     code, stdout, stderr = run_command(['nib-diff'] + fnames, check_code=False)
-    checked_fields = ["Field", "regular", "dim_info", "dim", "datatype", "bitpix", "pixdim", "slice_end",
+    checked_fields = ["Field/File", "regular", "dim_info", "dim", "datatype", "bitpix", "pixdim", "slice_end",
                       "xyzt_units", "cal_max", "descrip", "qform_code", "sform_code", "quatern_b",
                       "quatern_c", "quatern_d", "qoffset_x", "qoffset_y", "qoffset_z", "srow_x",
-                      "srow_y", "srow_z", "DATA(md5)"]
+                      "srow_y", "srow_z", "DATA(md5)", "DATA(diff 1:)"]
     for item in checked_fields:
         assert_true(item in stdout)