Skip to content

RF+ENH: nib-diff - allow to specify absolute and/or relative maximal diff to tolerate #661

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ install:
# the parent CMD process).
- SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH%

# Update install environment
- pip install --upgrade pip setuptools

# Install the dependencies of the project.
- pip install numpy scipy matplotlib nose h5py mock hypothesis pydicom
- pip install .
Expand Down
169 changes: 143 additions & 26 deletions nibabel/cmdline/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,21 @@ def get_opt_parser():
Option("-H", "--header-fields",
dest="header_fields", default='all',
help="Header fields (comma separated) to be printed as well (if present)"),

Option("--ma", "--data-max-abs-diff",
dest="data_max_abs_diff",
type=float,
default=0.0,
help="Maximal absolute difference in data between files to tolerate."),

Option("--mr", "--data-max-rel-diff",
dest="data_max_rel_diff",
type=float,
default=0.0,
help="Maximal relative difference in data between files to tolerate."
" If --data-max-abs-diff is also specified, only the data points "
" with absolute difference greater than that value would be "
" considered for relative difference check."),
])

return p
Expand Down Expand Up @@ -101,8 +116,8 @@ def get_headers_diff(file_headers, names=None):
return difference


def get_data_diff(files):
"""Get difference between md5 values
def get_data_hash_diff(files):
"""Get difference between md5 values of data

Parameters
----------
Expand All @@ -115,7 +130,7 @@ def get_data_diff(files):
"""

md5sums = [
hashlib.md5(np.ascontiguousarray(nib.load(f).get_data(), dtype=np.float32)).hexdigest()
hashlib.md5(np.ascontiguousarray(nib.load(f).get_fdata())).hexdigest()
for f in files
]

Expand All @@ -125,6 +140,84 @@ def get_data_diff(files):
return md5sums


def _get_pair_data_diff(d1, d2, max_abs, max_rel):
    """Compare two same-shaped arrays; return a difference record or None.

    None is returned when no data point exceeds the requested tolerances
    (or when the arrays are empty and there is nothing to compare).
    """
    abs_diff = np.abs(d1 - d2)
    if abs_diff.size == 0:
        # np.max() would raise on empty input; empty arrays cannot differ
        return None
    mean_abs = (np.abs(d1) + np.abs(d2)) * 0.5

    # Only points which actually differ can be reported as a difference.
    # (Where abs_diff != 0 at least one value is non-zero, so mean_abs != 0
    # and the division below is safe.)
    candidates = abs_diff != 0
    if max_abs:
        candidates[abs_diff <= max_abs] = False
    if not np.any(candidates):
        return None

    rel_diff = abs_diff[candidates] / mean_abs[candidates]
    max_rel_diff = np.max(rel_diff)
    if max_rel:
        # rel_diff was computed on the sub-selected values only, so map the
        # outcome of the relative test back onto the full-shape mask
        still_over = np.zeros_like(candidates)
        still_over[candidates] = ~(rel_diff <= max_rel)
        if not np.any(still_over):
            return None

    diff_rec = OrderedDict()  # so that abs goes before relative
    diff_rec['abs'] = np.max(abs_diff)
    diff_rec['rel'] = max_rel_diff
    return diff_rec


def get_data_diff(files, max_abs=0, max_rel=0):
    """Get difference between data

    Parameters
    ----------
    files: list of (str or ndarray)
      If list of strings is provided -- they must be existing file names.
      An ndarray is compared as given, i.e. it stands in for the data block
      which ``nib.load(f).get_fdata()`` would provide for a file name.
    max_abs: float, optional
      Maximal absolute difference to tolerate.
    max_rel: float, optional
      Maximal relative difference to tolerate, where relative difference of
      two values is ``abs(a - b) / mean(abs(a), abs(b))``.
      If `max_abs` is specified, then data points with absolute difference
      not exceeding it are not considered for relative difference testing.

    Returns
    -------
    diffs: OrderedDict
      An ordered dict with a record per each file which has differences
      with any of the subsequent files.  Each record is a list with one
      entry per file: None for files which were not compared or did not
      differ beyond the tolerances, otherwise a difference record.
      Each difference record is an OrderedDict with keys 'abs' and 'rel'
      holding maximal absolute and relative differences for the pair, or
      the record {'CMP': 'incompat'} if data shapes are incompatible.
    """

    # we are doomed to keep them in RAM now
    data = [f if isinstance(f, np.ndarray) else nib.load(f).get_fdata()
            for f in files]
    diffs = OrderedDict()
    for i, d1 in enumerate(data[:-1]):
        # populate empty entries for non-compared
        diffs1 = [None] * (i + 1)

        for d2 in data[i + 1:]:
            if d1.shape == d2.shape:
                diffs1.append(_get_pair_data_diff(d1, d2, max_abs, max_rel))
            else:
                diffs1.append({'CMP': "incompat"})

        if any(diffs1):
            diffs['DATA(diff %d:)' % (i + 1)] = diffs1

    return diffs


def display_diff(files, diff):
"""Format header differences into a nice string

Expand All @@ -140,21 +233,27 @@ def display_diff(files, diff):
"""
output = ""
field_width = "{:<15}"
filename_width = "{:<53}"
value_width = "{:<55}"

output += "These files are different.\n"
output += field_width.format('Field')
output += field_width.format('Field/File')

for f in files:
output += value_width.format(os.path.basename(f))
for i, f in enumerate(files, 1):
output += "%d:%s" % (i, filename_width.format(os.path.basename(f)))

output += "\n"

for key, value in diff.items():
output += field_width.format(key)

for item in value:
item_str = str(item)
if isinstance(item, dict):
item_str = ', '.join('%s: %s' % i for i in item.items())
elif item is None:
item_str = '-'
else:
item_str = str(item)
# Value might start/end with some invisible spacing characters so we
# would "condition" it on both ends a bit
item_str = re.sub('^[ \t]+', '<', item_str)
Expand All @@ -169,8 +268,37 @@ def display_diff(files, diff):
return output


def diff(files, header_fields='all', data_max_abs_diff=None, data_max_rel_diff=None):
    """Collect header and data differences between the given files.

    Parameters
    ----------
    files: list of str
      Names of the files to compare; at least two are required.
    header_fields: str, optional
      Comma separated header field names to compare, or 'all' to use the
      fields of the first file's header.
    data_max_abs_diff: float, optional
      Passed to `get_data_diff` as `max_abs`.
    data_max_rel_diff: float, optional
      Passed to `get_data_diff` as `max_rel`.

    Returns
    -------
    The result of `get_headers_diff`, extended with a 'DATA(md5)' entry and
    the `get_data_diff` records when the data differs as well.
    """
    assert len(files) >= 2, "Please enter at least two files"

    headers = [nib.load(fname).header for fname in files]

    if header_fields == 'all':
        # 'all' is the marker for "every field"
        # TODO: header fields might vary across file types, thus prior sensing would be needed
        fields = headers[0].keys()
    else:
        fields = header_fields.split(',')

    differences = get_headers_diff(headers, fields)

    md5_diffs = get_data_hash_diff(files)
    if not md5_diffs:
        return differences

    # Hashes differ: get the details, which might end up ignoring the
    # difference in data if it stays within the requested tolerances
    detailed = get_data_diff(files,
                             max_abs=data_max_abs_diff,
                             max_rel=data_max_rel_diff)
    if detailed:
        differences['DATA(md5)'] = md5_diffs
        differences.update(detailed)

    return differences


def main(args=None, out=None):
"""Getting the show on the road"""

out = out or sys.stdout
parser = get_opt_parser()
(opts, files) = parser.parse_args(args)
Expand All @@ -181,27 +309,16 @@ def main(args=None, out=None):
# suppress nibabel format-compliance warnings
nib.imageglobals.logger.level = 50

assert len(files) >= 2, "Please enter at least two files"

file_headers = [nib.load(f).header for f in files]

# signals "all fields"
if opts.header_fields == 'all':
# TODO: header fields might vary across file types, thus prior sensing would be needed
header_fields = file_headers[0].keys()
else:
header_fields = opts.header_fields.split(',')

diff = get_headers_diff(file_headers, header_fields)
data_diff = get_data_diff(files)

if data_diff:
diff['DATA(md5)'] = data_diff
files_diff = diff(
files,
header_fields=opts.header_fields,
data_max_abs_diff=opts.data_max_abs_diff,
data_max_rel_diff=opts.data_max_rel_diff
)

if diff:
out.write(display_diff(files, diff))
if files_diff:
out.write(display_diff(files, files_diff))
raise SystemExit(1)

else:
out.write("These files are identical.\n")
raise SystemExit(0)
40 changes: 35 additions & 5 deletions nibabel/cmdline/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import nibabel as nib
import numpy as np
from nibabel.cmdline.utils import *
from nibabel.cmdline.diff import get_headers_diff, display_diff, main, get_data_diff
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of us has managed to make this file executable, please undo:
capture _2018-09-27-20-06-42
If you like a challenge - undo by rewriting that original commit. Workflow:

  • fix, commit
  • git rebase -i BADCOMMIT^, where you reposition the fixing commit right after the one to fix and give it the "s" (squash) status to squash them into one
  • git push -f, since you have now rewritten a commit

from nibabel.cmdline.diff import get_headers_diff, display_diff, main, get_data_hash_diff, get_data_diff
from os.path import (join as pjoin)
from nibabel.testing import data_path
from collections import OrderedDict
Expand Down Expand Up @@ -96,9 +96,9 @@ def test_display_diff():
("bitpix", [np.array(8).astype(dtype="uint8"), np.array(16).astype(dtype="uint8")])
])

expected_output = "These files are different.\n" + "Field hellokitty.nii.gz" \
" " \
"privettovarish.nii.gz \n" \
expected_output = "These files are different.\n" + "Field/File 1:hellokitty.nii.gz" \
" " \
"2:privettovarish.nii.gz \n" \
"datatype " \
"2 " \
"4 \n" \
Expand All @@ -114,7 +114,37 @@ def test_get_data_diff():
# testing for identical files specifically as md5 may vary by computer
test_names = [pjoin(data_path, f)
for f in ('standard.nii.gz', 'standard.nii.gz')]
assert_equal(get_data_diff(test_names), [])
assert_equal(get_data_hash_diff(test_names), [])

# testing the maximum relative and absolute differences' different use cases
test_array = np.arange(16).reshape(4, 4)
test_array_2 = np.arange(1, 17).reshape(4, 4)
test_array_3 = np.arange(2, 18).reshape(4, 4)
test_array_4 = np.arange(100).reshape(10, 10)
test_array_5 = np.arange(64).reshape(8, 8)

# same shape, 2 files
assert_equal(get_data_diff([test_array, test_array_2]),
OrderedDict([('DATA(diff 1:)', [None, OrderedDict([('abs', 1), ('rel', 2.0)])])]))

# same shape, 3 files
assert_equal(get_data_diff([test_array, test_array_2, test_array_3]),
OrderedDict([('DATA(diff 1:)', [None, OrderedDict([('abs', 1), ('rel', 2.0)]),
OrderedDict([('abs', 2), ('rel', 2.0)])]),
('DATA(diff 2:)', [None, None,
OrderedDict([('abs', 1), ('rel', 0.66666666666666663)])])]))

# same shape, 2 files, modified maximum abs/rel
assert_equal(get_data_diff([test_array, test_array_2], max_abs=2, max_rel=2), OrderedDict())

# different shape, 2 files
assert_equal(get_data_diff([test_array_2, test_array_4]),
OrderedDict([('DATA(diff 1:)', [None, {'CMP': 'incompat'}])]))

# different shape, 3 files
assert_equal(get_data_diff([test_array_4, test_array_5, test_array_2]),
OrderedDict([('DATA(diff 1:)', [None, {'CMP': 'incompat'}, {'CMP': 'incompat'}]),
('DATA(diff 2:)', [None, None, {'CMP': 'incompat'}])]))


def test_main():
Expand Down
4 changes: 2 additions & 2 deletions nibabel/tests/test_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ def check_nib_diff_examples():
fnames = [pjoin(DATA_PATH, f)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same here about permissions

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please clarify?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jk I got it

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good ;)

for f in ('standard.nii.gz', 'example4d.nii.gz')]
code, stdout, stderr = run_command(['nib-diff'] + fnames, check_code=False)
checked_fields = ["Field", "regular", "dim_info", "dim", "datatype", "bitpix", "pixdim", "slice_end",
checked_fields = ["Field/File", "regular", "dim_info", "dim", "datatype", "bitpix", "pixdim", "slice_end",
"xyzt_units", "cal_max", "descrip", "qform_code", "sform_code", "quatern_b",
"quatern_c", "quatern_d", "qoffset_x", "qoffset_y", "qoffset_z", "srow_x",
"srow_y", "srow_z", "DATA(md5)"]
"srow_y", "srow_z", "DATA(md5)", "DATA(diff 1:)"]
for item in checked_fields:
assert_true(item in stdout)

Expand Down