diff --git a/.travis.yml b/.travis.yml index 009143ba2e..d25cd7b5e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,7 +96,7 @@ install: - if [[ $INSTALL_TYPE == 'conda' ]]; then conda env create -q -f environment.yml python=$TRAVIS_PYTHON_VERSION; source activate devito; - if [[ $MPI_INSTALL == '1' ]]; then pip install -e .[extras]; else pip install -e .; fi + pip install -e .; conda list; fi - if [[ "$MPI_INSTALL" == '1' ]]; then @@ -105,6 +105,7 @@ install: # Install devito with pip - if [[ $INSTALL_TYPE == 'pip_setup' ]]; then python setup.py install; fi + - if [[ $MPI_INSTALL == '1' ]]; then pip install -e .[extras]; fi before_script: - echo -e "Host github.com\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config diff --git a/devito/__init__.py b/devito/__init__.py index c4c9380109..c973c12f2e 100644 --- a/devito/__init__.py +++ b/devito/__init__.py @@ -62,11 +62,7 @@ def _at_callback(val): # noqa level, mode = val if level == 'off': level = False - if configuration['backend'] == 'core' and mode == 'runtime': - warning("Unsupported auto-tuning mode `runtime` with backend `core`") - return at_setup(level, 'preemptive') - else: - return at_setup(level, mode) + return at_setup(level, mode) configuration.add('autotuning', 'off', at_accepted, callback=_at_callback, # noqa impacts_jit=False) diff --git a/devito/core/autotuning.py b/devito/core/autotuning.py index 93cf2f1e83..6e3f045584 100644 --- a/devito/core/autotuning.py +++ b/devito/core/autotuning.py @@ -6,27 +6,50 @@ from operator import mul import resource -from devito.ir.iet import Iteration, FindNodes, FindSymbols +from devito.ir import Backward, Iteration, FindNodes, FindSymbols from devito.logger import perf, warning from devito.parameters import configuration __all__ = ['autotune'] -def autotune(operator, arguments, parameters, tunable): +def autotune(operator, args, level, mode): """ - Acting as a high-order function, take as input an operator and a list of - operator arguments to perform empirical autotuning. Some of the operator - arguments are marked as tunable. + Operator autotuning. + + Parameters + ---------- + operator : Operator + Input Operator. + args : dict_like + The runtime arguments with which `operator` is run. + level : str + The autotuning aggressiveness (basic, aggressive). A more aggressive + autotuning might eventually result in higher performance, though in + some circumstances it might instead increase the actual runtime. + mode : str + The autotuning mode (preemptive, runtime). In preemptive mode, the + output runtime values supplied by the user to `operator.apply` are + replaced with shadow copies. 
""" + key = [level, mode] + accepted = configuration._accepted['autotuning'] + if key not in accepted: + raise ValueError("The accepted `(level, mode)` combinations are `%s`; " + "provided `%s` instead" % (accepted, key)) + + parameters = operator.parameters + tunable = operator._dle_args + # We get passed all the arguments, but the cfunction only requires a subset - at_arguments = OrderedDict([(p.name, arguments[p.name]) for p in parameters]) + at_args = OrderedDict([(p.name, args[p.name]) for p in parameters]) - # User-provided output data must not be altered - output = [i.name for i in operator.output] - for k, v in arguments.items(): - if k in output: - at_arguments[k] = v.copy() + # User-provided output data won't be altered in `preemptive` mode + if mode == 'preemptive': + output = [i.name for i in operator.output] + for k, v in args.items(): + if k in output: + at_args[k] = v.copy() iterations = FindNodes(Iteration).visit(operator.body) dim_mapper = {i.dim.name: i.dim for i in iterations} @@ -35,33 +58,27 @@ def autotune(operator, arguments, parameters, tunable): # runs will finish quickly steppers = [i for i in iterations if i.dim.is_Time] if len(steppers) == 0: + stepper = None timesteps = 1 elif len(steppers) == 1: stepper = steppers[0] - start = at_arguments[stepper.dim.min_name] - timesteps = stepper.extent(start=start, finish=options['at_squeezer']) - 1 - if timesteps < 0: - timesteps = options['at_squeezer'] - timesteps - perf("AutoTuner: Number of timesteps adjusted to %d" % timesteps) - at_arguments[stepper.dim.min_name] = start - at_arguments[stepper.dim.max_name] = timesteps - if stepper.dim.is_Stepping: - at_arguments[stepper.dim.parent.min_name] = start - at_arguments[stepper.dim.parent.max_name] = timesteps + timesteps = init_time_bounds(stepper, at_args) + if timesteps is None: + return args else: warning("AutoTuner: Couldn't understand loop structure; giving up") - return arguments + return args # Attempted block sizes ... - mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable]) + mapper = OrderedDict([(i.tunable.name, i) for i in tunable]) # ... Defaults (basic mode) blocksizes = [OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']] # ... Always try the entire iteration space (degenerate block) - itershape = [mapper[i].iteration.symbolic_extent.subs(arguments) for i in mapper] + itershape = [mapper[i].iteration.symbolic_extent.subs(args) for i in mapper] blocksizes.append(OrderedDict([(i, mapper[i].iteration.extent(0, j-1)) for i, j in zip(mapper, itershape)])) # ... More attempts if auto-tuning in aggressive mode - if configuration['autotuning'].level == 'aggressive': + if level == 'aggressive': blocksizes = more_heuristic_attempts(blocksizes) # How many temporaries are allocated on the stack? @@ -75,15 +92,18 @@ def autotune(operator, arguments, parameters, tunable): # square blocks are tested timings = OrderedDict() for bs in blocksizes: + # Can we safely autotune over the given time range? 
+ check_time_bounds(stepper, at_args, args) + illegal = False - for k, v in at_arguments.items(): + for k, v in at_args.items(): if k in bs: val = bs[k] - start = mapper[k].original_dim.symbolic_start.subs(arguments) - end = mapper[k].original_dim.symbolic_end.subs(arguments) + start = mapper[k].original_dim.symbolic_start.subs(args) + end = mapper[k].original_dim.symbolic_end.subs(args) if val <= mapper[k].iteration.extent(start, end): - at_arguments[k] = val + at_args[k] = val else: # Block size cannot be larger than actual dimension illegal = True @@ -93,7 +113,7 @@ def autotune(operator, arguments, parameters, tunable): # Make sure we remain within stack bounds, otherwise skip block size dim_sizes = {} - for k, v in at_arguments.items(): + for k, v in at_args.items(): if k in bs: dim_sizes[mapper[k].argument.symbolic_size] = bs[k] elif k in dim_mapper: @@ -113,31 +133,93 @@ def autotune(operator, arguments, parameters, tunable): # Use AutoTuner-specific profiler structs timer = operator.profiler.timer.reset() - at_arguments[operator.profiler.name] = timer + at_args[operator.profiler.name] = timer - operator.cfunction(*list(at_arguments.values())) + operator.cfunction(*list(at_args.values())) elapsed = sum(getattr(timer._obj, i) for i, _ in timer._obj._fields_) timings[tuple(bs.items())] = elapsed perf("AutoTuner: Block shape <%s> took %f (s) in %d timesteps" % (','.join('%d' % i for i in bs.values()), elapsed, timesteps)) + # Prepare for the next autotuning run + update_time_bounds(stepper, at_args, timesteps, mode) + try: best = dict(min(timings, key=timings.get)) perf("AutoTuner: Selected block shape %s" % best) except ValueError: warning("AutoTuner: Couldn't find legal block shapes") - return arguments + return args # Build the new argument list - tuned = OrderedDict() - for k, v in arguments.items(): - tuned[k] = best[k] if k in mapper else v + args = {k: best.get(k, v) for k, v in args.items()} + + # In `runtime` mode, some timesteps have been executed already, so we + # get to adjust the time range + finalize_time_bounds(stepper, at_args, args, mode) + + # Reset profiling data + assert operator.profiler.name in args + args[operator.profiler.name] = operator.profiler.timer.reset() + + return args + + +def init_time_bounds(stepper, at_args): + if stepper is None: + return + dim = stepper.dim.root + if stepper.direction is Backward: + at_args[dim.max_name] = at_args[dim.max_name] + at_args[dim.min_name] = at_args[dim.max_name] - options['at_squeezer'] + if at_args[dim.max_name] < at_args[dim.min_name]: + warning("AutoTuner: too few time iterations; giving up") + return False + else: + at_args[dim.min_name] = at_args[dim.min_name] + at_args[dim.max_name] = at_args[dim.min_name] + options['at_squeezer'] + if at_args[dim.min_name] > at_args[dim.max_name]: + warning("AutoTuner: too few time iterations; giving up") + return False + + return stepper.extent(start=at_args[dim.min_name], finish=at_args[dim.max_name]) + + +def check_time_bounds(stepper, at_args, args): + if stepper is None: + return + dim = stepper.dim.root + if stepper.direction is Backward: + if at_args[dim.min_name] < args[dim.min_name]: + raise ValueError("Too few time iterations") + + else: + if at_args[dim.max_name] > args[dim.max_name]: + raise ValueError("Too few time iterations") + - # Reset the profiling struct - assert operator.profiler.name in tuned - tuned[operator.profiler.name] = operator.profiler.timer.reset() +def update_time_bounds(stepper, at_args, timesteps, mode): + if mode != 'runtime' or stepper is 
None: + return + dim = stepper.dim.root + if stepper.direction is Backward: + at_args[dim.max_name] -= timesteps + at_args[dim.min_name] -= timesteps + else: + at_args[dim.min_name] += timesteps + at_args[dim.max_name] += timesteps - return tuned + +def finalize_time_bounds(stepper, at_args, args, mode): + if mode != 'runtime' or stepper is None: + return + dim = stepper.dim.root + if stepper.direction is Backward: + args[dim.max_name] = at_args[dim.max_name] + args[dim.min_name] = args[dim.min_name] + else: + args[dim.min_name] = at_args[dim.min_name] + args[dim.max_name] = args[dim.max_name] def more_heuristic_attempts(blocksizes): diff --git a/devito/core/operator.py b/devito/core/operator.py index 26b6e5c130..2e66ea77a1 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -70,11 +70,30 @@ def _generate_mpi(self, iet, **kwargs): return iet - def _autotune(self, args): - if self._dle_flags.get('blocking', False): - return autotune(self, args, self.parameters, self._dle_args) - else: + def _autotune(self, args, setup): + if setup is False or not self._dle_flags.get('blocking'): return args + elif setup is True: + level = configuration['autotuning'].level or 'basic' + args = autotune(self, args, level, configuration['autotuning'].mode) + elif isinstance(setup, str): + args = autotune(self, args, setup, configuration['autotuning'].mode) + elif isinstance(setup, tuple) and len(setup) == 2: + level, mode = setup + if level is False: + return args + else: + args = autotune(self, args, level, mode) + else: + raise ValueError("Expected bool, str, or 2-tuple, got `%s` instead" + % type(setup)) + + # Record the tuned values + mapper = self._state.setdefault('tuned', {}) + mapper.update({k: v for k, v in args.items() + if k in [i.tunable.name for i in self._dle_args]}) + + return args class OperatorDebug(OperatorCore): diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 8632fad4b7..d27af3fb52 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -65,9 +65,9 @@ def alloc(self, shape, dtype): if c_pointer is None: raise RuntimeError("Unable to allocate %d elements in memory", str(size)) - c_pointer = ctypes.cast(c_pointer, np.ctypeslib.ndpointer(dtype=dtype, - shape=shape)) - pointer = np.ctypeslib.as_array(c_pointer, shape=shape) + c_pointer_cast = ctypes.cast( + c_pointer, np.ctypeslib.ndpointer(dtype=dtype, shape=shape)) + pointer = np.ctypeslib.as_array(c_pointer_cast, shape=shape) return (pointer, memfree_args) @@ -242,7 +242,14 @@ def _alloc_C_libcall(self, size, ctype): if not self.available(): raise RuntimeError("Couldn't find `libnuma`'s `numa_alloc_*` to " "allocate memory") - c_bytesize = ctypes.c_ulong(size * ctypes.sizeof(ctype)) + + if size == 0: + # work around the fact that the allocator may return NULL when + # the size is 0, and numpy does not like that + c_bytesize = ctypes.c_ulong(1) + else: + c_bytesize = ctypes.c_ulong(size * ctypes.sizeof(ctype)) + if self.put_onnode: c_pointer = self.lib.numa_alloc_onnode(c_bytesize, self._node) elif self.put_local: @@ -253,7 +260,8 @@ def _alloc_C_libcall(self, size, ctype): # note! even though restype was set above, ctypes returns a # python integer. # See https://stackoverflow.com/questions/17840144/ - if c_pointer == 0: + # edit: it apparently can return None, also! 
+ if c_pointer == 0 or c_pointer is None: return None, None else: # Convert it back to a void * - this is diff --git a/devito/dle/backends/common.py b/devito/dle/backends/common.py index 6360b48da9..227bb578cb 100644 --- a/devito/dle/backends/common.py +++ b/devito/dle/backends/common.py @@ -81,6 +81,10 @@ def __repr__(self): def original_dim(self): return self.iteration.dim + @property + def tunable(self): + return self.argument.symbolic_size + class AbstractRewriter(object): """ diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 83a46b017f..7111fe4c46 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -6,7 +6,8 @@ from frozendict import frozendict -from devito.tools import PartialOrderTuple, as_tuple, filter_ordered, toposort +from devito.tools import PartialOrderTuple, as_tuple, filter_ordered, toposort, is_integer + __all__ = ['NullInterval', 'Interval', 'IntervalGroup', 'IterationSpace', 'DataSpace', 'Forward', 'Backward', 'Any'] @@ -112,8 +113,8 @@ class Interval(AbstractInterval): is_Defined = True def __init__(self, dim, lower, upper): - assert isinstance(lower, int) - assert isinstance(upper, int) + assert is_integer(lower) + assert is_integer(upper) super(Interval, self).__init__(dim) self.lower = lower self.upper = upper @@ -290,7 +291,7 @@ def zero(self, d=None): relations=self.relations) def __getitem__(self, key): - if isinstance(key, (slice, int)): + if isinstance(key, slice) or is_integer(key): return super(IntervalGroup, self).__getitem__(key) if not self.is_well_defined: raise ValueError("Cannot fetch Interval from ill defined Space") diff --git a/devito/operator.py b/devito/operator.py index f1f92923ee..f892b80efc 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -70,6 +70,10 @@ def __init__(self, expressions, **kwargs): # References to local or external routines self._func_table = OrderedDict() + # Internal state. 
May be used to store information about previous runs, + # autotuning reports, etc + self._state = {} + # Expression lowering: indexification, substitution rules, specialization expressions = [indexify(i) for i in expressions] expressions = self._apply_substitutions(expressions, subs) @@ -167,8 +171,7 @@ def _prepare_arguments(self, **kwargs): args.update(kwargs.pop('backend', {})) # Execute autotuning and adjust arguments accordingly - if kwargs.pop('autotune', configuration['autotuning'].level): - args = self._autotune(args) + args = self._autotune(args, kwargs.pop('autotune', configuration['autotuning'])) # Check all user-provided keywords are known to the Operator if not configuration['ignore-unknowns']: @@ -254,7 +257,7 @@ def _profile_sections(self, iet): """Introduce C-level profiling nodes within the Iteration/Expression tree.""" return List(body=iet), None - def _autotune(self, args): + def _autotune(self, args, setup): """Use auto-tuning on this Operator to determine empirically the best block sizes when loop blocking is in use.""" return args @@ -436,7 +439,18 @@ def apply(self, **kwargs): # Invoke kernel function with args arg_values = [args[p.name] for p in self.parameters] - self.cfunction(*arg_values) + try: + self.cfunction(*arg_values) + except ctypes.ArgumentError as e: + if e.args[0].startswith("argument "): + argnum = int(e.args[0][9:].split(':')[0]) - 1 + newmsg = "error in argument '%s' with value '%s': %s" % ( + self.parameters[argnum].name, + arg_values[argnum], + e.args[0]) + raise ctypes.ArgumentError(newmsg) from e + else: + raise # Post-process runtime arguments self._postprocess_arguments(args, **kwargs) diff --git a/devito/yask/wrappers.py b/devito/yask/wrappers.py index 5d48f0a234..b32246764d 100644 --- a/devito/yask/wrappers.py +++ b/devito/yask/wrappers.py @@ -199,9 +199,9 @@ def pre_apply(self, toshare): self.soln.prepare_solution() # Set up auto-tuning - if configuration['autotuning'] is False: + if configuration['autotuning'].level is False: self.soln.reset_auto_tuner(False) - elif configuration['autotuning'] == 'preemptive': + elif configuration['autotuning'].mode == 'preemptive': self.soln.run_auto_tuner_now() def post_apply(self): diff --git a/setup.py b/setup.py index 32ec38c625..b4fcdcb949 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ import versioneer -import os from setuptools import setup, find_packages @@ -44,6 +43,6 @@ license='MIT', packages=find_packages(exclude=['docs', 'tests', 'examples']), install_requires=reqs, - extras_require={'extras': opt_reqs}, + extras_require={'extras': opt_reqs}, dependency_links=links, test_suite='tests') diff --git a/tests/test_autotuner.py b/tests/test_autotuner.py index 99c0ba0c4d..3d17a990ec 100644 --- a/tests/test_autotuner.py +++ b/tests/test_autotuner.py @@ -19,8 +19,8 @@ @silencio(log_level='DEBUG') @pytest.mark.parametrize("shape,expected", [ - ((30, 30), 17), - ((30, 30, 30), 21) + ((30, 30), 13), + ((30, 30, 30), 17) ]) def test_at_is_actually_working(shape, expected): """ @@ -39,18 +39,34 @@ def test_at_is_actually_working(shape, expected): stencil = Eq(outfield.indexify(), outfield.indexify() + infield.indexify()*3.0) op = Operator(stencil, dle=('blocking', {'blockinner': True, 'blockalways': True})) - # Expected 3 AT attempts for the given shape + # Run with whatever `configuration` says (by default, basic+preemptive) op(infield=infield, outfield=outfield, autotune=True) out = [i for i in buffer.getvalue().split('\n') if 'took' in i] assert len(out) == 4 - # Now try the same with aggressive 
autotuning, which tries 9 more cases + buffer.truncate(0) + + # Now try `aggressive` autotuning configuration['autotuning'] = 'aggressive' op(infield=infield, outfield=outfield, autotune=True) out = [i for i in buffer.getvalue().split('\n') if 'took' in i] assert len(out) == expected configuration['autotuning'] = configuration._defaults['autotuning'] + buffer.truncate(0) + + # Try again, but using the Operator API directly + op(infield=infield, outfield=outfield, autotune='aggressive') + out = [i for i in buffer.getvalue().split('\n') if 'took' in i] + assert len(out) == expected + + buffer.truncate(0) + + # Similar to above + op(infield=infield, outfield=outfield, autotune=('aggressive', 'preemptive')) + out = [i for i in buffer.getvalue().split('\n') if 'took' in i] + assert len(out) == expected + logger.removeHandler(temporary_handler) temporary_handler.flush() @@ -99,10 +115,10 @@ def test_timesteps_per_at_run(): stencil = Eq(outfield[t + to, x, y, z], outfield.indexify() + infield.indexify()*3.0) op = Operator(stencil, dle=('blocking', {'blockalways': True})) - op(infield=infield, outfield=outfield, t=2, autotune=True) + op(infield=infield, outfield=outfield, time=20, autotune=True) out = [i for i in buffer.getvalue().split('\n') if 'took' in i] assert len(out) == 4 - assert all('in %d timesteps' % options['at_squeezer'] in i for i in out) + assert all('in %d timesteps' % (options['at_squeezer'] + 1) in i for i in out) buffer.truncate(0) logger.removeHandler(temporary_handler) @@ -111,3 +127,29 @@ def test_timesteps_per_at_run(): temporary_handler.close() buffer.flush() buffer.close() + + +def test_nondestructive_forward(): + """Test autotuning in non-destructive mode.""" + grid = Grid(shape=(64, 64, 64)) + f = TimeFunction(name='f', grid=grid) + + op = Operator(Eq(f.forward, f + 1)) + op.apply(time=100, autotune=('basic', 'runtime')) + + # AT is expected to have executed 35 timesteps + assert np.all(f.data[0] == 100) + assert np.all(f.data[1] == 101) + + +def test_nondestructive_backward(): + """Test autotuning in non-destructive mode.""" + grid = Grid(shape=(64, 64, 64)) + f = TimeFunction(name='f', grid=grid) + + op = Operator(Eq(f.backward, f + 1)) + op.apply(time=101, autotune=('basic', 'runtime')) + + # AT is expected to have executed 35 timesteps + assert np.all(f.data[0] == 101) + assert np.all(f.data[1] == 100)
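
Usage note (illustrative only): the snippet below is a minimal sketch of the new `autotune` argument that this diff threads through `Operator.apply`/`_autotune`, mirroring the calls exercised in tests/test_autotuner.py. The `Grid`/`TimeFunction`/`Eq`/`Operator` API is Devito's own, as used in the tests; the shape and time values are arbitrary and not taken from the diff.

    from devito import Grid, TimeFunction, Eq, Operator

    grid = Grid(shape=(64, 64, 64))
    f = TimeFunction(name='f', grid=grid)
    op = Operator(Eq(f.forward, f + 1))

    # Boolean: tune with whatever `configuration['autotuning']` says
    # (by default, basic level + preemptive mode, per the test comments)
    op.apply(time=100, autotune=True)

    # String: override the level only; the mode still comes from `configuration`
    op.apply(time=100, autotune='aggressive')

    # 2-tuple: override both level and mode. In `runtime` mode the output data
    # is not shadow-copied, so the timesteps executed while tuning count towards
    # the actual run (the *_time_bounds helpers adjust the time range accordingly)
    op.apply(time=100, autotune=('basic', 'runtime'))

The 2-tuple form is what the new `test_nondestructive_forward`/`test_nondestructive_backward` tests rely on: with `('basic', 'runtime')`, the autotuner's warm-up timesteps advance the user's data rather than throw-away copies, which is why those tests can assert exact final values in `f.data`.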