diff --git a/.travis.yml b/.travis.yml index 009143ba2e..d25cd7b5e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,7 +96,7 @@ install: - if [[ $INSTALL_TYPE == 'conda' ]]; then conda env create -q -f environment.yml python=$TRAVIS_PYTHON_VERSION; source activate devito; - if [[ $MPI_INSTALL == '1' ]]; then pip install -e .[extras]; else pip install -e .; fi + pip install -e .; conda list; fi - if [[ "$MPI_INSTALL" == '1' ]]; then @@ -105,6 +105,7 @@ install: # Install devito with pip - if [[ $INSTALL_TYPE == 'pip_setup' ]]; then python setup.py install; fi + - if [[ $MPI_INSTALL == '1' ]]; then pip install -e .[extras]; fi before_script: - echo -e "Host github.com\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config diff --git a/devito/__init__.py b/devito/__init__.py index c4c9380109..c973c12f2e 100644 --- a/devito/__init__.py +++ b/devito/__init__.py @@ -62,11 +62,7 @@ def _at_callback(val): # noqa level, mode = val if level == 'off': level = False - if configuration['backend'] == 'core' and mode == 'runtime': - warning("Unsupported auto-tuning mode `runtime` with backend `core`") - return at_setup(level, 'preemptive') - else: - return at_setup(level, mode) + return at_setup(level, mode) configuration.add('autotuning', 'off', at_accepted, callback=_at_callback, # noqa impacts_jit=False) diff --git a/devito/core/autotuning.py b/devito/core/autotuning.py index 93cf2f1e83..6e3f045584 100644 --- a/devito/core/autotuning.py +++ b/devito/core/autotuning.py @@ -6,27 +6,50 @@ from operator import mul import resource -from devito.ir.iet import Iteration, FindNodes, FindSymbols +from devito.ir import Backward, Iteration, FindNodes, FindSymbols from devito.logger import perf, warning from devito.parameters import configuration __all__ = ['autotune'] -def autotune(operator, arguments, parameters, tunable): +def autotune(operator, args, level, mode): """ - Acting as a high-order function, take as input an operator and a list of - operator arguments to perform empirical autotuning. Some of the operator - arguments are marked as tunable. + Operator autotuning. + + Parameters + ---------- + operator : Operator + Input Operator. + args : dict_like + The runtime arguments with which `operator` is run. + level : str + The autotuning aggressiveness (basic, aggressive). A more aggressive + autotuning might eventually result in higher performance, though in + some circumstances it might instead increase the actual runtime. + mode : str + The autotuning mode (preemptive, runtime). In preemptive mode, the + output runtime values supplied by the user to `operator.apply` are + replaced with shadow copies. 
""" + key = [level, mode] + accepted = configuration._accepted['autotuning'] + if key not in accepted: + raise ValueError("The accepted `(level, mode)` combinations are `%s`; " + "provided `%s` instead" % (accepted, key)) + + parameters = operator.parameters + tunable = operator._dle_args + # We get passed all the arguments, but the cfunction only requires a subset - at_arguments = OrderedDict([(p.name, arguments[p.name]) for p in parameters]) + at_args = OrderedDict([(p.name, args[p.name]) for p in parameters]) - # User-provided output data must not be altered - output = [i.name for i in operator.output] - for k, v in arguments.items(): - if k in output: - at_arguments[k] = v.copy() + # User-provided output data won't be altered in `preemptive` mode + if mode == 'preemptive': + output = [i.name for i in operator.output] + for k, v in args.items(): + if k in output: + at_args[k] = v.copy() iterations = FindNodes(Iteration).visit(operator.body) dim_mapper = {i.dim.name: i.dim for i in iterations} @@ -35,33 +58,27 @@ def autotune(operator, arguments, parameters, tunable): # runs will finish quickly steppers = [i for i in iterations if i.dim.is_Time] if len(steppers) == 0: + stepper = None timesteps = 1 elif len(steppers) == 1: stepper = steppers[0] - start = at_arguments[stepper.dim.min_name] - timesteps = stepper.extent(start=start, finish=options['at_squeezer']) - 1 - if timesteps < 0: - timesteps = options['at_squeezer'] - timesteps - perf("AutoTuner: Number of timesteps adjusted to %d" % timesteps) - at_arguments[stepper.dim.min_name] = start - at_arguments[stepper.dim.max_name] = timesteps - if stepper.dim.is_Stepping: - at_arguments[stepper.dim.parent.min_name] = start - at_arguments[stepper.dim.parent.max_name] = timesteps + timesteps = init_time_bounds(stepper, at_args) + if timesteps is None: + return args else: warning("AutoTuner: Couldn't understand loop structure; giving up") - return arguments + return args # Attempted block sizes ... - mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable]) + mapper = OrderedDict([(i.tunable.name, i) for i in tunable]) # ... Defaults (basic mode) blocksizes = [OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']] # ... Always try the entire iteration space (degenerate block) - itershape = [mapper[i].iteration.symbolic_extent.subs(arguments) for i in mapper] + itershape = [mapper[i].iteration.symbolic_extent.subs(args) for i in mapper] blocksizes.append(OrderedDict([(i, mapper[i].iteration.extent(0, j-1)) for i, j in zip(mapper, itershape)])) # ... More attempts if auto-tuning in aggressive mode - if configuration['autotuning'].level == 'aggressive': + if level == 'aggressive': blocksizes = more_heuristic_attempts(blocksizes) # How many temporaries are allocated on the stack? @@ -75,15 +92,18 @@ def autotune(operator, arguments, parameters, tunable): # square blocks are tested timings = OrderedDict() for bs in blocksizes: + # Can we safely autotune over the given time range? 
+ check_time_bounds(stepper, at_args, args) + illegal = False - for k, v in at_arguments.items(): + for k, v in at_args.items(): if k in bs: val = bs[k] - start = mapper[k].original_dim.symbolic_start.subs(arguments) - end = mapper[k].original_dim.symbolic_end.subs(arguments) + start = mapper[k].original_dim.symbolic_start.subs(args) + end = mapper[k].original_dim.symbolic_end.subs(args) if val <= mapper[k].iteration.extent(start, end): - at_arguments[k] = val + at_args[k] = val else: # Block size cannot be larger than actual dimension illegal = True @@ -93,7 +113,7 @@ def autotune(operator, arguments, parameters, tunable): # Make sure we remain within stack bounds, otherwise skip block size dim_sizes = {} - for k, v in at_arguments.items(): + for k, v in at_args.items(): if k in bs: dim_sizes[mapper[k].argument.symbolic_size] = bs[k] elif k in dim_mapper: @@ -113,31 +133,93 @@ def autotune(operator, arguments, parameters, tunable): # Use AutoTuner-specific profiler structs timer = operator.profiler.timer.reset() - at_arguments[operator.profiler.name] = timer + at_args[operator.profiler.name] = timer - operator.cfunction(*list(at_arguments.values())) + operator.cfunction(*list(at_args.values())) elapsed = sum(getattr(timer._obj, i) for i, _ in timer._obj._fields_) timings[tuple(bs.items())] = elapsed perf("AutoTuner: Block shape <%s> took %f (s) in %d timesteps" % (','.join('%d' % i for i in bs.values()), elapsed, timesteps)) + # Prepare for the next autotuning run + update_time_bounds(stepper, at_args, timesteps, mode) + try: best = dict(min(timings, key=timings.get)) perf("AutoTuner: Selected block shape %s" % best) except ValueError: warning("AutoTuner: Couldn't find legal block shapes") - return arguments + return args # Build the new argument list - tuned = OrderedDict() - for k, v in arguments.items(): - tuned[k] = best[k] if k in mapper else v + args = {k: best.get(k, v) for k, v in args.items()} + + # In `runtime` mode, some timesteps have been executed already, so we + # get to adjust the time range + finalize_time_bounds(stepper, at_args, args, mode) + + # Reset profiling data + assert operator.profiler.name in args + args[operator.profiler.name] = operator.profiler.timer.reset() + + return args + + +def init_time_bounds(stepper, at_args): + if stepper is None: + return + dim = stepper.dim.root + if stepper.direction is Backward: + at_args[dim.max_name] = at_args[dim.max_name] + at_args[dim.min_name] = at_args[dim.max_name] - options['at_squeezer'] + if at_args[dim.max_name] < at_args[dim.min_name]: + warning("AutoTuner: too few time iterations; giving up") + return False + else: + at_args[dim.min_name] = at_args[dim.min_name] + at_args[dim.max_name] = at_args[dim.min_name] + options['at_squeezer'] + if at_args[dim.min_name] > at_args[dim.max_name]: + warning("AutoTuner: too few time iterations; giving up") + return False + + return stepper.extent(start=at_args[dim.min_name], finish=at_args[dim.max_name]) + + +def check_time_bounds(stepper, at_args, args): + if stepper is None: + return + dim = stepper.dim.root + if stepper.direction is Backward: + if at_args[dim.min_name] < args[dim.min_name]: + raise ValueError("Too few time iterations") + + else: + if at_args[dim.max_name] > args[dim.max_name]: + raise ValueError("Too few time iterations") + - # Reset the profiling struct - assert operator.profiler.name in tuned - tuned[operator.profiler.name] = operator.profiler.timer.reset() +def update_time_bounds(stepper, at_args, timesteps, mode): + if mode != 'runtime' or stepper is 
None: + return + dim = stepper.dim.root + if stepper.direction is Backward: + at_args[dim.max_name] -= timesteps + at_args[dim.min_name] -= timesteps + else: + at_args[dim.min_name] += timesteps + at_args[dim.max_name] += timesteps - return tuned + +def finalize_time_bounds(stepper, at_args, args, mode): + if mode != 'runtime' or stepper is None: + return + dim = stepper.dim.root + if stepper.direction is Backward: + args[dim.max_name] = at_args[dim.max_name] + args[dim.min_name] = args[dim.min_name] + else: + args[dim.min_name] = at_args[dim.min_name] + args[dim.max_name] = args[dim.max_name] def more_heuristic_attempts(blocksizes): diff --git a/devito/core/operator.py b/devito/core/operator.py index 26b6e5c130..2e66ea77a1 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -70,11 +70,30 @@ def _generate_mpi(self, iet, **kwargs): return iet - def _autotune(self, args): - if self._dle_flags.get('blocking', False): - return autotune(self, args, self.parameters, self._dle_args) - else: + def _autotune(self, args, setup): + if setup is False or not self._dle_flags.get('blocking'): return args + elif setup is True: + level = configuration['autotuning'].level or 'basic' + args = autotune(self, args, level, configuration['autotuning'].mode) + elif isinstance(setup, str): + args = autotune(self, args, setup, configuration['autotuning'].mode) + elif isinstance(setup, tuple) and len(setup) == 2: + level, mode = setup + if level is False: + return args + else: + args = autotune(self, args, level, mode) + else: + raise ValueError("Expected bool, str, or 2-tuple, got `%s` instead" + % type(setup)) + + # Record the tuned values + mapper = self._state.setdefault('tuned', {}) + mapper.update({k: v for k, v in args.items() + if k in [i.tunable.name for i in self._dle_args]}) + + return args class OperatorDebug(OperatorCore): diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 8632fad4b7..d27af3fb52 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -65,9 +65,9 @@ def alloc(self, shape, dtype): if c_pointer is None: raise RuntimeError("Unable to allocate %d elements in memory", str(size)) - c_pointer = ctypes.cast(c_pointer, np.ctypeslib.ndpointer(dtype=dtype, - shape=shape)) - pointer = np.ctypeslib.as_array(c_pointer, shape=shape) + c_pointer_cast = ctypes.cast( + c_pointer, np.ctypeslib.ndpointer(dtype=dtype, shape=shape)) + pointer = np.ctypeslib.as_array(c_pointer_cast, shape=shape) return (pointer, memfree_args) @@ -242,7 +242,14 @@ def _alloc_C_libcall(self, size, ctype): if not self.available(): raise RuntimeError("Couldn't find `libnuma`'s `numa_alloc_*` to " "allocate memory") - c_bytesize = ctypes.c_ulong(size * ctypes.sizeof(ctype)) + + if size == 0: + # work around the fact that the allocator may return NULL when + # the size is 0, and numpy does not like that + c_bytesize = ctypes.c_ulong(1) + else: + c_bytesize = ctypes.c_ulong(size * ctypes.sizeof(ctype)) + if self.put_onnode: c_pointer = self.lib.numa_alloc_onnode(c_bytesize, self._node) elif self.put_local: @@ -253,7 +260,8 @@ def _alloc_C_libcall(self, size, ctype): # note! even though restype was set above, ctypes returns a # python integer. # See https://stackoverflow.com/questions/17840144/ - if c_pointer == 0: + # edit: it apparently can return None, also! 
+ if c_pointer == 0 or c_pointer is None: return None, None else: # Convert it back to a void * - this is diff --git a/devito/dle/backends/common.py b/devito/dle/backends/common.py index 6360b48da9..227bb578cb 100644 --- a/devito/dle/backends/common.py +++ b/devito/dle/backends/common.py @@ -81,6 +81,10 @@ def __repr__(self): def original_dim(self): return self.iteration.dim + @property + def tunable(self): + return self.argument.symbolic_size + class AbstractRewriter(object): """ diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 83a46b017f..7111fe4c46 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -6,7 +6,8 @@ from frozendict import frozendict -from devito.tools import PartialOrderTuple, as_tuple, filter_ordered, toposort +from devito.tools import PartialOrderTuple, as_tuple, filter_ordered, toposort, is_integer + __all__ = ['NullInterval', 'Interval', 'IntervalGroup', 'IterationSpace', 'DataSpace', 'Forward', 'Backward', 'Any'] @@ -112,8 +113,8 @@ class Interval(AbstractInterval): is_Defined = True def __init__(self, dim, lower, upper): - assert isinstance(lower, int) - assert isinstance(upper, int) + assert is_integer(lower) + assert is_integer(upper) super(Interval, self).__init__(dim) self.lower = lower self.upper = upper @@ -290,7 +291,7 @@ def zero(self, d=None): relations=self.relations) def __getitem__(self, key): - if isinstance(key, (slice, int)): + if isinstance(key, slice) or is_integer(key): return super(IntervalGroup, self).__getitem__(key) if not self.is_well_defined: raise ValueError("Cannot fetch Interval from ill defined Space") diff --git a/devito/operator.py b/devito/operator.py index f1f92923ee..f892b80efc 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -70,6 +70,10 @@ def __init__(self, expressions, **kwargs): # References to local or external routines self._func_table = OrderedDict() + # Internal state. 
May be used to store information about previous runs, + # autotuning reports, etc + self._state = {} + # Expression lowering: indexification, substitution rules, specialization expressions = [indexify(i) for i in expressions] expressions = self._apply_substitutions(expressions, subs) @@ -167,8 +171,7 @@ def _prepare_arguments(self, **kwargs): args.update(kwargs.pop('backend', {})) # Execute autotuning and adjust arguments accordingly - if kwargs.pop('autotune', configuration['autotuning'].level): - args = self._autotune(args) + args = self._autotune(args, kwargs.pop('autotune', configuration['autotuning'])) # Check all user-provided keywords are known to the Operator if not configuration['ignore-unknowns']: @@ -254,7 +257,7 @@ def _profile_sections(self, iet): """Introduce C-level profiling nodes within the Iteration/Expression tree.""" return List(body=iet), None - def _autotune(self, args): + def _autotune(self, args, setup): """Use auto-tuning on this Operator to determine empirically the best block sizes when loop blocking is in use.""" return args @@ -436,7 +439,18 @@ def apply(self, **kwargs): # Invoke kernel function with args arg_values = [args[p.name] for p in self.parameters] - self.cfunction(*arg_values) + try: + self.cfunction(*arg_values) + except ctypes.ArgumentError as e: + if e.args[0].startswith("argument "): + argnum = int(e.args[0][9:].split(':')[0]) - 1 + newmsg = "error in argument '%s' with value '%s': %s" % ( + self.parameters[argnum].name, + arg_values[argnum], + e.args[0]) + raise ctypes.ArgumentError(newmsg) from e + else: + raise # Post-process runtime arguments self._postprocess_arguments(args, **kwargs) diff --git a/devito/yask/wrappers.py b/devito/yask/wrappers.py index 5d48f0a234..b32246764d 100644 --- a/devito/yask/wrappers.py +++ b/devito/yask/wrappers.py @@ -199,9 +199,9 @@ def pre_apply(self, toshare): self.soln.prepare_solution() # Set up auto-tuning - if configuration['autotuning'] is False: + if configuration['autotuning'].level is False: self.soln.reset_auto_tuner(False) - elif configuration['autotuning'] == 'preemptive': + elif configuration['autotuning'].mode == 'preemptive': self.soln.run_auto_tuner_now() def post_apply(self): diff --git a/setup.py b/setup.py index 32ec38c625..b4fcdcb949 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ import versioneer -import os from setuptools import setup, find_packages @@ -44,6 +43,6 @@ license='MIT', packages=find_packages(exclude=['docs', 'tests', 'examples']), install_requires=reqs, - extras_require={'extras': opt_reqs}, + extras_require={'extras': opt_reqs}, dependency_links=links, test_suite='tests') diff --git a/tests/test_autotuner.py b/tests/test_autotuner.py index 99c0ba0c4d..3d17a990ec 100644 --- a/tests/test_autotuner.py +++ b/tests/test_autotuner.py @@ -19,8 +19,8 @@ @silencio(log_level='DEBUG') @pytest.mark.parametrize("shape,expected", [ - ((30, 30), 17), - ((30, 30, 30), 21) + ((30, 30), 13), + ((30, 30, 30), 17) ]) def test_at_is_actually_working(shape, expected): """ @@ -39,18 +39,34 @@ def test_at_is_actually_working(shape, expected): stencil = Eq(outfield.indexify(), outfield.indexify() + infield.indexify()*3.0) op = Operator(stencil, dle=('blocking', {'blockinner': True, 'blockalways': True})) - # Expected 3 AT attempts for the given shape + # Run with whatever `configuration` says (by default, basic+preemptive) op(infield=infield, outfield=outfield, autotune=True) out = [i for i in buffer.getvalue().split('\n') if 'took' in i] assert len(out) == 4 - # Now try the same with aggressive 
autotuning, which tries 9 more cases + buffer.truncate(0) + + # Now try `aggressive` autotuning configuration['autotuning'] = 'aggressive' op(infield=infield, outfield=outfield, autotune=True) out = [i for i in buffer.getvalue().split('\n') if 'took' in i] assert len(out) == expected configuration['autotuning'] = configuration._defaults['autotuning'] + buffer.truncate(0) + + # Try again, but using the Operator API directly + op(infield=infield, outfield=outfield, autotune='aggressive') + out = [i for i in buffer.getvalue().split('\n') if 'took' in i] + assert len(out) == expected + + buffer.truncate(0) + + # Similar to above + op(infield=infield, outfield=outfield, autotune=('aggressive', 'preemptive')) + out = [i for i in buffer.getvalue().split('\n') if 'took' in i] + assert len(out) == expected + logger.removeHandler(temporary_handler) temporary_handler.flush() @@ -99,10 +115,10 @@ def test_timesteps_per_at_run(): stencil = Eq(outfield[t + to, x, y, z], outfield.indexify() + infield.indexify()*3.0) op = Operator(stencil, dle=('blocking', {'blockalways': True})) - op(infield=infield, outfield=outfield, t=2, autotune=True) + op(infield=infield, outfield=outfield, time=20, autotune=True) out = [i for i in buffer.getvalue().split('\n') if 'took' in i] assert len(out) == 4 - assert all('in %d timesteps' % options['at_squeezer'] in i for i in out) + assert all('in %d timesteps' % (options['at_squeezer'] + 1) in i for i in out) buffer.truncate(0) logger.removeHandler(temporary_handler) @@ -111,3 +127,29 @@ def test_timesteps_per_at_run(): temporary_handler.close() buffer.flush() buffer.close() + + +def test_nondestructive_forward(): + """Test autotuning in non-destructive mode.""" + grid = Grid(shape=(64, 64, 64)) + f = TimeFunction(name='f', grid=grid) + + op = Operator(Eq(f.forward, f + 1)) + op.apply(time=100, autotune=('basic', 'runtime')) + + # AT is expected to have executed 35 timesteps + assert np.all(f.data[0] == 100) + assert np.all(f.data[1] == 101) + + +def test_nondestructive_backward(): + """Test autotuning in non-destructive mode.""" + grid = Grid(shape=(64, 64, 64)) + f = TimeFunction(name='f', grid=grid) + + op = Operator(Eq(f.backward, f + 1)) + op.apply(time=101, autotune=('basic', 'runtime')) + + # AT is expected to have executed 35 timesteps + assert np.all(f.data[0] == 101) + assert np.all(f.data[1] == 100)
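
Usage note (illustrative only): the snippet below is a minimal sketch of the new `autotune` argument that this diff threads through `Operator.apply`/`_autotune`, mirroring the calls exercised in tests/test_autotuner.py. The `Grid`/`TimeFunction`/`Eq`/`Operator` API is Devito's own, as used in the tests; the shape and time values are arbitrary and not taken from the diff.

    from devito import Grid, TimeFunction, Eq, Operator

    grid = Grid(shape=(64, 64, 64))
    f = TimeFunction(name='f', grid=grid)
    op = Operator(Eq(f.forward, f + 1))

    # Boolean: tune with whatever `configuration['autotuning']` says
    # (by default, basic level + preemptive mode, per the test comments)
    op.apply(time=100, autotune=True)

    # String: override the level only; the mode still comes from `configuration`
    op.apply(time=100, autotune='aggressive')

    # 2-tuple: override both level and mode. In `runtime` mode the output data
    # is not shadow-copied, so the timesteps executed while tuning count towards
    # the actual run (the *_time_bounds helpers adjust the time range accordingly)
    op.apply(time=100, autotune=('basic', 'runtime'))

The 2-tuple form is what the new `test_nondestructive_forward`/`test_nondestructive_backward` tests rely on: with `('basic', 'runtime')`, the autotuner's warm-up timesteps advance the user's data rather than throw-away copies, which is why those tests can assert exact final values in `f.data`.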