Merge remote-tracking branch 'origin/master' into no-mpi
mloubout committed Nov 21, 2018
2 parents d89042e + 7007690 commit 059ff82
Showing 11 changed files with 239 additions and 73 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
@@ -96,7 +96,7 @@ install:
- if [[ $INSTALL_TYPE == 'conda' ]]; then
conda env create -q -f environment.yml python=$TRAVIS_PYTHON_VERSION;
source activate devito;
if [[ $MPI_INSTALL == '1' ]]; then pip install -e .[extras]; else pip install -e .; fi
pip install -e .;
conda list;
fi
- if [[ "$MPI_INSTALL" == '1' ]]; then
@@ -105,6 +105,7 @@ install:

# Install devito with pip
- if [[ $INSTALL_TYPE == 'pip_setup' ]]; then python setup.py install; fi
- if [[ $MPI_INSTALL == '1' ]]; then pip install -e .[extras]; fi

before_script:
- echo -e "Host github.com\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config
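The MPI-specific dependencies are now installed in a dedicated step (`pip install -e .[extras]`) rather than inside the conda branch. This relies on an `extras` group being declared in the project's `setup.py`; a minimal sketch of such a declaration (the dependency list here is an assumption, `mpi4py` being implied by the `MPI_INSTALL` guard):

```python
# setup.py (sketch) -- defines the `[extras]` target used by `pip install -e .[extras]`
from setuptools import setup, find_packages

setup(
    name='devito',
    packages=find_packages(),
    extras_require={
        # hypothetical contents; the authoritative list lives in the repository's setup.py
        'extras': ['mpi4py'],
    },
)
```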
6 changes: 1 addition & 5 deletions devito/__init__.py
@@ -62,11 +62,7 @@ def _at_callback(val):  # noqa
level, mode = val
if level == 'off':
level = False
if configuration['backend'] == 'core' and mode == 'runtime':
warning("Unsupported auto-tuning mode `runtime` with backend `core`")
return at_setup(level, 'preemptive')
else:
return at_setup(level, mode)
return at_setup(level, mode)
configuration.add('autotuning', 'off', at_accepted, callback=_at_callback, # noqa
impacts_jit=False)

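The callback no longer special-cases the `core` backend: a requested `runtime` mode is passed through instead of being coerced to `preemptive`. A standalone sketch of the behavioural difference (illustrative names; `at_setup` stands in for the real setup helper):

```python
def at_setup(level, mode):
    # stand-in for the real `at_setup`, which records the autotuning choice
    return level, mode

# Before: `runtime` was silently downgraded to `preemptive` on the `core` backend
def _at_callback_old(val, backend='core'):
    level, mode = val
    if level == 'off':
        level = False
    if backend == 'core' and mode == 'runtime':
        return at_setup(level, 'preemptive')
    return at_setup(level, mode)

# After: the requested mode is always honoured
def _at_callback_new(val):
    level, mode = val
    if level == 'off':
        level = False
    return at_setup(level, mode)

assert _at_callback_old(('basic', 'runtime')) == ('basic', 'preemptive')
assert _at_callback_new(('basic', 'runtime')) == ('basic', 'runtime')
```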
162 changes: 122 additions & 40 deletions devito/core/autotuning.py
@@ -6,27 +6,50 @@
from operator import mul
import resource

from devito.ir.iet import Iteration, FindNodes, FindSymbols
from devito.ir import Backward, Iteration, FindNodes, FindSymbols
from devito.logger import perf, warning
from devito.parameters import configuration

__all__ = ['autotune']


def autotune(operator, arguments, parameters, tunable):
def autotune(operator, args, level, mode):
"""
Acting as a high-order function, take as input an operator and a list of
operator arguments to perform empirical autotuning. Some of the operator
arguments are marked as tunable.
Operator autotuning.
Parameters
----------
operator : Operator
Input Operator.
args : dict_like
The runtime arguments with which `operator` is run.
level : str
The autotuning aggressiveness (basic, aggressive). A more aggressive
autotuning might eventually result in higher performance, though in
some circumstances it might instead increase the actual runtime.
mode : str
The autotuning mode (preemptive, runtime). In preemptive mode, the
output runtime values supplied by the user to `operator.apply` are
replaced with shadow copies.
"""
key = [level, mode]
accepted = configuration._accepted['autotuning']
if key not in accepted:
raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
"provided `%s` instead" % (accepted, key))

parameters = operator.parameters
tunable = operator._dle_args

# We get passed all the arguments, but the cfunction only requires a subset
at_arguments = OrderedDict([(p.name, arguments[p.name]) for p in parameters])
at_args = OrderedDict([(p.name, args[p.name]) for p in parameters])

# User-provided output data must not be altered
output = [i.name for i in operator.output]
for k, v in arguments.items():
if k in output:
at_arguments[k] = v.copy()
# User-provided output data won't be altered in `preemptive` mode
if mode == 'preemptive':
output = [i.name for i in operator.output]
for k, v in args.items():
if k in output:
at_args[k] = v.copy()

iterations = FindNodes(Iteration).visit(operator.body)
dim_mapper = {i.dim.name: i.dim for i in iterations}
@@ -35,33 +58,27 @@ def autotune(operator, arguments, parameters, tunable):
# runs will finish quickly
steppers = [i for i in iterations if i.dim.is_Time]
if len(steppers) == 0:
stepper = None
timesteps = 1
elif len(steppers) == 1:
stepper = steppers[0]
start = at_arguments[stepper.dim.min_name]
timesteps = stepper.extent(start=start, finish=options['at_squeezer']) - 1
if timesteps < 0:
timesteps = options['at_squeezer'] - timesteps
perf("AutoTuner: Number of timesteps adjusted to %d" % timesteps)
at_arguments[stepper.dim.min_name] = start
at_arguments[stepper.dim.max_name] = timesteps
if stepper.dim.is_Stepping:
at_arguments[stepper.dim.parent.min_name] = start
at_arguments[stepper.dim.parent.max_name] = timesteps
timesteps = init_time_bounds(stepper, at_args)
if timesteps is None:
return args
else:
warning("AutoTuner: Couldn't understand loop structure; giving up")
return arguments
return args

# Attempted block sizes ...
mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable])
mapper = OrderedDict([(i.tunable.name, i) for i in tunable])
# ... Defaults (basic mode)
blocksizes = [OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']]
# ... Always try the entire iteration space (degenerate block)
itershape = [mapper[i].iteration.symbolic_extent.subs(arguments) for i in mapper]
itershape = [mapper[i].iteration.symbolic_extent.subs(args) for i in mapper]
blocksizes.append(OrderedDict([(i, mapper[i].iteration.extent(0, j-1))
for i, j in zip(mapper, itershape)]))
# ... More attempts if auto-tuning in aggressive mode
if configuration['autotuning'].level == 'aggressive':
if level == 'aggressive':
blocksizes = more_heuristic_attempts(blocksizes)

# How many temporaries are allocated on the stack?
@@ -75,15 +92,18 @@
# square blocks are tested
timings = OrderedDict()
for bs in blocksizes:
# Can we safely autotune over the given time range?
check_time_bounds(stepper, at_args, args)

illegal = False
for k, v in at_arguments.items():
for k, v in at_args.items():
if k in bs:
val = bs[k]
start = mapper[k].original_dim.symbolic_start.subs(arguments)
end = mapper[k].original_dim.symbolic_end.subs(arguments)
start = mapper[k].original_dim.symbolic_start.subs(args)
end = mapper[k].original_dim.symbolic_end.subs(args)

if val <= mapper[k].iteration.extent(start, end):
at_arguments[k] = val
at_args[k] = val
else:
# Block size cannot be larger than actual dimension
illegal = True
@@ -93,7 +113,7 @@

# Make sure we remain within stack bounds, otherwise skip block size
dim_sizes = {}
for k, v in at_arguments.items():
for k, v in at_args.items():
if k in bs:
dim_sizes[mapper[k].argument.symbolic_size] = bs[k]
elif k in dim_mapper:
@@ -113,31 +133,93 @@

# Use AutoTuner-specific profiler structs
timer = operator.profiler.timer.reset()
at_arguments[operator.profiler.name] = timer
at_args[operator.profiler.name] = timer

operator.cfunction(*list(at_arguments.values()))
operator.cfunction(*list(at_args.values()))
elapsed = sum(getattr(timer._obj, i) for i, _ in timer._obj._fields_)
timings[tuple(bs.items())] = elapsed
perf("AutoTuner: Block shape <%s> took %f (s) in %d timesteps" %
(','.join('%d' % i for i in bs.values()), elapsed, timesteps))

# Prepare for the next autotuning run
update_time_bounds(stepper, at_args, timesteps, mode)

try:
best = dict(min(timings, key=timings.get))
perf("AutoTuner: Selected block shape %s" % best)
except ValueError:
warning("AutoTuner: Couldn't find legal block shapes")
return arguments
return args

# Build the new argument list
tuned = OrderedDict()
for k, v in arguments.items():
tuned[k] = best[k] if k in mapper else v
args = {k: best.get(k, v) for k, v in args.items()}

# In `runtime` mode, some timesteps have been executed already, so we
# get to adjust the time range
finalize_time_bounds(stepper, at_args, args, mode)

# Reset profiling data
assert operator.profiler.name in args
args[operator.profiler.name] = operator.profiler.timer.reset()

return args


def init_time_bounds(stepper, at_args):
if stepper is None:
return
dim = stepper.dim.root
if stepper.direction is Backward:
at_args[dim.max_name] = at_args[dim.max_name]
at_args[dim.min_name] = at_args[dim.max_name] - options['at_squeezer']
if at_args[dim.max_name] < at_args[dim.min_name]:
warning("AutoTuner: too few time iterations; giving up")
return False
else:
at_args[dim.min_name] = at_args[dim.min_name]
at_args[dim.max_name] = at_args[dim.min_name] + options['at_squeezer']
if at_args[dim.min_name] > at_args[dim.max_name]:
warning("AutoTuner: too few time iterations; giving up")
return False

return stepper.extent(start=at_args[dim.min_name], finish=at_args[dim.max_name])


def check_time_bounds(stepper, at_args, args):
if stepper is None:
return
dim = stepper.dim.root
if stepper.direction is Backward:
if at_args[dim.min_name] < args[dim.min_name]:
raise ValueError("Too few time iterations")

else:
if at_args[dim.max_name] > args[dim.max_name]:
raise ValueError("Too few time iterations")


# Reset the profiling struct
assert operator.profiler.name in tuned
tuned[operator.profiler.name] = operator.profiler.timer.reset()
def update_time_bounds(stepper, at_args, timesteps, mode):
if mode != 'runtime' or stepper is None:
return
dim = stepper.dim.root
if stepper.direction is Backward:
at_args[dim.max_name] -= timesteps
at_args[dim.min_name] -= timesteps
else:
at_args[dim.min_name] += timesteps
at_args[dim.max_name] += timesteps

return tuned

def finalize_time_bounds(stepper, at_args, args, mode):
if mode != 'runtime' or stepper is None:
return
dim = stepper.dim.root
if stepper.direction is Backward:
args[dim.max_name] = at_args[dim.max_name]
args[dim.min_name] = args[dim.min_name]
else:
args[dim.min_name] = at_args[dim.min_name]
args[dim.max_name] = args[dim.max_name]


def more_heuristic_attempts(blocksizes):
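The bulk of this file's changes introduce the `init/check/update/finalize_time_bounds` helpers: each trial run is confined to a short time window and, in `runtime` mode, the window slides (forward or backward, depending on the stepping direction) so that no timestep is executed twice. A simplified, self-contained sketch of the forward-direction bookkeeping (`SQUEEZER` is a hypothetical stand-in for `options['at_squeezer']`; `time_m`/`time_M` mirror the runtime argument names):

```python
SQUEEZER = 4  # hypothetical stand-in for options['at_squeezer']

def init_window(args, t_min='time_m', t_max='time_M'):
    """Shrink the user-supplied time range to a short trial window."""
    window = dict(args)
    window[t_max] = window[t_min] + SQUEEZER
    if window[t_min] > window[t_max]:
        return None  # too few timesteps to autotune over
    return window

def slide_window(window, timesteps, t_min='time_m', t_max='time_M'):
    """`runtime` mode: advance the window so the next trial runs fresh timesteps."""
    window[t_min] += timesteps
    window[t_max] += timesteps

args = {'time_m': 0, 'time_M': 100}
window = init_window(args)
for _ in range(3):                      # one trial per candidate block shape
    # ... the operator would be run over `window` here ...
    slide_window(window, SQUEEZER + 1)
print(window)                           # {'time_m': 15, 'time_M': 19}
```

In `preemptive` mode the same window is reused for every trial and the user's output data is shadow-copied, so repeated execution over the same timesteps does no harm.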
27 changes: 23 additions & 4 deletions devito/core/operator.py
@@ -70,11 +70,30 @@ def _generate_mpi(self, iet, **kwargs):

return iet

def _autotune(self, args):
if self._dle_flags.get('blocking', False):
return autotune(self, args, self.parameters, self._dle_args)
else:
def _autotune(self, args, setup):
if setup is False or not self._dle_flags.get('blocking'):
return args
elif setup is True:
level = configuration['autotuning'].level or 'basic'
args = autotune(self, args, level, configuration['autotuning'].mode)
elif isinstance(setup, str):
args = autotune(self, args, setup, configuration['autotuning'].mode)
elif isinstance(setup, tuple) and len(setup) == 2:
level, mode = setup
if level is False:
return args
else:
args = autotune(self, args, level, mode)
else:
raise ValueError("Expected bool, str, or 2-tuple, got `%s` instead"
% type(setup))

# Record the tuned values
mapper = self._state.setdefault('tuned', {})
mapper.update({k: v for k, v in args.items()
if k in [i.tunable.name for i in self._dle_args]})

return args


class OperatorDebug(OperatorCore):
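`_autotune` now receives a `setup` value that may be a bool, a level string, or a `(level, mode)` tuple, with the configuration supplying whatever is left unspecified. A standalone sketch of that normalisation (illustrative names; the defaults stand in for `configuration['autotuning']`):

```python
def normalize_at_setup(setup, default_level='basic', default_mode='preemptive'):
    """Reduce the user-facing `setup` value to a `(level, mode)` pair, or None."""
    if setup is False:
        return None                          # autotuning disabled
    if setup is True:
        return default_level, default_mode
    if isinstance(setup, str):
        return setup, default_mode
    if isinstance(setup, tuple) and len(setup) == 2:
        level, mode = setup
        return None if level is False else (level, mode)
    raise ValueError("Expected bool, str, or 2-tuple, got `%s` instead" % type(setup))

assert normalize_at_setup(True) == ('basic', 'preemptive')
assert normalize_at_setup('aggressive') == ('aggressive', 'preemptive')
assert normalize_at_setup(('basic', 'runtime')) == ('basic', 'runtime')
assert normalize_at_setup(False) is None
```

Presumably the value originates from an `autotune` keyword passed to `Operator.apply` (e.g. `op.apply(..., autotune=('aggressive', 'runtime'))`), though that call site is not part of this diff.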
18 changes: 13 additions & 5 deletions devito/data/allocators.py
@@ -65,9 +65,9 @@ def alloc(self, shape, dtype):
if c_pointer is None:
raise RuntimeError("Unable to allocate %d elements in memory", str(size))

c_pointer = ctypes.cast(c_pointer, np.ctypeslib.ndpointer(dtype=dtype,
shape=shape))
pointer = np.ctypeslib.as_array(c_pointer, shape=shape)
c_pointer_cast = ctypes.cast(
c_pointer, np.ctypeslib.ndpointer(dtype=dtype, shape=shape))
pointer = np.ctypeslib.as_array(c_pointer_cast, shape=shape)

return (pointer, memfree_args)

@@ -242,7 +242,14 @@ def _alloc_C_libcall(self, size, ctype):
if not self.available():
raise RuntimeError("Couldn't find `libnuma`'s `numa_alloc_*` to "
"allocate memory")
c_bytesize = ctypes.c_ulong(size * ctypes.sizeof(ctype))

if size == 0:
# work around the fact that the allocator may return NULL when
# the size is 0, and numpy does not like that
c_bytesize = ctypes.c_ulong(1)
else:
c_bytesize = ctypes.c_ulong(size * ctypes.sizeof(ctype))

if self.put_onnode:
c_pointer = self.lib.numa_alloc_onnode(c_bytesize, self._node)
elif self.put_local:
Expand All @@ -253,7 +260,8 @@ def _alloc_C_libcall(self, size, ctype):
# note! even though restype was set above, ctypes returns a
# python integer.
# See https://stackoverflow.com/questions/17840144/
if c_pointer == 0:
# edit: it apparently can return None, also!
if c_pointer == 0 or c_pointer is None:
return None, None
else:
# Convert it back to a void * - this is
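Two fixes here: the raw `c_pointer` is no longer overwritten by the cast (it is still needed by `memfree_args` to release the allocation), and a zero-byte request is rounded up to one byte so the NUMA allocator never hands back NULL for an empty array. A self-contained sketch of the cast pattern, using plain `malloc`/`free` as stand-ins for the allocator's actual libc/libnuma calls:

```python
import ctypes
import ctypes.util

import numpy as np

libc = ctypes.CDLL(ctypes.util.find_library('c'))
libc.malloc.restype = ctypes.c_void_p
libc.free.argtypes = [ctypes.c_void_p]

shape, dtype = (4, 4), np.float32
nbytes = int(np.prod(shape)) * np.dtype(dtype).itemsize

c_pointer = ctypes.c_void_p(libc.malloc(ctypes.c_size_t(nbytes)))
if not c_pointer.value:
    raise MemoryError("malloc returned NULL")

# Cast into a typed pointer *without* clobbering `c_pointer`, which must
# survive so the raw allocation can still be freed later
c_pointer_cast = ctypes.cast(
    c_pointer, np.ctypeslib.ndpointer(dtype=dtype, shape=shape))
array = np.ctypeslib.as_array(c_pointer_cast, shape=shape)

array[:] = 1.0        # the buffer is usable through NumPy...
libc.free(c_pointer)  # ...and the original pointer still frees it
```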
4 changes: 4 additions & 0 deletions devito/dle/backends/common.py
@@ -81,6 +81,10 @@ def __repr__(self):
def original_dim(self):
return self.iteration.dim

@property
def tunable(self):
return self.argument.symbolic_size


class AbstractRewriter(object):
"""
9 changes: 5 additions & 4 deletions devito/ir/support/space.py
@@ -6,7 +6,8 @@

from frozendict import frozendict

from devito.tools import PartialOrderTuple, as_tuple, filter_ordered, toposort
from devito.tools import PartialOrderTuple, as_tuple, filter_ordered, toposort, is_integer


__all__ = ['NullInterval', 'Interval', 'IntervalGroup', 'IterationSpace', 'DataSpace',
'Forward', 'Backward', 'Any']
@@ -112,8 +113,8 @@ class Interval(AbstractInterval):
is_Defined = True

def __init__(self, dim, lower, upper):
assert isinstance(lower, int)
assert isinstance(upper, int)
assert is_integer(lower)
assert is_integer(upper)
super(Interval, self).__init__(dim)
self.lower = lower
self.upper = upper
@@ -290,7 +291,7 @@ def zero(self, d=None):
relations=self.relations)

def __getitem__(self, key):
if isinstance(key, (slice, int)):
if isinstance(key, slice) or is_integer(key):
return super(IntervalGroup, self).__getitem__(key)
if not self.is_well_defined:
raise ValueError("Cannot fetch Interval from ill defined Space")
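`Interval` bounds and `IntervalGroup` indices are now validated with `is_integer` rather than `isinstance(..., int)`, so NumPy integer scalars (e.g. the result of `np.int32` arithmetic) are accepted as well. A sketch of what such a helper is assumed to check:

```python
import numpy as np

def is_integer(value):
    # assumed behaviour of `devito.tools.is_integer`: accept plain Python ints
    # as well as NumPy integer scalars
    return isinstance(value, (int, np.integer))

assert is_integer(4)
assert is_integer(np.int32(4))
assert not is_integer(4.0)
```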
