Features/340 gaussian nb #474

Merged
merged 232 commits into master from features/340-GaussianNB on Mar 3, 2020
Commits (232) · changes from all commits
3a70b0c
added workflow file
Dec 4, 2019
d849594
changed branches name
Dec 4, 2019
6cb1dc8
changed branches name
Dec 4, 2019
3f75147
build always on push
Dec 4, 2019
96a7bb4
changed to ubuntu and dependencies are now installed properly
Dec 4, 2019
85686f6
changed workflow to run on docker image
Dec 4, 2019
f56e8f3
using docker image properly
Dec 4, 2019
b364a26
changed to always use bash
Dec 4, 2019
68f2cf9
added debug code
Dec 4, 2019
9ec0f10
added earlier debug code
Dec 4, 2019
14f39e1
changed all to use /bin/bash
Dec 4, 2019
4c58b35
changed all to use sudo
Dec 4, 2019
cbe4375
running in a container now
Dec 4, 2019
1990539
still using /bin/bash
Dec 4, 2019
a27842f
removed sudo
Dec 4, 2019
ae82b27
added debug information for shell
Dec 5, 2019
3c43e35
added more shell debugging
Dec 5, 2019
81d44b0
fixed typo
Dec 5, 2019
8d2b2f4
added debug for standard shell
Dec 5, 2019
a0dae39
fixed more typos
Dec 5, 2019
b60b745
removed hostnamectl
Dec 5, 2019
54d5fc8
changed way os is fetched
Dec 5, 2019
29032a5
added module debug info
Dec 5, 2019
4dce1f9
moved debug code into bin/bash
Dec 5, 2019
b076661
removed loading of bashrc
Dec 5, 2019
2098a73
added installation to workflow
Dec 5, 2019
6ff3816
removed sudo
Dec 5, 2019
bb2d2ac
added more capabilities to docker
Dec 5, 2019
3370de7
changed options syntax
Dec 5, 2019
a573c3a
using different image
Dec 5, 2019
2b52001
using new image
Dec 5, 2019
ea3e932
removed typo
Dec 5, 2019
9e0ff24
changed debug information
Dec 5, 2019
014d2fd
added new no docker workflow
Dec 6, 2019
1cc73ba
fixed type
Dec 6, 2019
dc91953
added version info to checkout action
Dec 6, 2019
e66aaaa
added sudo
Dec 6, 2019
3c88f01
changed libopenmpi version
Dec 6, 2019
dd0e5bc
added python setup and testing stages
Dec 6, 2019
4c7e878
moved to python 3
Dec 6, 2019
b618eb1
added venv dependency
Dec 6, 2019
0c6e3e1
added y flag to work in automated shell
Dec 6, 2019
088b484
fixed typo and added correct location for test run
Dec 6, 2019
e068b65
added some debug code to find current location
Dec 6, 2019
961f881
removed folder switch
Dec 6, 2019
3e81fe1
fixed typo
Dec 6, 2019
32bf0ef
added pytest dependency
Dec 6, 2019
a2f43aa
changed run stage to activate virutalenv
Dec 6, 2019
62b097b
added more test stages
Dec 6, 2019
61c7242
added virtalenv activation for coverage creation
Dec 6, 2019
3f42f0f
added dev dependency
Dec 6, 2019
bbf313e
create new workflow using a ubuntu image
Dec 6, 2019
06f663c
fixed workflow syntax
Dec 6, 2019
f59ce5b
fixed type
Dec 6, 2019
8332eb8
removed version info
Dec 6, 2019
8eeae46
fixed spelling error
Dec 6, 2019
0f552a3
moved container usage
Dec 6, 2019
77f956b
added virtualenv steps
Dec 6, 2019
96d2d3e
checking home path
Dec 6, 2019
aecae36
adapted activate path
Dec 6, 2019
e526fe9
adding pip check
Dec 6, 2019
836b744
added shell checks
Dec 6, 2019
456027c
added code to install python packages and run tests
Dec 6, 2019
dec13af
fixed typo
Dec 6, 2019
7edde39
cleaned up code and added coverage combine steps
Dec 6, 2019
f645257
added bash in docker file
Dec 10, 2019
ba4403f
removed bash from docker due to not working properly
Dec 10, 2019
793eea9
removed bash from docker due to not working properly
Dec 10, 2019
02949a5
added fedora step to workflow
Dec 10, 2019
841a2dc
fixed type
Dec 10, 2019
f6db040
fixed another typo
Dec 10, 2019
2587222
changing ubuntu version
Dec 10, 2019
c7d7029
added volume
Dec 10, 2019
4d3c53e
sourcing module now
Dec 10, 2019
6595c3d
added options to docker
Dec 10, 2019
ea9e6fa
activating modules
Dec 10, 2019
597d0ec
moved module activation to docker file
Dec 10, 2019
e3a46af
removed volumes and added number of cpus
Dec 10, 2019
5914121
added oversubscribe to allow more than 2 parallel jobs
Dec 10, 2019
5da6f3c
fixed typo
Dec 10, 2019
e53dc5e
created build matrix
Dec 10, 2019
ca63124
fixed build matrix
Dec 10, 2019
2e1e7dc
added number of processes to build matrix
Dec 10, 2019
e1b9370
added codecv action
Dec 10, 2019
9212f0f
added pre commit execution
Dec 10, 2019
7a8d228
fixed action import
Dec 10, 2019
eab67e9
fixed missing dollar sign
Dec 10, 2019
f26e4ba
removed wrong dollar sign
Dec 10, 2019
17cec3d
added git installation
Dec 10, 2019
58cf0ca
Merge branch 'master' into features/424-github-actions
TheSlimvReal Dec 10, 2019
139dca8
Merge branch 'master' into features/424-github-actions
coquelin77 Dec 12, 2019
64f3147
Merge branch 'master' into features/424-github-actions
coquelin77 Dec 17, 2019
aba2b80
Merge branch 'master' into features/424-github-actions
coquelin77 Dec 18, 2019
696885e
codecov fails if an error occurs in this step
Dec 19, 2019
a27c24b
updated docker image
Dec 19, 2019
ad1f01a
added deploy workflow
Dec 19, 2019
b38d0a5
Merge branch 'features/424-github-actions' of https://github.com/helm…
Dec 19, 2019
4b8875d
updated docker images with wheel
Dec 19, 2019
725806f
added stage to build python package
Dec 19, 2019
8897d2b
added docker image to publish workflow
Dec 19, 2019
ee20c60
undone wheel dependency
Dec 19, 2019
0ebb316
changed workflow to run without docker image
Dec 19, 2019
1de5eb9
changed name
Dec 19, 2019
9a8fe32
fixed workflow file
Dec 19, 2019
c939a7c
added correct versions to actions
Dec 19, 2019
b3b76a6
fixed action paths
Dec 19, 2019
dec2fa1
fixed the token for pypi
Dec 19, 2019
fbdcd8f
Merge branch 'master' into features/424-github-actions
TheSlimvReal Dec 19, 2019
d9e6648
Merge remote-tracking branch 'origin/master' into features/424-github…
Dec 19, 2019
83462ad
removed travis file
Dec 19, 2019
183334f
Introducing test failure to check GitHub Workflows behaviour
ClaudiaComito Jan 9, 2020
72bfd89
Formatting
ClaudiaComito Jan 9, 2020
778ff73
More formatting and blake pickiness
ClaudiaComito Jan 9, 2020
5d5efb3
Next attempt at failing
ClaudiaComito Jan 9, 2020
d1db24e
Undoing all failed failure attempts and bringing code to original state
ClaudiaComito Jan 9, 2020
b4fab82
Adding new submodule
ClaudiaComito Jan 9, 2020
2f3fb66
Adding new class
ClaudiaComito Jan 9, 2020
3f90f02
GaussianNB first pass, replaced numpy with heat calls, added input sa…
ClaudiaComito Jan 16, 2020
ca33cd3
Removed unused call to sklearn _check_X
ClaudiaComito Jan 16, 2020
c7896b9
Replaced call to sklearn.utils.validation.check_X_y with basic shape …
ClaudiaComito Jan 16, 2020
4ca18d8
Merge branch 'master' into features/340-GaussianNB
ClaudiaComito Jan 16, 2020
cd55d73
Merge branch 'master' into features/340-GaussianNB
ClaudiaComito Jan 16, 2020
17551db
Added basic checks for sample_weight
ClaudiaComito Jan 16, 2020
57c96df
Added _check_partial_fit_first_call as a staticmethod for now
ClaudiaComito Jan 16, 2020
50f36a2
Removed obsolete comment
ClaudiaComito Jan 16, 2020
ae84b5b
Added _BaseNB class from scikit_learn
ClaudiaComito Jan 16, 2020
6cb88e8
Merge branch 'master' into features/340-GaussianNB
ClaudiaComito Jan 16, 2020
3dc93b6
Merge branch 'master' into features/340-GaussianNB
ClaudiaComito Jan 23, 2020
5459a19
Moved relevant _BaseNB methods to GaussianNB class, removed _BaseNB f…
ClaudiaComito Jan 23, 2020
0f66a66
Integrate heat.core.naive_bayes in set up.
ClaudiaComito Jan 23, 2020
e1e8542
Formatting
ClaudiaComito Jan 23, 2020
882ba56
Removing sklearn-specific validation calls for now.
ClaudiaComito Jan 23, 2020
04d41c2
Replacing np.unique/np.sort calls with ht.unique(sorted=True) calls
ClaudiaComito Jan 23, 2020
a770dec
Merge branch 'master' into features/340-GaussianNB
ClaudiaComito Jan 24, 2020
1a7e70d
Replacing np.in1d call with ht.eq, equivalent in this context.
ClaudiaComito Jan 25, 2020
4affd6d
_partial_fit(), temporary (hacky) replacement for np.searchsorted.
ClaudiaComito Jan 25, 2020
293a54d
Improved temporary searchsorted, debugging
ClaudiaComito Jan 26, 2020
b4cab25
Adapted joint_log_likelihood to absence of append() for torch/heat te…
ClaudiaComito Jan 26, 2020
1198f05
Adapted predict() to ht.argmax returning a heat tensor
ClaudiaComito Jan 26, 2020
14ffcfc
Implemented GaussianNB.logsumexp() (hacky early version)
ClaudiaComito Jan 26, 2020
8ea2914
Merge branch 'master' into features/340-GaussianNB
ClaudiaComito Jan 26, 2020
3456ae0
Merge branch 'master' into features/340-GaussianNB
ClaudiaComito Jan 29, 2020
c33a2a3
Modified _joint_log_likelihood to not rely on append(). No need to tr…
ClaudiaComito Feb 1, 2020
bf20bfe
logsumexp fixes
ClaudiaComito Feb 2, 2020
2bb84c5
Removed print/debugging statements
ClaudiaComito Feb 3, 2020
2638a78
formatting
ClaudiaComito Feb 3, 2020
297423a
Fixed mistake in shape of joint_log_likelihood tensor
ClaudiaComito Feb 3, 2020
80c22fb
Implementing test_gaussiannb(). First pass, test locally.
ClaudiaComito Feb 3, 2020
9d61505
Tidying up comments and #TODOs
ClaudiaComito Feb 3, 2020
7ac033a
Implemented testing of distributed GaussianNB (fails).
ClaudiaComito Feb 3, 2020
3c2f4e1
Merge branch 'master' into features/340-GaussianNB
ClaudiaComito Feb 3, 2020
a7a80db
Distributed __getitem__ now returns tensor of values at LIST of indic…
ClaudiaComito Feb 5, 2020
3911a12
Test for distributed case where data and labels are split along axis 0.
ClaudiaComito Feb 5, 2020
374d460
Enforce split=None for list of unique labels for now.
ClaudiaComito Feb 5, 2020
b21204f
Updated changelog
ClaudiaComito Feb 5, 2020
7706348
Adding back .travis.yml
ClaudiaComito Feb 6, 2020
731d99f
Removed outdated comment line
ClaudiaComito Feb 6, 2020
a818ec2
Refined conditional statements
ClaudiaComito Feb 6, 2020
bfd3244
formatting
ClaudiaComito Feb 6, 2020
daed150
Renamed internal functions according to HeAT convention (starting wit…
ClaudiaComito Feb 6, 2020
e3f9dbd
Testing gnb predictions vs. test labels
ClaudiaComito Feb 6, 2020
1d5a30d
Resolved flake8 conflicts
ClaudiaComito Feb 6, 2020
73e5bb9
More flake-iness
ClaudiaComito Feb 6, 2020
58bf7cd
Line breaks after """
ClaudiaComito Feb 7, 2020
ac4436a
Docs rewording
ClaudiaComito Feb 7, 2020
5bc63ff
In-place resplitting
ClaudiaComito Feb 7, 2020
1a0ee4d
Removed confusing reference to scikit-learn version 0.17
ClaudiaComito Feb 7, 2020
552119d
__check_partial_fit_first_call(clf, ...) --> __check_partial_fit_firs…
ClaudiaComito Feb 7, 2020
53e4c67
Reference to #351, added pointer to topic
ClaudiaComito Feb 7, 2020
2580f43
Formatting error messages
ClaudiaComito Feb 7, 2020
6da348d
Added dtype, device calls where missing
ClaudiaComito Feb 7, 2020
045cec6
Added references to Issue #468, sanitation TODOs
ClaudiaComito Feb 7, 2020
36ab57c
Added missing line breaks
ClaudiaComito Feb 7, 2020
058ceec
Added missing line breaks
ClaudiaComito Feb 7, 2020
ba138ef
Added docs for __joint_log_likelihood
ClaudiaComito Feb 7, 2020
e0a2587
Merge branch 'master' into features/340-GaussianNB
ClaudiaComito Feb 7, 2020
f84beef
Move naive_bayes one level up
ClaudiaComito Feb 8, 2020
f523587
Removed heat/core/naive_bayes
ClaudiaComito Feb 8, 2020
7d1412d
Updated import heat.core.naive_bayes --> heat.naive_bayes
ClaudiaComito Feb 8, 2020
21f192e
Fixing flake8 "E402 module level import not at top of file" complaint
ClaudiaComito Feb 8, 2020
00d6123
Removed -quiet option from pip install.
ClaudiaComito Feb 10, 2020
b356fea
Import version after sys.path.append("./heat/core") in spite of flake…
ClaudiaComito Feb 10, 2020
535e397
Building problems. Setting --progress-bar off for pip install
ClaudiaComito Feb 10, 2020
20fe2d5
Uploading standard sklearn train/test iris data.
ClaudiaComito Feb 10, 2020
4388207
Rewrote test_gaussiannb to compare to sklearn gnb() without importing…
ClaudiaComito Feb 10, 2020
5e56fc1
Bypassing test with 7 procs for now as test dataset is too small
ClaudiaComito Feb 10, 2020
0ba4a73
Setting pip install back to -quiet
ClaudiaComito Feb 10, 2020
2fccd8f
Increasing test coverage. Testing exceptions
ClaudiaComito Feb 10, 2020
266c091
Switching pip install out of quiet mode again
ClaudiaComito Feb 10, 2020
d423142
Added checks for sample_weight = 0 locally
ClaudiaComito Feb 10, 2020
8a80683
Added test case for sample_weight not None (split=None)
ClaudiaComito Feb 10, 2020
56713c8
__getitem__ call on sample_weight
ClaudiaComito Feb 11, 2020
78532f7
testing GaussianNB when sample_weight is not None, sample_weight not …
ClaudiaComito Feb 11, 2020
0a37705
nonzero now returns 1-D tensor if input is 1-D
ClaudiaComito Feb 17, 2020
f2c0404
Modified __getitem__ for case in which the only input (key) is a list…
ClaudiaComito Feb 19, 2020
310a7c2
ht.average(), improved weighted average when weights are 1-D and inpu…
ClaudiaComito Feb 19, 2020
e16e118
Adapted test_average() to changes in statistics.average
ClaudiaComito Feb 19, 2020
1b16fce
Small change to __getitem__ call
ClaudiaComito Feb 19, 2020
a956f25
Added test for GaussianNB with weights, both local and distributed
ClaudiaComito Feb 19, 2020
7399eb5
Updated test_average()
ClaudiaComito Feb 19, 2020
061fc27
Skipping test on 7 nodes for now. Cf. #490
ClaudiaComito Feb 19, 2020
f8e5d78
Extending test coverage: weighted average with 3d weights.
ClaudiaComito Feb 20, 2020
95de460
ht.average(), NotImplementedError if weights.split != x.split until #…
ClaudiaComito Feb 20, 2020
e14e8b0
test_average(), test NotImplementedError when weights.split != x.split
ClaudiaComito Feb 20, 2020
9fa985d
Extending test coverage: testing exceptions.
ClaudiaComito Feb 20, 2020
005ccf6
Increasing test coverage.
ClaudiaComito Feb 20, 2020
c6ae05c
Tests fail on 7 nodes, #490
ClaudiaComito Feb 20, 2020
92d4a48
Added tests for gnb.predict_proba()
ClaudiaComito Feb 28, 2020
2c3e4d9
Shape of log_prob_x must match __joint_log_likelihood(X) output
ClaudiaComito Feb 28, 2020
6e6c1f2
scikit-learn predict_proba output for testing/comparison
ClaudiaComito Feb 28, 2020
c6a85be
Resolving conflicts with master
ClaudiaComito Feb 28, 2020
bbb5610
Updated documentation and example of GaussianNB
ClaudiaComito Mar 1, 2020
165cfbb
Adapted documentation from scikit-learn to HeAT.
ClaudiaComito Mar 2, 2020
887f55c
ht.var() now returns same dtype as input tensor in distributed mode a…
ClaudiaComito Mar 2, 2020
50c113a
Extending test coverage to gnb attributes
ClaudiaComito Mar 2, 2020
1596496
turn gnb.priors into ht.DNDarray only if it isn't already
ClaudiaComito Mar 2, 2020
0f4544a
Test coverage gnb.partial_fit(), gnb.priors
ClaudiaComito Mar 2, 2020
2bac14e
Removed dtype mismatch embuggerance between priors.sum() and 1.0
ClaudiaComito Mar 2, 2020
d890e60
More tests
ClaudiaComito Mar 2, 2020
2a14bf5
Merge branch 'master' into features/340-GaussianNB
coquelin77 Mar 2, 2020
c15b0c8
Improving tests
ClaudiaComito Mar 3, 2020
b4cdd20
Merge branch 'features/340-GaussianNB' of https://github.com/helmholt…
ClaudiaComito Mar 3, 2020
ad8804d
7-node test
ClaudiaComito Mar 3, 2020
0ca9244
Removing 7-node testing again
ClaudiaComito Mar 3, 2020
b769ad2
Extending test coverage
ClaudiaComito Mar 3, 2020
74b2d5b
Improved conditional statement as per review
ClaudiaComito Mar 3, 2020
915ee4a
Added references section to documentation
ClaudiaComito Mar 3, 2020
2083434
Added References section to __update_mean_variance documentation
ClaudiaComito Mar 3, 2020
907e38f
Test coverage
ClaudiaComito Mar 3, 2020
4dcfa97
More test coverage
ClaudiaComito Mar 3, 2020
721157b
Removed dead code
ClaudiaComito Mar 3, 2020
a829607
added extra dtype calls
coquelin77 Mar 3, 2020
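Commits 14ffcfc and bf20bfe above introduce and then fix a GaussianNB.logsumexp() helper used when computing class probabilities. The PR page does not reproduce its final body, so the following is only a minimal sketch of the standard max-shift stabilization such a routine is typically built on, written here against plain PyTorch rather than HeAT's DNDarray:

import torch


def logsumexp(a: torch.Tensor, axis: int = 1) -> torch.Tensor:
    # Numerically stable log(sum(exp(a))): subtract the per-row maximum
    # before exponentiating so exp() cannot overflow, then add it back.
    a_max = torch.max(a, dim=axis, keepdim=True).values
    out = torch.log(torch.sum(torch.exp(a - a_max), dim=axis, keepdim=True))
    return (out + a_max).squeeze(axis)


# Joint log-likelihoods this small would underflow a naive exp/sum/log
# (exp(-1200) is 0.0 in double precision, so log() would return -inf).
jll = torch.tensor([[-1200.0, -1210.0], [-3.0, -4.0]])
print(logsumexp(jll, axis=1))  # tensor([-1200.0000, -2.6867])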
2 changes: 1 addition & 1 deletion .travis.yml
@@ -13,7 +13,7 @@ before_install:

install:
- pre-commit run --all-files
- docker exec -t unittest /bin/bash -c '. /root/.bashrc && pip install -q -e .[hdf5,netcdf] && pip list'
- docker exec -t unittest /bin/bash -c '. /root/.bashrc && pip install --progress-bar off -e .[hdf5,netcdf] && pip list'

script:
# Running multiple mpi process count, generate a unique coverage report for each one and merge into one report
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -6,6 +6,7 @@
# v0.3.0

- [#454](https://github.com/helmholtz-analytics/heat/issues/454) Update lasso example
- [#474](https://github.com/helmholtz-analytics/heat/pull/474) New feature: distributed Gaussian Naive Bayes classifier
- [#473](https://github.com/helmholtz-analytics/heat/issues/473) Matmul now will not split any of the input matrices if both have `split=None`. To toggle splitting of one input for increased speed use the allow_resplit flag.
- [#473](https://github.com/helmholtz-analytics/heat/issues/473) `dot` handles 2 split None vectors correctly now
- [#470](https://github.com/helmholtz-analytics/heat/pull/470) Enhancement: Accelerate distance calculations in kmeans clustering by introduction of new module spatial.distance
1 change: 1 addition & 0 deletions heat/__init__.py
@@ -1,5 +1,6 @@
from . import core
from . import cluster
from . import naive_bayes
from . import regression
from . import spatial
from .core import *
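With naive_bayes now imported at package level, the new classifier is reachable as ht.naive_bayes.GaussianNB. A minimal usage sketch, assuming the scikit-learn-style fit/predict API that the commits and tests in this PR describe (the toy data below is illustrative, not from the PR):

import heat as ht

# Six samples, two features, two classes; split=0 distributes rows
# across MPI processes.
X = ht.array(
    [[1.0, 2.0], [1.1, 1.9], [0.9, 2.1], [8.0, 9.0], [8.1, 8.9], [7.9, 9.1]],
    split=0,
)
y = ht.array([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], split=0)

gnb = ht.naive_bayes.GaussianNB()
gnb.fit(X, y)
print(gnb.predict(X))        # predicted class labels, a DNDarray
print(gnb.predict_proba(X))  # per-class probabilities, rows summing to 1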
66 changes: 42 additions & 24 deletions heat/core/dndarray.py
@@ -1290,11 +1290,13 @@ def __getitem__(self, key):
(1/2) >>> tensor([0.])
(2/2) >>> tensor([0., 0.])
"""

l_dtype = self.dtype.torch_type()
if isinstance(key, DNDarray) and key.gshape[-1] != len(self.gshape):
key = tuple(x.item() for x in key)

if not self.is_distributed():

if not self.comm.size == 1:
if isinstance(key, DNDarray) and key.gshape[-1] == len(self.gshape):
# this will return a 1D array as the shape cannot be determined automatically
@@ -1329,6 +1331,7 @@
)

else:

_, _, chunk_slice = self.comm.chunk(self.shape, self.split)
chunk_start = chunk_slice[self.split].start
chunk_end = chunk_slice[self.split].stop
@@ -1373,7 +1376,6 @@
# handle the dimensional reduction for integers
ints = sum([isinstance(it, int) for it in key])
gout = gout[: len(gout) - ints]

if self.split >= len(gout):
new_split = len(gout) - 1 if len(gout) - 1 > 0 else 0
else:
@@ -1400,30 +1402,46 @@
key[self.split] = slice(min(hold), max(hold) + 1, key[self.split].step)
arr = self.__array[tuple(key)]
gout = list(arr.shape)

# if the given axes are not splits (must be ints for python)
# this means the whole slice is on one node
elif key[self.split] in range(chunk_start, chunk_end):
key = list(key)
key[self.split] = key[self.split] - chunk_start
arr = self.__array[tuple(key)]
gout = list(arr.shape)
elif key[self.split] < 0 and self.gshape[self.split] + key[self.split] in range(
chunk_start, chunk_end
):
key = list(key)
key[self.split] = key[self.split] + chunk_end - chunk_start
arr = self.__array[tuple(key)]
gout = list(arr.shape)
else:
warnings.warn(
"This process (rank: {}) is without data after slicing, running the .balance_() function is recommended".format(
self.comm.rank
),
ResourceWarning,
)
# arr is empty
# gout is all 0s and is the proper shape
# if the given axes are not splits (must be ints OR LISTS for python)
# this means the whole slice is on one node
if isinstance(key, list):
indices = key
else:
indices = key[self.split]
key = list(key)
if isinstance(indices, list):
indices = [
index + self.gshape[self.split] if index < 0 else index
for index in indices
]
sorted_key_along_split = sorted(indices)
if sorted_key_along_split[0] in range(
chunk_start, chunk_end
) and sorted_key_along_split[-1] in range(chunk_start, chunk_end):
indices = [index - chunk_start for index in indices]
arr = self.__array[indices]
gout = list(arr.shape)

elif isinstance(key[self.split], int):
key[self.split] = (
key[self.split] + self.gshape[self.split]
if key[self.split] < 0
else key[self.split]
)
if key[self.split] in range(chunk_start, chunk_end):
key[self.split] = key[self.split] - chunk_start
arr = self.__array[tuple(key)]
gout = list(arr.shape)
if 0 in arr.shape:
# arr is empty
# gout is all 0s and is the proper shape
warnings.warn(
"This process (rank: {}) is without data after slicing, running the .balance_() function is recommended".format(
self.comm.rank
),
ResourceWarning,
)

# if the given axes are only a slice
elif isinstance(key, slice) and self.split == 0:
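The __getitem__ changes above let a split DNDarray be indexed with a plain list of global indices, wrapping negative entries by the global shape first. A short sketch of the intended behavior (per the warning in the hunk, ranks left without data after such an access are expected to call .balance_()):

import heat as ht

x = ht.arange(10, split=0)  # rows distributed across processes
print(x[[1, 3, 4]])         # values at the given global indices
print(x[[-3, -2]])          # negative indices wrap: same as x[[7, 8]]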
12 changes: 7 additions & 5 deletions heat/core/indexing.py
@@ -58,20 +58,22 @@ def nonzero(a):
[0/1] tensor([[4, 5, 6]])
[1/1] tensor([[7, 8, 9]])
"""

if a.split is None:
# if there is no split then just return the values from torch
return factories.array(
torch.nonzero(a._DNDarray__array), is_split=a.split, device=a.device, comm=a.comm
)
lcl_nonzero = torch.nonzero(a._DNDarray__array)
is_split = None
else:
# a is split
lcl_nonzero = torch.nonzero(a._DNDarray__array)
_, _, slices = a.comm.chunk(a.shape, a.split)
lcl_nonzero[..., a.split] += slices[a.split].start
gout = list(lcl_nonzero.size())
gout[0] = a.comm.allreduce(gout[0], MPI.SUM)
return factories.array(lcl_nonzero, is_split=0, device=a.device, comm=a.comm)
is_split = 0

if a.numdims == 1:
lcl_nonzero = lcl_nonzero.squeeze()
return factories.array(lcl_nonzero, is_split=is_split, device=a.device, comm=a.comm)


def where(cond, x=None, y=None):
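The nonzero() change above squeezes the result for one-dimensional input, so indexing with it behaves like the NumPy equivalent. A short before/after sketch:

import heat as ht

a = ht.array([0, 1, 0, 2, 3])
print(ht.nonzero(a))  # now 1-D: [1, 3, 4] (previously an n-by-1 column)

b = ht.array([[0, 1], [2, 0]])
print(ht.nonzero(b))  # unchanged for 2-D: one coordinate row per nonzero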
55 changes: 31 additions & 24 deletions heat/core/statistics.py
@@ -319,34 +319,36 @@ def average(x, axis=None, weights=None, returned=False):
if x.gshape != weights.gshape:
if axis is None:
raise TypeError("Axis must be specified when shapes of x and weights differ.")
if isinstance(axis, tuple):
elif isinstance(axis, tuple):
raise NotImplementedError("Weighted average over tuple axis not implemented yet.")
if weights.numdims != 1:
raise TypeError("1D weights expected when shapes of x and weights differ.")
if weights.gshape[0] != x.gshape[axis]:
raise ValueError("Length of weights not compatible with specified axis.")

wgt = factories.empty_like(weights, device=x.device)
wgt._DNDarray__array = weights._DNDarray__array
wgt._DNDarray__split = weights.split

# Broadcast weights along specified axis if necessary
if wgt.numdims == 1 and x.numdims != 1:
if wgt.split is not None:
wgt.resplit_(None)
weights_newshape = tuple(1 if i != axis else x.gshape[axis] for i in range(x.numdims))
wgt._DNDarray__array = torch.reshape(wgt._DNDarray__array, weights_newshape)
wgt._DNDarray__gshape = weights_newshape
wgt_lshape = tuple(
weights.lshape[0] if dim == axis else 1 for dim in list(range(x.numdims))
)
wgt_slice = [slice(None) if dim == axis else 0 for dim in list(range(x.numdims))]
wgt_split = None if weights.split is None else axis
wgt = factories.empty(wgt_lshape, dtype=weights.dtype, device=x.device)
wgt._DNDarray__array[wgt_slice] = weights._DNDarray__array
wgt = factories.array(wgt._DNDarray__array, is_split=wgt_split)
else:
if x.comm.is_distributed():
if x.split is not None and weights.split != x.split and weights.numdims != 1:
# fix after Issue #425 is solved
raise NotImplementedError(
"weights.split does not match data.split: not implemented yet."
)
wgt = factories.empty_like(weights, device=x.device)
wgt._DNDarray__array = weights._DNDarray__array

cumwgt = wgt.sum(axis=axis)

if logical.any(cumwgt == 0.0):
raise ZeroDivisionError("Weights sum to zero, can't be normalized")

# Distribution: if x is split, split to weights along same dimension if possible
if x.split is not None and wgt.split != x.split:
if wgt.gshape[x.split] != 1:
wgt.resplit_(x.split)

result = (x * wgt).sum(axis=axis) / cumwgt

if returned:
@@ -1222,12 +1224,12 @@ def reduce_vars_elementwise(output_shape_i):
mu = torch.mean(x._DNDarray__array, dim=axis)
var = torch.var(x._DNDarray__array, dim=axis, unbiased=bessel)
else:
mu = factories.zeros(output_shape_i, device=x.device)
var = factories.zeros(output_shape_i, device=x.device)
mu = factories.zeros(output_shape_i, dtype=x.dtype, device=x.device)
var = factories.zeros(output_shape_i, dtype=x.dtype, device=x.device)

var_shape = list(var.shape) if list(var.shape) else [1]

var_tot = factories.zeros(([x.comm.size, 2] + var_shape), device=x.device)
var_tot = factories.zeros(([x.comm.size, 2] + var_shape), dtype=x.dtype, device=x.device)
n_tot = factories.zeros(x.comm.size, device=x.device)
Review thread on the n_tot line above:

Member: possible dtype problem here.

Contributor Author: is var() supposed to return a float32? Even if x is float64?

Member: it should not. i guess i missed a couple of dtype calls here. can you add them for me? it should be just in this spot.

Contributor Author: That's what I thought, so is this the possible dtype problem you were talking about? I guess I just misunderstood your first comment.

Member: i just also realized that the dtype of n_tot doesn't matter, because it is only used internally.
var_tot[x.comm.rank, 0, :] = var
var_tot[x.comm.rank, 1, :] = mu
@@ -1259,8 +1261,8 @@ def reduce_vars_elementwise(output_shape_i):
mu_in = 0.0

n = x.lnumel
var_tot = factories.zeros((x.comm.size, 3), device=x.device)
var_proc = factories.zeros((x.comm.size, 3), device=x.device)
var_tot = factories.zeros((x.comm.size, 3), dtype=x.dtype, device=x.device)
var_proc = factories.zeros((x.comm.size, 3), dtype=x.dtype, device=x.device)
var_proc[x.comm.rank] = var_in, mu_in, float(n)
x.comm.Allreduce(var_proc, var_tot, MPI.SUM)

@@ -1322,15 +1324,20 @@ def reduce_vars_elementwise(output_shape_i):

if x.split is None: # x is *not* distributed -> no need to distributed
return factories.array(
torch.var(x._DNDarray__array, dim=axis, unbiased=bessel), device=x.device
torch.var(x._DNDarray__array, dim=axis, unbiased=bessel),
dtype=x.dtype,
device=x.device,
)
elif axis == x.split: # x is distributed and axis chosen is == to split
return reduce_vars_elementwise(output_shape)
else:
# singular axis given (axis) not equal to split direction (x.split)
lcl = torch.var(x._DNDarray__array, dim=axis, keepdim=False)
return factories.array(
lcl, is_split=x.split if axis > x.split else x.split - 1, device=x.device
lcl,
is_split=x.split if axis > x.split else x.split - 1,
dtype=x.dtype,
device=x.device,
)
else:
raise TypeError("axis (axis) must be an int, tuple, list, etc.; currently it is {}. ")
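The statistics.py changes above rework how 1-D weights are broadcast inside ht.average() and thread the input dtype through var(). A minimal sketch of the resulting behavior (shapes and values are illustrative only):

import heat as ht

x = ht.random.randn(4, 3, split=0)
w = ht.arange(3, dtype=ht.float32)

# 1-D weights are broadcast along the chosen axis and normalized by
# their sum: result = (x * w).sum(axis=1) / w.sum().
avg = ht.average(x, axis=1, weights=w)
print(avg.shape)  # (4,)

# var() now keeps the input dtype in distributed mode as well.
x64 = ht.zeros((4, 3), dtype=ht.float64, split=0)
print(ht.var(x64).dtype)  # ht.float64 (no silent fall-back to float32)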
44 changes: 35 additions & 9 deletions heat/core/tests/test_statistics.py
@@ -315,7 +315,7 @@ def test_average(self):
)
size = random_volume.comm.size
random_weights = ht.array(
torch.randn((3 * size,), dtype=torch.float64, device=device), device=ht_device
torch.randn((3 * size,), dtype=torch.float64, device=device), split=0, device=ht_device
)
avg_volume = ht.average(random_volume, weights=random_weights, axis=1)
np_avg_volume = np.average(random_volume.numpy(), weights=random_weights.numpy(), axis=1)
@@ -334,6 +334,28 @@ def test_average(self):
self.assertEqual(avg_volume_with_cumwgt[1].gshape, avg_volume_with_cumwgt[0].gshape)
self.assertEqual(avg_volume_with_cumwgt[1].split, avg_volume_with_cumwgt[0].split)

# check weighted average over all float elements of split 3d tensor (3d weights)

random_weights_3d = ht.array(
torch.randn((3, 3, 3), dtype=torch.float64, device=device), is_split=1, device=ht_device
)
avg_volume = ht.average(random_volume, weights=random_weights_3d, axis=1)
np_avg_volume = np.average(random_volume.numpy(), weights=random_weights.numpy(), axis=1)
self.assertIsInstance(avg_volume, ht.DNDarray)
self.assertEqual(avg_volume.shape, (3, 3))
self.assertEqual(avg_volume.lshape, (3, 3))
self.assertEqual(avg_volume.dtype, ht.float64)
self.assertEqual(avg_volume._DNDarray__array.dtype, torch.float64)
self.assertEqual(avg_volume.split, None)
self.assertAlmostEqual(avg_volume.numpy().all(), np_avg_volume.all())
avg_volume_with_cumwgt = ht.average(
random_volume, weights=random_weights, axis=1, returned=True
)
self.assertIsInstance(avg_volume_with_cumwgt, tuple)
self.assertIsInstance(avg_volume_with_cumwgt[1], ht.DNDarray)
self.assertEqual(avg_volume_with_cumwgt[1].gshape, avg_volume_with_cumwgt[0].gshape)
self.assertEqual(avg_volume_with_cumwgt[1].split, avg_volume_with_cumwgt[0].split)

# check average over all float elements of split 3d tensor, tuple axis
random_volume = ht.random.randn(3, 3, 3, split=0, device=ht_device)
avg_volume = ht.average(random_volume, axis=(1, 2))
@@ -347,16 +369,16 @@ def test_average(self):

# check weighted average over all float elements of split 5d tensor, along split axis
random_5d = ht.random.randn(random_volume.comm.size, 2, 3, 4, 5, split=0, device=ht_device)
axis = 1
random_weights = ht.random.randn(random_5d.gshape[axis], device=ht_device)
axis = random_5d.split
random_weights = ht.random.randn(random_5d.gshape[axis], split=0, device=ht_device)
avg_5d = random_5d.average(weights=random_weights, axis=axis)

self.assertIsInstance(avg_5d, ht.DNDarray)
self.assertEqual(avg_5d.gshape, (size, 3, 4, 5))
self.assertEqual(avg_5d.gshape, (2, 3, 4, 5))
self.assertLessEqual(avg_5d.lshape[1], 3)
self.assertEqual(avg_5d.dtype, ht.float64)
self.assertEqual(avg_5d._DNDarray__array.dtype, torch.float64)
self.assertEqual(avg_5d.split, 0)
self.assertEqual(avg_5d.split, None)

# check exceptions
with self.assertRaises(TypeError):
@@ -372,12 +394,16 @@ def test_average(self):
)
with self.assertRaises(TypeError):
ht.average(random_5d, weights=random_weights, axis=axis)
random_weights = ht.random.randn(random_5d.gshape[axis] + 1, device=ht_device)
random_shape_weights = ht.random.randn(random_5d.gshape[axis] + 1, device=ht_device)
with self.assertRaises(ValueError):
ht.average(random_5d, weights=random_weights, axis=axis)
random_weights = ht.zeros((random_5d.gshape[axis]), device=ht_device)
ht.average(random_5d, weights=random_shape_weights, axis=axis)
zero_weights = ht.zeros((random_5d.gshape[axis]), split=0, device=ht_device)
with self.assertRaises(ZeroDivisionError):
ht.average(random_5d, weights=random_weights, axis=axis)
ht.average(random_5d, weights=zero_weights, axis=axis)
weights_5d_split_mismatch = ht.ones(random_5d.gshape, split=-1, device=ht_device)
with self.assertRaises(NotImplementedError):
ht.average(random_5d, weights=weights_5d_split_mismatch, axis=axis)

with self.assertRaises(TypeError):
ht_array.average(axis=1.1)
with self.assertRaises(TypeError):