IntelPython · oleksandr-pavlyk · Mar 3, 2023 · Mar 3, 2023 · Mar 3, 2023 · Mar 3, 2023
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
@@ -32,6 +32,7 @@ pybind11_add_module(${python_module_name} MODULE
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp

diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
@@ -58,7 +58,7 @@
 )
 from dpctl.tensor._device import Device
 from dpctl.tensor._dlpack import from_dlpack
-from dpctl.tensor._indexing_functions import put, take
+from dpctl.tensor._indexing_functions import extract, nonzero, place, put, take
 from dpctl.tensor._manipulation_functions import (
     broadcast_arrays,
     broadcast_to,
@@ -115,6 +115,9 @@
     "squeeze",
     "take",
     "put",
+    "extract",
+    "place",
+    "nonzero",
     "from_numpy",
     "to_numpy",
     "asnumpy",

diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
@@ -389,45 +389,75 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True):
     return R
 
 
-def _mock_extract(ary, ary_mask, p):
-    exec_q = dpctl.utils.get_execution_queue(
-        (
-            ary.sycl_queue,
-            ary_mask.sycl_queue,
+def _extract_impl(ary, ary_mask, axis=0):
+    """Extract elements of ary selected by ary_mask, where the mask spans
+    dimensions of ary starting from dimension axis"""
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
+    if not isinstance(ary_mask, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}"
         )
+    exec_q = dpctl.utils.get_execution_queue(
+        (ary.sycl_queue, ary_mask.sycl_queue)
     )
     if exec_q is None:
         raise dpctl.utils.ExecutionPlacementError(
-            "Can not automatically determine where to allocate the "
-            "result or performance execution. "
-            "Use `usm_ndarray.to_device` method to migrate data to "
-            "be associated with the same queue."
+            "arrays have different associated queues. "
+            "Use `Y.to_device(X.device)` to migrate."
         )
-
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
-        (
-            ary.usm_type,
-            ary_mask.usm_type,
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter axis is inconsistent with input array dimensions"
         )
+    mask_nelems = ary_mask.size
+    cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device)
+    exec_q = cumsum.sycl_queue
+    mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q)
+    dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    dst = dpt.empty(
+        dst_shape, dtype=ary.dtype, usm_type=ary.usm_type, device=ary.device
     )
-    ary_np = dpt.asnumpy(ary)
-    mask_np = dpt.asnumpy(ary_mask)
-    res_np = ary_np[(slice(None),) * p + (mask_np,)]
-    res = dpt.empty(
-        res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    hev, _ = ti._extract(
+        src=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        dst=dst,
+        sycl_queue=exec_q,
     )
-    res[...] = res_np
-    return res
+    hev.wait()
+    return dst
 
 
-def _mock_nonzero(ary):
+def _nonzero_impl(ary):
     if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError
-    q = ary.sycl_queue
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
+    exec_q = ary.sycl_queue
     usm_type = ary.usm_type
-    ary_np = dpt.asnumpy(ary)
-    nz = ary_np.nonzero()
-    return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz)
+    mask_nelems = ary.size
+    cumsum = dpt.empty(
+        mask_nelems, dtype=dpt.int64, sycl_queue=exec_q, order="C"
+    )
+    mask_count = ti.mask_positions(ary, cumsum, sycl_queue=exec_q)
+    indexes = dpt.empty(
+        (ary.ndim, mask_count),
+        dtype=cumsum.dtype,
+        usm_type=usm_type,
+        sycl_queue=exec_q,
+        order="C",
+    )
+    hev, _ = ti._nonzero(cumsum, indexes, ary.shape, exec_q)
+    res = tuple(indexes[i, :] for i in range(ary.ndim))
+    hev.wait()
+    return res
 
 
 def _take_multi_index(ary, inds, p):
@@ -473,34 +503,57 @@ def _take_multi_index(ary, inds, p):
     return res
 
 
-def _mock_place(ary, ary_mask, p, vals):
+def _place_impl(ary, ary_mask, vals, axis=0):
+    """Place elements of vals into ary at positions selected by ary_mask,
+    where the mask spans dimensions of ary starting from dimension axis"""
     if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
     if not isinstance(ary_mask, dpt.usm_ndarray):
-        raise TypeError
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}"
+        )
+    if not isinstance(vals, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(vals)}"
+        )
     exec_q = dpctl.utils.get_execution_queue(
-        (ary.sycl_queue, ary_mask.sycl_queue)
+        (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue)
     )
-    if exec_q is not None and isinstance(vals, dpt.usm_ndarray):
-        exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
     if exec_q is None:
         raise dpctl.utils.ExecutionPlacementError(
-            "Can not automatically determine where to allocate the "
-            "result or performance execution. "
-            "Use `usm_ndarray.to_device` method to migrate data to "
-            "be associated with the same queue."
+            "arrays have different associated queues. "
+            "Use `Y.to_device(X.device)` to migrate."
         )
-
-    ary_np = dpt.asnumpy(ary)
-    mask_np = dpt.asnumpy(ary_mask)
-    if isinstance(vals, dpt.usm_ndarray) or hasattr(
-        vals, "__sycl_usm_array_interface__"
-    ):
-        vals_np = dpt.asnumpy(vals)
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter axis is inconsistent with input array dimensions"
+        )
+    mask_nelems = ary_mask.size
+    cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device)
+    exec_q = cumsum.sycl_queue
+    mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q)
+    expected_vals_shape = (
+        ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    )
+    if vals.dtype == ary.dtype:
+        rhs = vals
     else:
-        vals_np = vals
-    ary_np[(slice(None),) * p + (mask_np,)] = vals_np
-    ary[...] = ary_np
+        rhs = dpt.astype(vals, ary.dtype)
+    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
+    hev, _ = ti._place(
+        dst=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        rhs=rhs,
+        sycl_queue=exec_q,
+    )
+    hev.wait()
     return
 
 

diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py
@@ -21,7 +21,9 @@
 
 import dpctl
 import dpctl.tensor as dpt
-from dpctl.tensor._tensor_impl import _put, _take
+import dpctl.tensor._tensor_impl as ti
+
+from ._copy_utils import _extract_impl, _nonzero_impl
 
 
 def take(x, indices, /, *, axis=None, mode="clip"):
@@ -93,7 +95,7 @@ def take(x, indices, /, *, axis=None, mode="clip"):
         res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
 
-    hev, _ = _take(x, indices, res, axis, mode, sycl_queue=exec_q)
+    hev, _ = ti._take(x, indices, res, axis, mode, sycl_queue=exec_q)
     hev.wait()
 
     return res
@@ -173,5 +175,136 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"):
 
     vals = dpt.broadcast_to(vals, val_shape)
 
-    hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q)
+    hev, _ = ti._put(x, indices, vals, axis, mode, sycl_queue=exec_q)
+    hev.wait()
+
+
+def extract(condition, arr):
+    """extract(condition, arr)
+
+    Returns the elements of an array that satisfy the condition.
+
+    If `condition` is boolean :func:`dpctl.tensor.extract` is
+    equivalent to ``arr[condition]``.
+
+    Note that :func:`dpctl.tensor.place` does the opposite of
+    :func:`dpctl.tensor.extract`.
+
+    Args:
+       condition: usm_ndarray
+          An array whose non-zero or True entries indicate the elements
+          of `arr` to extract.
+       arr: usm_ndarray
+          Input array of the same shape as `condition`.
+
+    Returns:
+       extract: usm_ndarray
+          Rank 1 array of values from `arr` where `condition` is True.
+    """
+    if not isinstance(condition, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}"
+        )
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    exec_q = dpctl.utils.get_execution_queue(
+        (
+            condition.sycl_queue,
+            arr.sycl_queue,
+        )
+    )
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError
+    if condition.shape != arr.shape:
+        raise ValueError("Arrays are not of the same size")
+    return _extract_impl(arr, condition)
+
+
+def place(arr, mask, vals):
+    """place(arr, mask, vals)
+
+    Change elements of an array based on conditional and input values.
+
+    If `mask` is boolean :func:`dpctl.tensor.place` is
+    equivalent to ``arr[mask] = vals``.
+
+    Args:
+       arr: usm_ndarray
+          Array to put data into.
+       mask: usm_ndarray
+          Boolean mask array. Must have the same shape as `arr`.
+       vals: usm_ndarray
+          Values to put into `arr`. Only the first N elements are
+          used, where N is the number of True values in `mask`. If
+          `vals` is smaller than N, it will be repeated, and if
+          elements of `arr` are to be masked, this sequence must be
+          non-empty. Array `vals` must be one dimensional.
+    """
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    if not isinstance(mask, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}"
+        )
+    if not isinstance(vals, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}"
+        )
+    exec_q = dpctl.utils.get_execution_queue(
+        (
+            arr.sycl_queue,
+            mask.sycl_queue,
+            vals.sycl_queue,
+        )
+    )
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError
+    if arr.shape != mask.shape or vals.ndim != 1:
+        raise ValueError("Array sizes are not as required")
+    cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q)
+    nz_count = ti.mask_positions(mask, cumsum, sycl_queue=exec_q)
+    if nz_count == 0:
+        return
+    if vals.dtype == arr.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, arr.dtype)
+    hev, _ = ti._place(
+        dst=arr,
+        cumsum=cumsum,
+        axis_start=0,
+        axis_end=mask.ndim,
+        rhs=rhs,
+        sycl_queue=exec_q,
+    )
     hev.wait()
+
+
+def nonzero(arr):
+    """nonzero(arr)
+
+    Return the indices of non-zero elements.
+
+    Returns the tuple of usm_ndarrays, one for each dimension
+    of `arr`, containing the indices of the non-zero elements
+    in that dimension. The values of `arr` are always tested in
+    row-major, C-style order.
+
+    Args:
+       arr: usm_ndarray
+          Input array, which has non-zero array rank.
+    Returns:
+       tuple_of_usm_ndarrays: tuple
+          Indices of non-zero array elements.
+    """
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    if arr.ndim == 0:
+        raise ValueError("Array of positive rank is expected")
+    return _nonzero_impl(arr)