IntelPython · antonwolfy · Mar 16, 2023 · Mar 20, 2023 · Mar 20, 2023 · Mar 29, 2023
@@ -2,4 +2,5 @@
 - [ ] Have you added a test, reproducer or referred to issue with a reproducer?
 - [ ] Have you tested your changes locally for CPU and GPU devices?
 - [ ] Have you made sure that new changes do not introduce compiler warnings?
+- [ ] Have you checked performance impact of proposed changes?
 - [ ] If this PR is a work in progress, are you filing the PR as a draft?
@@ -138,9 +138,9 @@ MACRO_2ARG_3TYPES_OP(dpnp_divide_c,
                      MACRO_UNPACK_TYPES(float, double, std::complex<float>, std::complex<double>))
 
 MACRO_2ARG_3TYPES_OP(dpnp_fmod_c,
-                     sycl::fmod((double)input1_elem, (double)input2_elem),
-                     nullptr,
-                     std::false_type,
+                     dispatch_fmod_op(input1_elem, input2_elem),
+                     x1 % x2,
+                     MACRO_UNPACK_TYPES(bool, std::int32_t, std::int64_t),
                      oneapi::mkl::vm::fmod,
                      MACRO_UNPACK_TYPES(float, double))
 

@@ -281,8 +281,8 @@ static void func_map_init_bitwise_1arg_1type(func_map_t& fmap)
                 const size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                  \
                 {                                                                                                      \
                     const shape_elem_type* result_strides_data = &dev_strides_data[0];                                 \
-                    const shape_elem_type* input1_strides_data = &dev_strides_data[1];                                 \
-                    const shape_elem_type* input2_strides_data = &dev_strides_data[2];                                 \
+                    const shape_elem_type* input1_strides_data = &dev_strides_data[result_ndim];                       \
+                    const shape_elem_type* input2_strides_data = &dev_strides_data[2 * result_ndim];                   \
                                                                                                                        \
                     size_t input1_id = 0;                                                                              \
                     size_t input2_id = 0;                                                                              \

@@ -111,7 +111,7 @@
                 size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                        \
                 {                                                                                                      \
                     const shape_elem_type* result_strides_data = &dev_strides_data[0];                                 \
-                    const shape_elem_type* input1_strides_data = &dev_strides_data[1];                                 \
+                    const shape_elem_type* input1_strides_data = &dev_strides_data[result_ndim];                       \
                                                                                                                        \
                     size_t input_id = 0;                                                                               \
                     for (size_t i = 0; i < input1_ndim; ++i)                                                           \
@@ -635,7 +635,7 @@ static void func_map_init_elemwise_1arg_2type(func_map_t& fmap)
                 size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                        \
                 {                                                                                                      \
                     const shape_elem_type* result_strides_data = &dev_strides_data[0];                                 \
-                    const shape_elem_type* input1_strides_data = &dev_strides_data[1];                                 \
+                    const shape_elem_type* input1_strides_data = &dev_strides_data[result_ndim];                       \
                                                                                                                        \
                     size_t input_id = 0;                                                                               \
                     for (size_t i = 0; i < input1_ndim; ++i)                                                           \
@@ -848,6 +848,18 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
     return;
 }
 
+/**
+ * @brief Compute the remainder of dividing @p elem1 by @p elem2.
+ *
+ * The implementation is selected at compile time: when T is std::int32_t or
+ * std::int64_t the built-in operator% is applied, for any other T the call is
+ * forwarded to sycl::fmod.
+ *
+ * NOTE(review): the integer branch performs no check for elem2 == 0 here;
+ * presumably callers guarantee a non-zero divisor - confirm.
+ *
+ * @tparam T     Type shared by both operands; it selects the branch.
+ * @param  elem1 Left-hand operand (dividend).
+ * @param  elem2 Right-hand operand (divisor).
+ * @return elem1 % elem2 for the listed integer types,
+ *         sycl::fmod(elem1, elem2) otherwise.
+ */
+template <typename T>
+constexpr auto dispatch_fmod_op(T elem1, T elem2)
+{
+    if constexpr (is_any_v<T, std::int32_t, std::int64_t>)
+    {
+        return elem1 % elem2;
+    }
+    else
+    {
+        return sycl::fmod(elem1, elem2);
+    }
+}
 
 #define MACRO_2ARG_3TYPES_OP(                                                                                          \
     __name__, __operation__, __vec_operation__, __vec_types__, __mkl_operation__, __mkl_types__)                       \
@@ -995,8 +1007,8 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
                 const size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                  \
                 {                                                                                                      \
                     const shape_elem_type* result_strides_data = &dev_strides_data[0];                                 \
-                    const shape_elem_type* input1_strides_data = &dev_strides_data[1];                                 \
-                    const shape_elem_type* input2_strides_data = &dev_strides_data[2];                                 \
+                    const shape_elem_type* input1_strides_data = &dev_strides_data[result_ndim];                       \
+                    const shape_elem_type* input2_strides_data = &dev_strides_data[2 * result_ndim];                   \
                                                                                                                        \
                     size_t input1_id = 0;                                                                              \
                     size_t input2_id = 0;                                                                              \
@@ -1261,6 +1273,16 @@ static constexpr DPNPFuncType get_divide_res_type()
     return widest_type;
 }
 
+/**
+ * @brief Select the result type used for fmod with operand types FT1 and FT2.
+ *
+ * When both operand types are DPNP_FT_BOOL the result type is DPNP_FT_INT;
+ * for every other combination the choice is delegated to
+ * populate_func_types<FT1, FT2>().
+ *
+ * @tparam FT1 Function type of the first operand.
+ * @tparam FT2 Function type of the second operand.
+ * @return Function type of the result.
+ */
+template <DPNPFuncType FT1, DPNPFuncType FT2>
+static constexpr DPNPFuncType get_fmod_res_type()
+{
+    if constexpr ((FT1 == DPNPFuncType::DPNP_FT_BOOL) && (FT2 == DPNPFuncType::DPNP_FT_BOOL))
+    {
+        return DPNPFuncType::DPNP_FT_INT;
+    }
+    return populate_func_types<FT1, FT2>();
+}
+
 template <DPNPFuncType FT1, DPNPFuncType... FTs>
 static void func_map_elemwise_2arg_3type_core(func_map_t& fmap)
 {
@@ -1300,12 +1322,29 @@ static void func_map_elemwise_2arg_3type_core(func_map_t& fmap)
      ...);
 }
 
+/**
+ * @brief Register the DPNP_FN_FMOD_EXT kernels for a fixed first operand type.
+ *
+ * For every type in the FTs pack an entry fmap[DPNP_FN_FMOD_EXT][FT1][FTs] is
+ * written. Each entry holds the result type given by
+ * get_fmod_res_type<FT1, FTs>() and a pointer to the dpnp_fmod_c_ext
+ * instantiation for <result type, FT1 type, FTs type>, where the C++ types
+ * are looked up through func_type_map_t::find_type.
+ *
+ * @tparam FT1 Function type of the first operand.
+ * @tparam FTs Function types of the second operand to register.
+ * @param  fmap Function map that receives the entries.
+ */
+template <DPNPFuncType FT1, DPNPFuncType... FTs>
+static void func_map_elemwise_2arg_3type_core_no_complex(func_map_t& fmap)
+{
+    /* fold over the comma operator: one assignment per type in FTs */
+    ((fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][FT1][FTs] =
+          {get_fmod_res_type<FT1, FTs>(),
+           (void*)dpnp_fmod_c_ext<func_type_map_t::find_type<get_fmod_res_type<FT1, FTs>()>,
+                                  func_type_map_t::find_type<FT1>,
+                                  func_type_map_t::find_type<FTs>>}),
+     ...);
+}
+
 template <DPNPFuncType... FTs>
 static void func_map_elemwise_2arg_3type_helper(func_map_t& fmap)
 {
     ((func_map_elemwise_2arg_3type_core<FTs, FTs...>(fmap)), ...);
 }
 
+/**
+ * @brief Register kernels for every ordered pair of types taken from FTs.
+ *
+ * Calls func_map_elemwise_2arg_3type_core_no_complex once per type in the
+ * pack, using that type as the first operand type and the whole pack as the
+ * set of second operand types.
+ *
+ * @tparam FTs Function types allowed for both operands.
+ * @param  fmap Function map that receives the entries.
+ */
+template <DPNPFuncType... FTs>
+static void func_map_elemwise_2arg_3type_helper_no_complex(func_map_t& fmap)
+{
+    ((func_map_elemwise_2arg_3type_core_no_complex<FTs, FTs...>(fmap)), ...);
+}
+
 static void func_map_init_elemwise_2arg_3type(func_map_t& fmap)
 {
     fmap[DPNPFuncName::DPNP_FN_ADD][eft_INT][eft_INT] = {eft_INT,
@@ -1539,39 +1578,6 @@ static void func_map_init_elemwise_2arg_3type(func_map_t& fmap)
     fmap[DPNPFuncName::DPNP_FN_FMOD][eft_DBL][eft_DBL] = {eft_DBL,
                                                           (void*)dpnp_fmod_c_default<double, double, double>};
 
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_INT][eft_INT] = {eft_INT,
-                                                              (void*)dpnp_fmod_c_ext<int32_t, int32_t, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_INT][eft_LNG] = {eft_LNG,
-                                                              (void*)dpnp_fmod_c_ext<int64_t, int32_t, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_INT][eft_FLT] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, int32_t, float>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_INT][eft_DBL] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, int32_t, double>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_LNG][eft_INT] = {eft_LNG,
-                                                              (void*)dpnp_fmod_c_ext<int64_t, int64_t, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_LNG][eft_LNG] = {eft_LNG,
-                                                              (void*)dpnp_fmod_c_ext<int64_t, int64_t, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_LNG][eft_FLT] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, int64_t, float>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_LNG][eft_DBL] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, int64_t, double>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_FLT][eft_INT] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, float, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_FLT][eft_LNG] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, float, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_FLT][eft_FLT] = {eft_FLT,
-                                                              (void*)dpnp_fmod_c_ext<float, float, float>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_FLT][eft_DBL] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, float, double>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_DBL][eft_INT] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, double, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_DBL][eft_LNG] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, double, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_DBL][eft_FLT] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, double, float>};
-    fmap[DPNPFuncName::DPNP_FN_FMOD_EXT][eft_DBL][eft_DBL] = {eft_DBL,
-                                                              (void*)dpnp_fmod_c_ext<double, double, double>};
-
     fmap[DPNPFuncName::DPNP_FN_HYPOT][eft_INT][eft_INT] = {eft_DBL,
                                                            (void*)dpnp_hypot_c_default<double, int32_t, int32_t>};
     fmap[DPNPFuncName::DPNP_FN_HYPOT][eft_INT][eft_LNG] = {eft_DBL,
@@ -1918,6 +1924,7 @@ static void func_map_init_elemwise_2arg_3type(func_map_t& fmap)
         eft_DBL, (void*)dpnp_subtract_c_default<double, double, double>};
 
     func_map_elemwise_2arg_3type_helper<eft_BLN, eft_INT, eft_LNG, eft_FLT, eft_DBL, eft_C64, eft_C128>(fmap);
+    func_map_elemwise_2arg_3type_helper_no_complex<eft_BLN, eft_INT, eft_LNG, eft_FLT, eft_DBL>(fmap);
 
     return;
 }

@@ -396,7 +396,7 @@ DPCTLSyclEventRef (*dpnp_any_ext_c)(DPCTLSyclQueueRef,
                 const size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                  \
                 {                                                                                                      \
                     const shape_elem_type *result_strides_data = &dev_strides_data[0];                                 \
-                    const shape_elem_type *input1_strides_data = &dev_strides_data[1];                                 \
+                    const shape_elem_type *input1_strides_data = &dev_strides_data[result_ndim];                       \
                                                                                                                        \
                     size_t input1_id = 0;                                                                              \
                                                                                                                        \
@@ -635,8 +635,8 @@ static void func_map_logic_1arg_1type_helper(func_map_t& fmap)
                 const size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                  \
                 {                                                                                                      \
                     const shape_elem_type *result_strides_data = &dev_strides_data[0];                                 \
-                    const shape_elem_type *input1_strides_data = &dev_strides_data[1];                                 \
-                    const shape_elem_type *input2_strides_data = &dev_strides_data[2];                                 \
+                    const shape_elem_type *input1_strides_data = &dev_strides_data[result_ndim];                       \
+                    const shape_elem_type *input2_strides_data = &dev_strides_data[2 * result_ndim];                   \
                                                                                                                        \
                     size_t input1_id = 0;                                                                              \
                     size_t input2_id = 0;                                                                              \

@@ -294,9 +294,9 @@ DPCTLSyclEventRef dpnp_where_c(DPCTLSyclQueueRef q_ref,
             const size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */
             {
                 const shape_elem_type* result_strides_data = &dev_strides_data[0];
-                const shape_elem_type* condition_strides_data = &dev_strides_data[1];
-                const shape_elem_type* input1_strides_data = &dev_strides_data[2];
-                const shape_elem_type* input2_strides_data = &dev_strides_data[3];
+                const shape_elem_type* condition_strides_data = &dev_strides_data[result_ndim];
+                const shape_elem_type* input1_strides_data = &dev_strides_data[2 * result_ndim];
+                const shape_elem_type* input2_strides_data = &dev_strides_data[3 * result_ndim];
 
                 size_t condition_id = 0;
                 size_t input1_id = 0;

@@ -505,25 +505,33 @@ cdef utils.dpnp_descriptor call_fptr_2in_1out_strides(DPNPFuncName fptr_name,
         return_type = kernel_data.return_type_no_fp64
         func = < fptr_2in_1out_strides_t > kernel_data.ptr_no_fp64
 
-    if out is None:
-        """ Create result array with type given by FPTR data """
+    # check 'out' parameter data
+    if out is not None:
+        if out.shape != result_shape:
+            utils.checker_throw_value_error(func_name, 'out.shape', out.shape, result_shape)
+
+        utils.get_common_usm_allocation(x1_obj, out)  # check USM allocation is common
+
+    if out is None or out.is_array_overlapped(x1_obj) or out.is_array_overlapped(x2_obj) or not out.match_ctype(return_type):
+        """
+        Create result array with type given by FPTR data.
+        If 'out' array has another dtype than expected or overlaps a memory from any input array,
+        we have to create a temporary array and to copy data from the temporary into 'out' array,
+        once the computation is completed.
+        Otherwise simultaneously access to the same memory may cause a race condition issue
+        which will result into undefined behaviour.
+        """
+        is_result_memory_allocated = True
         result = utils.create_output_descriptor(result_shape,
                                                 return_type,
                                                 None,
                                                 device=result_sycl_device,
                                                 usm_type=result_usm_type,
                                                 sycl_queue=result_sycl_queue)
     else:
-        result_type = dpnp_DPNPFuncType_to_dtype(< size_t > return_type)
-        if out.dtype != result_type:
-            utils.checker_throw_value_error(func_name, 'out.dtype', out.dtype, result_type)
-        if out.shape != result_shape:
-            utils.checker_throw_value_error(func_name, 'out.shape', out.shape, result_shape)
-
+        is_result_memory_allocated = False
         result = out
 
-        utils.get_common_usm_allocation(x1_obj, result)  # check USM allocation is common
-
     cdef shape_type_c result_strides = utils.strides_to_vector(result.strides, result_shape)
 
     result_obj = result.get_array()
@@ -554,4 +562,7 @@ cdef utils.dpnp_descriptor call_fptr_2in_1out_strides(DPNPFuncName fptr_name,
     with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref)
     c_dpctl.DPCTLEvent_Delete(event_ref)
 
-    return result
+    if out is not None and is_result_memory_allocated:
+        return out.get_result_desc(result)
+
+    return result.get_result_desc()
@@ -29,6 +29,23 @@
 
 import dpnp
 
+
+def _get_unwrapped_index_key(key):
+    """
+    Return a key where each nested instance of DPNP array is unwrapped into USM ndarray
+    for further processing in DPCTL advanced indexing functions.
+
+    A `dpnp_array` key is replaced by the result of its `get_array()` call.
+    For a tuple key, only its top-level `dpnp_array` items are replaced that way
+    and a new tuple is returned; a tuple without any `dpnp_array` item, as well as
+    a key of any other type, is returned unchanged.
+
+    """
+
+    if isinstance(key, tuple):
+        if any(isinstance(x, dpnp_array) for x in key):
+            # create a new tuple from the input key with unwrapped DPNP arrays
+            return tuple(x.get_array() if isinstance(x, dpnp_array) else x for x in key)
+    elif isinstance(key, dpnp_array):
+        return key.get_array()
+    return key
+
+
 class dpnp_array:
     """
     Multi-dimensional array object.
@@ -176,8 +193,7 @@ def __ge__(self, other):
  # '__getattribute__',
 
     def __getitem__(self, key):
-        if isinstance(key, dpnp_array):
-            key = key.get_array()
+        key = _get_unwrapped_index_key(key)
 
         item = self._array_obj.__getitem__(key)
         if not isinstance(item, dpt.usm_ndarray):
@@ -194,7 +210,10 @@ def __gt__(self, other):
         return dpnp.greater(self, other)
 
  # '__hash__',
- # '__iadd__',
+
+    def __iadd__(self, other):
+        """Implement ``self += other`` by calling `dpnp.add` with ``out=self`` and returning ``self``."""
+        dpnp.add(self, other, out=self)
+        return self
 
     def __iand__(self, other):
         dpnp.bitwise_and(self, other, out=self)
@@ -208,7 +227,10 @@ def __ilshift__(self, other):
 
  # '__imatmul__',
  # '__imod__',
- # '__imul__',
+
+    def __imul__(self, other):
+        """Implement ``self *= other`` by calling `dpnp.multiply` with ``out=self`` and returning ``self``."""
+        dpnp.multiply(self, other, out=self)
+        return self
 
     def __index__(self):
         return self._array_obj.__index__()
@@ -334,8 +356,8 @@ def __rxor__(self, other):
  # '__setattr__',
 
     def __setitem__(self, key, val):
-        if isinstance(key, dpnp_array):
-            key = key.get_array()
+        key = _get_unwrapped_index_key(key)
+
         if isinstance(val, dpnp_array):
             val = val.get_array()
 
@@ -760,6 +782,8 @@ def item(self, id=None):
     @property
     def itemsize(self):
         """
+        Size of one array element in bytes.
+
+        The value is read from the ``itemsize`` attribute of ``self._array_obj``.
+
         """
 
         return self._array_obj.itemsize
@@ -785,11 +809,20 @@ def min(self, axis=None, out=None, keepdims=numpy._NoValue, initial=numpy._NoVal
 
         return dpnp.min(self, axis, out, keepdims, initial, where)
 
- # 'nbytes',
+    @property
+    def nbytes(self):
+        """
+        Total bytes consumed by the elements of the array.
+
+        The value is read from the ``nbytes`` attribute of ``self._array_obj``.
+
+        """
+
+        return self._array_obj.nbytes
 
     @property
     def ndim(self):
         """
+        Number of array dimensions.
+
+        The value is read from the ``ndim`` attribute of ``self._array_obj``.
+
         """
 
         return self._array_obj.ndim