From 0c7f1960114bac7ad2cb4c83c1bb6db5f5435bc3 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Wed, 14 Jun 2023 16:33:57 +0200
Subject: [PATCH] Add inplace support of divide (#1434)

---
 dpnp/dpnp_algo/dpnp_elementwise_common.py     | 26 ++++++++++++++++++-
 tests/test_usm_type.py                        | 20 +++++++++-----
 .../cupy/linalg_tests/test_product.py         |  4 ---
 .../cupy/math_tests/test_arithmetic.py        |  1 -
 4 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 527994a27ad..5115f39a8e7 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -35,6 +35,10 @@
     BinaryElementwiseFunc
 )
 import dpctl.tensor._tensor_impl as ti
+import dpctl.tensor as dpt
+import dpctl
+
+import numpy
 
 
 __all__ = [
@@ -125,12 +129,27 @@ def _call_divide(src1, src2, dst, sycl_queue, depends=[]):
             return vmi._div(sycl_queue, src1, src2, dst, depends)
         return ti._divide(src1, src2, dst, sycl_queue, depends)
 
+    def _call_divide_inplace(lhs, rhs, sycl_queue, depends=[]):
+        """In-place divide workaround (result stored back into `lhs`) until dpctl.tensor provides the functionality."""
+
+        # allocate temporary memory for out array; result_type takes the dtypes as separate arguments, not one tuple
+        out = dpt.empty_like(lhs, dtype=numpy.result_type(lhs.dtype, rhs.dtype))
+
+        # call the general (out-of-place) divide callback, writing into the temporary array
+        div_ht_, div_ev_ = _call_divide(lhs, rhs, out, sycl_queue, depends)
+
+        # copy the result into the left input array (ordered after the divide event) and return the copy's events
+        cp_ht_, cp_ev_ = ti._copy_usm_ndarray_into_usm_ndarray(src=out, dst=lhs, sycl_queue=sycl_queue, depends=[div_ev_])
+        dpctl.SyclEvent.wait_for([div_ht_])  # wait on the first event returned by the divide call before returning
+        return (cp_ht_, cp_ev_)
+
     # dpctl.tensor only works with usm_ndarray or scalar
     x1_usm_or_scalar = dpnp.get_usm_ndarray_or_scalar(x1)
     x2_usm_or_scalar = dpnp.get_usm_ndarray_or_scalar(x2)
     out_usm = None if out is None else dpnp.get_usm_ndarray(out)
 
-    func = BinaryElementwiseFunc("divide", ti._divide_result_type, _call_divide, _divide_docstring_)
+    func = BinaryElementwiseFunc("divide", ti._divide_result_type, _call_divide,
+                                 _divide_docstring_, _call_divide_inplace)
     res_usm = func(x1_usm_or_scalar, x2_usm_or_scalar, out=out_usm, order=order)
     return dpnp_array._create_from_usm_ndarray(res_usm)
 
@@ -208,6 +227,11 @@ def dpnp_subtract(x1, x2, out=None, order='K'):
 
     """
 
+    # TODO: discuss with dpctl whether this check should be moved there
+    if not dpnp.isscalar(x1) and not dpnp.isscalar(x2) and x1.dtype == x2.dtype == dpnp.bool:
+        raise TypeError("DPNP boolean subtract, the `-` operator, is not supported, "
+                        "use the bitwise_xor, the `^` operator, or the logical_xor function instead.")
+
     # dpctl.tensor only works with usm_ndarray or scalar
     x1_usm_or_scalar = dpnp.get_usm_ndarray_or_scalar(x1)
     x2_usm_or_scalar = dpnp.get_usm_ndarray_or_scalar(x2)
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index 9bd0ab16716..61145de42c7 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -19,9 +19,9 @@ def test_coerced_usm_types_sum(usm_type_x, usm_type_y):
 
     z = 1.3 + x + y + 2
 
-    # TODO: unmute once dpctl support that
-    # z += x
-    # z += 7.4
+    # inplace add
+    z += x
+    z += 7.4
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
@@ -36,9 +36,9 @@ def test_coerced_usm_types_mul(usm_type_x, usm_type_y):
 
     z = 3 * x * y * 1.5
 
-    # TODO: unmute once dpctl support that
-    # z *= x
-    # z *= 4.8
+    # inplace multiply
+    z *= x
+    z *= 4.8
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
@@ -53,6 +53,10 @@ def test_coerced_usm_types_subtract(usm_type_x, usm_type_y):
 
     z = 20 - x - y - 7.4
 
+    # inplace subtract
+    z -= x
+    z -= -3.4
+
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
     assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
@@ -66,6 +70,10 @@ def test_coerced_usm_types_divide(usm_type_x, usm_type_y):
 
     z = 2 / x / y / 1.5
 
+    # inplace divide
+    z /= x
+    z /= -2.4
+
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
     assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
diff --git a/tests/third_party/cupy/linalg_tests/test_product.py b/tests/third_party/cupy/linalg_tests/test_product.py
index 0f6a2f22fe8..d25cebbfa67 100644
--- a/tests/third_party/cupy/linalg_tests/test_product.py
+++ b/tests/third_party/cupy/linalg_tests/test_product.py
@@ -228,7 +228,6 @@ def test_transposed_multidim_vdot(self, xp, dtype):
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
-    @pytest.mark.skip("mute until dpctl support in-place add")
     def test_inner(self, xp, dtype):
         a = testing.shaped_arange((5,), xp, dtype)
         b = testing.shaped_reverse_arange((5,), xp, dtype)
@@ -237,7 +236,6 @@ def test_inner(self, xp, dtype):
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
-    @pytest.mark.skip("mute until dpctl support in-place add")
     def test_reversed_inner(self, xp, dtype):
         a = testing.shaped_arange((5,), xp, dtype)[::-1]
         b = testing.shaped_reverse_arange((5,), xp, dtype)[::-1]
@@ -246,7 +244,6 @@ def test_reversed_inner(self, xp, dtype):
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
-    @pytest.mark.skip("mute until dpctl support in-place add")
     def test_multidim_inner(self, xp, dtype):
         a = testing.shaped_arange((2, 3, 4), xp, dtype)
         b = testing.shaped_arange((3, 2, 4), xp, dtype)
@@ -254,7 +251,6 @@ def test_multidim_inner(self, xp, dtype):
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
-    @pytest.mark.skip("mute until dpctl support in-place add")
     def test_transposed_higher_order_inner(self, xp, dtype):
         a = testing.shaped_arange((2, 4, 3), xp, dtype).transpose(2, 0, 1)
         b = testing.shaped_arange((4, 2, 3), xp, dtype).transpose(1, 2, 0)
diff --git a/tests/third_party/cupy/math_tests/test_arithmetic.py b/tests/third_party/cupy/math_tests/test_arithmetic.py
index ade3c4c8f6e..c52b2d2df3a 100644
--- a/tests/third_party/cupy/math_tests/test_arithmetic.py
+++ b/tests/third_party/cupy/math_tests/test_arithmetic.py
@@ -280,7 +280,6 @@ def test_modf(self, xp, dtype):
     'shape': [(3, 2), (), (3, 0, 2)]
 }))
 @testing.gpu
-@pytest.mark.skip("dpctl doesn't raise an error")
 class TestBoolSubtract(unittest.TestCase):
 
     def test_bool_subtract(self):