From 7a56304b346ef44a7d37f3cf4382ded9eccaf0bf Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 29 Jan 2024 13:16:44 +0100
Subject: [PATCH] Update dpnp.linalg.inv() function (#1665)

* Impl dpnp.linalg.inv for 2d array

* Remove an old impl of dpnp_inv

* Add batch implementation of dpnp.linalg.inv func

* Add cupy tests for dpnp.linalg.inf

* Add dpnp tests for dpnp.linalg.inv

* Add check_lapack_dev_info func

* Add dev_info size check for getri_batch and getrf_batch

* Add size check dev_info and error_matrices_ids

* Remove dpnp_inv_ext_c

* Rename check_lapack_dev_info to _check_lapack_dev_info

* Skip test_inv in TestInvInvalid

---------

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/backend/extensions/lapack/CMakeLists.txt |   1 +
 .../backend/extensions/lapack/getrf_batch.cpp |  17 +
 dpnp/backend/extensions/lapack/getri.hpp      |  56 ++++
 .../backend/extensions/lapack/getri_batch.cpp | 293 ++++++++++++++++++
 dpnp/backend/extensions/lapack/lapack_py.cpp  |  10 +
 .../extensions/lapack/types_matrix.hpp        |  26 ++
 dpnp/backend/include/dpnp_iface_fptr.hpp      |   2 -
 dpnp/backend/kernels/dpnp_krnl_linalg.cpp     |  30 --
 dpnp/dpnp_algo/dpnp_algo.pxd                  |   2 -
 dpnp/linalg/dpnp_algo_linalg.pyx              |  41 ---
 dpnp/linalg/dpnp_iface_linalg.py              |  56 +++-
 dpnp/linalg/dpnp_utils_linalg.py              | 167 ++++++++++
 tests/test_linalg.py                          | 134 +++++++-
 tests/test_sycl_queue.py                      |  37 ++-
 tests/test_usm_type.py                        |  30 ++
 .../cupy/linalg_tests/test_solve.py           |  84 ++++-
 16 files changed, 871 insertions(+), 115 deletions(-)
 create mode 100644 dpnp/backend/extensions/lapack/getri.hpp
 create mode 100644 dpnp/backend/extensions/lapack/getri_batch.cpp

diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index 0cfdc1a1677..626615e3e53 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -30,6 +30,7 @@ set(_module_src
     ${CMAKE_CURRENT_SOURCE_DIR}/gesv.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getrf.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getrf_batch.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/getri_batch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/heevd.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/potrf.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp
diff --git a/dpnp/backend/extensions/lapack/getrf_batch.cpp b/dpnp/backend/extensions/lapack/getrf_batch.cpp
index 76977bf6628..a8993121809 100644
--- a/dpnp/backend/extensions/lapack/getrf_batch.cpp
+++ b/dpnp/backend/extensions/lapack/getrf_batch.cpp
@@ -116,6 +116,15 @@ static sycl::event getrf_batch_impl(sycl::queue exec_q,
         // Get the indices of the first zero diagonal elements of these matrices
         auto error_info = be.exceptions();
 
+        auto error_matrices_ids_size = error_matrices_ids.size();
+        auto dev_info_size = static_cast<std::size_t>(py::len(dev_info));
+        if (error_matrices_ids_size != dev_info_size) {
+            throw py::value_error("The size of `dev_info` must be equal to" +
+                                  std::to_string(error_matrices_ids_size) +
+                                  ", but currently it is " +
+                                  std::to_string(dev_info_size) + ".");
+        }
+
         for (size_t i = 0; i < error_matrices_ids.size(); ++i) {
             // Assign the index of the first zero diagonal element in each
             // error matrix to the corresponding index in 'dev_info'
@@ -190,6 +199,14 @@ std::pair<sycl::event, sycl::event>
                               ", but a 2-dimensional array is expected.");
     }
 
+    const int dev_info_size = py::len(dev_info);
+    if (dev_info_size != batch_size) {
+        throw py::value_error("The size of 'dev_info' (" +
+                              std::to_string(dev_info_size) +
+                              ") does not match the expected batch size (" +
+                              std::to_string(batch_size) + ").");
+    }
+
     // check compatibility of execution queue and allocation queue
     if (!dpctl::utils::queues_are_compatible(exec_q, {a_array, ipiv_array})) {
         throw py::value_error(
diff --git a/dpnp/backend/extensions/lapack/getri.hpp b/dpnp/backend/extensions/lapack/getri.hpp
new file mode 100644
index 00000000000..75e9b16d4ef
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/getri.hpp
@@ -0,0 +1,56 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+extern std::pair<sycl::event, sycl::event>
+    getri_batch(sycl::queue exec_q,
+                dpctl::tensor::usm_ndarray a_array,
+                dpctl::tensor::usm_ndarray ipiv_array,
+                py::list dev_info,
+                std::int64_t n,
+                std::int64_t stride_a,
+                std::int64_t stride_ipiv,
+                std::int64_t batch_size,
+                const std::vector<sycl::event> &depends = {});
+
+extern void init_getri_batch_dispatch_vector(void);
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/getri_batch.cpp b/dpnp/backend/extensions/lapack/getri_batch.cpp
new file mode 100644
index 00000000000..c6315e29427
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/getri_batch.cpp
@@ -0,0 +1,293 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "getri.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+namespace mkl_lapack = oneapi::mkl::lapack;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*getri_batch_impl_fn_ptr_t)(
+    sycl::queue,
+    std::int64_t,
+    char *,
+    std::int64_t,
+    std::int64_t,
+    std::int64_t *,
+    std::int64_t,
+    std::int64_t,
+    py::list,
+    std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+static getri_batch_impl_fn_ptr_t
+    getri_batch_dispatch_vector[dpctl_td_ns::num_types];
+
+template <typename T>
+static sycl::event getri_batch_impl(sycl::queue exec_q,
+                                    std::int64_t n,
+                                    char *in_a,
+                                    std::int64_t lda,
+                                    std::int64_t stride_a,
+                                    std::int64_t *ipiv,
+                                    std::int64_t stride_ipiv,
+                                    std::int64_t batch_size,
+                                    py::list dev_info,
+                                    std::vector<sycl::event> &host_task_events,
+                                    const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+
+    T *a = reinterpret_cast<T *>(in_a);
+
+    const std::int64_t scratchpad_size =
+        mkl_lapack::getri_batch_scratchpad_size<T>(exec_q, n, lda, stride_a,
+                                                   stride_ipiv, batch_size);
+    T *scratchpad = nullptr;
+
+    std::stringstream error_msg;
+    std::int64_t info = 0;
+    bool is_exception_caught = false;
+
+    sycl::event getri_batch_event;
+    try {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+
+        getri_batch_event = mkl_lapack::getri_batch(
+            exec_q,
+            n, // The order of each square matrix in the batch; (0 ≤ n).
+               // It must be a non-negative integer.
+            a, // Pointer to the batch of square matrices, each of size (n x n).
+            lda,      // The leading dimension of each matrix in the batch.
+            stride_a, // Stride between consecutive matrices in the batch.
+            ipiv, // Pointer to the array of pivot indices for each matrix in
+                  // the batch.
+            stride_ipiv, // Stride between pivot indices: Spacing between pivot
+                         // arrays in 'ipiv'.
+            batch_size,  // Total number of matrices in the batch.
+            scratchpad,  // Pointer to scratchpad memory to be used by MKL
+                         // routine for storing intermediate results.
+            scratchpad_size, depends);
+    } catch (mkl_lapack::batch_error const &be) {
+        // Get the indices of matrices within the batch that encountered an
+        // error
+        auto error_matrices_ids = be.ids();
+        // Get the indices of the first zero diagonal elements of these matrices
+        auto error_info = be.exceptions();
+
+        auto error_matrices_ids_size = error_matrices_ids.size();
+        auto dev_info_size = static_cast<std::size_t>(py::len(dev_info));
+        if (error_matrices_ids_size != dev_info_size) {
+            throw py::value_error("The size of `dev_info` must be equal to" +
+                                  std::to_string(error_matrices_ids_size) +
+                                  ", but currently it is " +
+                                  std::to_string(dev_info_size) + ".");
+        }
+
+        for (size_t i = 0; i < error_matrices_ids.size(); ++i) {
+            // Assign the index of the first zero diagonal element in each
+            // error matrix to the corresponding index in 'dev_info'
+            dev_info[error_matrices_ids[i]] = error_info[i];
+        }
+    } catch (mkl_lapack::exception const &e) {
+        is_exception_caught = true;
+        info = e.info();
+
+        if (info < 0) {
+            error_msg << "Parameter number " << -info
+                      << " had an illegal value.";
+        }
+        else if (info == scratchpad_size && e.detail() != 0) {
+            error_msg
+                << "Insufficient scratchpad size. Required size is at least "
+                << e.detail();
+        }
+        else {
+            error_msg << "Unexpected MKL exception caught during getri_batch() "
+                         "call:\nreason: "
+                      << e.what() << "\ninfo: " << e.info();
+        }
+    } catch (sycl::exception const &e) {
+        is_exception_caught = true;
+        error_msg
+            << "Unexpected SYCL exception caught during getri_batch() call:\n"
+            << e.what();
+    }
+
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+
+        throw std::runtime_error(error_msg.str());
+    }
+
+    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(getri_batch_event);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); });
+    });
+    host_task_events.push_back(clean_up_event);
+    return getri_batch_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    getri_batch(sycl::queue exec_q,
+                dpctl::tensor::usm_ndarray a_array,
+                dpctl::tensor::usm_ndarray ipiv_array,
+                py::list dev_info,
+                std::int64_t n,
+                std::int64_t stride_a,
+                std::int64_t stride_ipiv,
+                std::int64_t batch_size,
+                const std::vector<sycl::event> &depends)
+{
+    const int a_array_nd = a_array.get_ndim();
+    const int ipiv_array_nd = ipiv_array.get_ndim();
+
+    if (a_array_nd < 3) {
+        throw py::value_error(
+            "The input array has ndim=" + std::to_string(a_array_nd) +
+            ", but an array with ndim >= 3 is expected.");
+    }
+
+    if (ipiv_array_nd != 2) {
+        throw py::value_error("The array of pivot indices has ndim=" +
+                              std::to_string(ipiv_array_nd) +
+                              ", but a 2-dimensional array is expected.");
+    }
+
+    const int dev_info_size = py::len(dev_info);
+    if (dev_info_size != batch_size) {
+        throw py::value_error("The size of 'dev_info' (" +
+                              std::to_string(dev_info_size) +
+                              ") does not match the expected batch size (" +
+                              std::to_string(batch_size) + ").");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q, {a_array, ipiv_array})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(a_array, ipiv_array)) {
+        throw py::value_error("The input array and the array of pivot indices "
+                              "are overlapping segments of memory");
+    }
+
+    bool is_a_array_c_contig = a_array.is_c_contiguous();
+    bool is_ipiv_array_c_contig = ipiv_array.is_c_contiguous();
+    if (!is_a_array_c_contig) {
+        throw py::value_error("The input array "
+                              "must be C-contiguous");
+    }
+    if (!is_ipiv_array_c_contig) {
+        throw py::value_error("The array of pivot indices "
+                              "must be C-contiguous");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int a_array_type_id =
+        array_types.typenum_to_lookup_id(a_array.get_typenum());
+
+    getri_batch_impl_fn_ptr_t getri_batch_fn =
+        getri_batch_dispatch_vector[a_array_type_id];
+    if (getri_batch_fn == nullptr) {
+        throw py::value_error(
+            "No getri_batch implementation defined for the provided type "
+            "of the input matrix.");
+    }
+
+    auto ipiv_types = dpctl_td_ns::usm_ndarray_types();
+    int ipiv_array_type_id =
+        ipiv_types.typenum_to_lookup_id(ipiv_array.get_typenum());
+
+    if (ipiv_array_type_id != static_cast<int>(dpctl_td_ns::typenum_t::INT64)) {
+        throw py::value_error("The type of 'ipiv_array' must be int64.");
+    }
+
+    char *a_array_data = a_array.get_data();
+    const std::int64_t lda = std::max<size_t>(1UL, n);
+
+    char *ipiv_array_data = ipiv_array.get_data();
+    std::int64_t *d_ipiv = reinterpret_cast<std::int64_t *>(ipiv_array_data);
+
+    std::vector<sycl::event> host_task_events;
+    sycl::event getri_batch_ev = getri_batch_fn(
+        exec_q, n, a_array_data, lda, stride_a, d_ipiv, stride_ipiv, batch_size,
+        dev_info, host_task_events, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {a_array, ipiv_array}, host_task_events);
+
+    return std::make_pair(args_ev, getri_batch_ev);
+}
+
+template <typename fnT, typename T>
+struct GetriBatchContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::GetriBatchTypePairSupportFactory<T>::is_defined) {
+            return getri_batch_impl<T>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_getri_batch_dispatch_vector(void)
+{
+    dpctl_td_ns::DispatchVectorBuilder<getri_batch_impl_fn_ptr_t,
+                                       GetriBatchContigFactory,
+                                       dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_vector(getri_batch_dispatch_vector);
+}
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp
index f97a0dc4a43..71991be3652 100644
--- a/dpnp/backend/extensions/lapack/lapack_py.cpp
+++ b/dpnp/backend/extensions/lapack/lapack_py.cpp
@@ -32,6 +32,7 @@
 
 #include "gesv.hpp"
 #include "getrf.hpp"
+#include "getri.hpp"
 #include "heevd.hpp"
 #include "linalg_exceptions.hpp"
 #include "potrf.hpp"
@@ -46,6 +47,7 @@ void init_dispatch_vectors(void)
     lapack_ext::init_gesv_dispatch_vector();
     lapack_ext::init_getrf_batch_dispatch_vector();
     lapack_ext::init_getrf_dispatch_vector();
+    lapack_ext::init_getri_batch_dispatch_vector();
     lapack_ext::init_potrf_batch_dispatch_vector();
     lapack_ext::init_potrf_dispatch_vector();
     lapack_ext::init_syevd_dispatch_vector();
@@ -88,6 +90,14 @@ PYBIND11_MODULE(_lapack_impl, m)
           py::arg("stride_ipiv"), py::arg("batch_size"),
           py::arg("depends") = py::list());
 
+    m.def("_getri_batch", &lapack_ext::getri_batch,
+          "Call `getri_batch` from OneMKL LAPACK library to return "
+          "the inverses of a batch of LU-factored matrices",
+          py::arg("sycl_queue"), py::arg("a_array"), py::arg("ipiv_array"),
+          py::arg("dev_info"), py::arg("n"), py::arg("stride_a"),
+          py::arg("stride_ipiv"), py::arg("batch_size"),
+          py::arg("depends") = py::list());
+
     m.def("_heevd", &lapack_ext::heevd,
           "Call `heevd` from OneMKL LAPACK library to return "
           "the eigenvalues and eigenvectors of a complex Hermitian matrix",
diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp
index 0277e630738..7e5413b84c8 100644
--- a/dpnp/backend/extensions/lapack/types_matrix.hpp
+++ b/dpnp/backend/extensions/lapack/types_matrix.hpp
@@ -122,6 +122,32 @@ struct GetrfBatchTypePairSupportFactory
         dpctl_td_ns::NotDefinedEntry>::is_defined;
 };
 
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL LAPACK library provides support in oneapi::mkl::lapack::getri_batch<T>
+ * function.
+ *
+ * @tparam T Type of array containing input matrix (LU-factored form),
+ * as well as the output array for storing the inverse of the matrix.
+ */
+template <typename T>
+struct GetriBatchTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<T, double, T, double>,
+        dpctl_td_ns::TypePairDefinedEntry<T, float, T, float>,
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<float>,
+                                          T,
+                                          std::complex<float>>,
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<double>,
+                                          T,
+                                          std::complex<double>>,
+        // fall-through
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
 /**
  * @brief A factory to define pairs of supported types for which
  * MKL LAPACK library provides support in oneapi::mkl::lapack::heevd<T, RealT>
diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp
index 87826cc3bb9..2e2ce5ab144 100644
--- a/dpnp/backend/include/dpnp_iface_fptr.hpp
+++ b/dpnp/backend/include/dpnp_iface_fptr.hpp
@@ -174,8 +174,6 @@ enum class DPNPFuncName : size_t
     DPNP_FN_INITVAL_EXT, /**< Used in numpy ones, ones_like, zeros, zeros_like
                             impls  */
     DPNP_FN_INV,         /**< Used in numpy.linalg.inv() impl  */
-    DPNP_FN_INV_EXT,     /**< Used in numpy.linalg.inv() impl, requires extra
-                            parameters  */
     DPNP_FN_INVERT,      /**< Used in numpy.invert() impl  */
     DPNP_FN_KRON,        /**< Used in numpy.kron() impl  */
     DPNP_FN_KRON_EXT, /**< Used in numpy.kron() impl, requires extra parameters
diff --git a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp
index 297f963ba28..e0b6de5b1b6 100644
--- a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp
@@ -378,15 +378,6 @@ template <typename _DataType, typename _ResultType>
 void (*dpnp_inv_default_c)(void *, void *, shape_elem_type *, size_t) =
     dpnp_inv_c<_DataType, _ResultType>;
 
-template <typename _DataType, typename _ResultType>
-DPCTLSyclEventRef (*dpnp_inv_ext_c)(DPCTLSyclQueueRef,
-                                    void *,
-                                    void *,
-                                    shape_elem_type *,
-                                    size_t,
-                                    const DPCTLEventVectorRef) =
-    dpnp_inv_c<_DataType, _ResultType>;
-
 template <typename _DataType1, typename _DataType2, typename _ResultType>
 class dpnp_kron_c_kernel;
 
@@ -869,27 +860,6 @@ void func_map_init_linalg_func(func_map_t &fmap)
     fmap[DPNPFuncName::DPNP_FN_INV][eft_DBL][eft_DBL] = {
         eft_DBL, (void *)dpnp_inv_default_c<double, double>};
 
-    fmap[DPNPFuncName::DPNP_FN_INV_EXT][eft_INT][eft_INT] = {
-        get_default_floating_type(),
-        (void *)dpnp_inv_ext_c<
-            int32_t, func_type_map_t::find_type<get_default_floating_type()>>,
-        get_default_floating_type<std::false_type>(),
-        (void *)dpnp_inv_ext_c<
-            int32_t, func_type_map_t::find_type<
-                         get_default_floating_type<std::false_type>()>>};
-    fmap[DPNPFuncName::DPNP_FN_INV_EXT][eft_LNG][eft_LNG] = {
-        get_default_floating_type(),
-        (void *)dpnp_inv_ext_c<
-            int64_t, func_type_map_t::find_type<get_default_floating_type()>>,
-        get_default_floating_type<std::false_type>(),
-        (void *)dpnp_inv_ext_c<
-            int64_t, func_type_map_t::find_type<
-                         get_default_floating_type<std::false_type>()>>};
-    fmap[DPNPFuncName::DPNP_FN_INV_EXT][eft_FLT][eft_FLT] = {
-        eft_FLT, (void *)dpnp_inv_ext_c<float, float>};
-    fmap[DPNPFuncName::DPNP_FN_INV_EXT][eft_DBL][eft_DBL] = {
-        eft_DBL, (void *)dpnp_inv_ext_c<double, double>};
-
     fmap[DPNPFuncName::DPNP_FN_KRON][eft_INT][eft_INT] = {
         eft_INT, (void *)dpnp_kron_default_c<int32_t, int32_t, int32_t>};
     fmap[DPNPFuncName::DPNP_FN_KRON][eft_INT][eft_LNG] = {
diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd
index 3fcb67aac9e..895b393aeff 100644
--- a/dpnp/dpnp_algo/dpnp_algo.pxd
+++ b/dpnp/dpnp_algo/dpnp_algo.pxd
@@ -78,8 +78,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName":  # need this na
         DPNP_FN_FMOD_EXT
         DPNP_FN_FULL
         DPNP_FN_FULL_LIKE
-        DPNP_FN_INV
-        DPNP_FN_INV_EXT
         DPNP_FN_KRON
         DPNP_FN_KRON_EXT
         DPNP_FN_MATRIX_RANK
diff --git a/dpnp/linalg/dpnp_algo_linalg.pyx b/dpnp/linalg/dpnp_algo_linalg.pyx
index 6cf3bd3578f..1d94a893fff 100644
--- a/dpnp/linalg/dpnp_algo_linalg.pyx
+++ b/dpnp/linalg/dpnp_algo_linalg.pyx
@@ -48,7 +48,6 @@ __all__ = [
     "dpnp_cond",
     "dpnp_eig",
     "dpnp_eigvals",
-    "dpnp_inv",
     "dpnp_matrix_rank",
     "dpnp_norm",
     "dpnp_qr",
@@ -186,46 +185,6 @@ cpdef utils.dpnp_descriptor dpnp_eigvals(utils.dpnp_descriptor input):
     return res_val
 
 
-cpdef utils.dpnp_descriptor dpnp_inv(utils.dpnp_descriptor input):
-    cdef shape_type_c input_shape = input.shape
-
-    cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(input.dtype)
-
-    cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_INV_EXT, param1_type, param1_type)
-
-    input_obj = input.get_array()
-
-    cdef (DPNPFuncType, void *) ret_type_and_func = utils.get_ret_type_and_func(kernel_data,
-                                                                                input_obj.sycl_device.has_aspect_fp64)
-    cdef DPNPFuncType return_type = ret_type_and_func[0]
-    cdef custom_linalg_1in_1out_func_ptr_t func = < custom_linalg_1in_1out_func_ptr_t > ret_type_and_func[1]
-
-    # create result array with type given by FPTR data
-    cdef utils.dpnp_descriptor result = utils.create_output_descriptor(input_shape,
-                                                                       return_type,
-                                                                       None,
-                                                                       device=input_obj.sycl_device,
-                                                                       usm_type=input_obj.usm_type,
-                                                                       sycl_queue=input_obj.sycl_queue)
-
-    result_sycl_queue = result.get_array().sycl_queue
-
-    cdef c_dpctl.SyclQueue q = <c_dpctl.SyclQueue> result_sycl_queue
-    cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref()
-
-    cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref,
-                                                    input.get_data(),
-                                                    result.get_data(),
-                                                    input_shape.data(),
-                                                    input.ndim,
-                                                    NULL)  # dep_events_ref
-
-    with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref)
-    c_dpctl.DPCTLEvent_Delete(event_ref)
-
-    return result
-
-
 cpdef utils.dpnp_descriptor dpnp_matrix_rank(utils.dpnp_descriptor input):
     cdef shape_type_c input_shape = input.shape
     cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(input.dtype)
diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index 2f7420d1329..800aa8de1bb 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -50,6 +50,7 @@
     dpnp_cholesky,
     dpnp_det,
     dpnp_eigh,
+    dpnp_inv,
     dpnp_slogdet,
     dpnp_solve,
 )
@@ -316,30 +317,51 @@ def eigvals(input):
     return call_origin(numpy.linalg.eigvals, input)
 
 
-def inv(input):
+def inv(a):
     """
-    Divide arguments element-wise.
+    Compute the (multiplicative) inverse of a matrix.
+
+    Given a square matrix a, return the matrix ainv
+    satisfying ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``.
 
     For full documentation refer to :obj:`numpy.linalg.inv`.
 
-    Limitations
-    -----------
-        Input array is supported as :obj:`dpnp.ndarray`.
-        Dimension of input array is supported to be equal to ``2``.
-        Shape of input array is limited by ``input.shape[0] == input.shape[1]``, ``input.shape[0] >= 2``.
-        Otherwise the function will be executed sequentially on CPU.
+    Parameters
+    ----------
+    a : (..., M, M) {dpnp.ndarray, usm_ndarray}
+        Matrix to be inverted.
+
+    Returns
+    -------
+    out : (..., M, M) dpnp.ndarray
+        (Multiplicative) inverse of the matrix a.
+
+    Examples
+    --------
+    >>> import dpnp as np
+    >>> a = np.array([[1., 2.], [3., 4.]])
+    >>> ainv = np.linalg.inv(a)
+    >>> np.allclose(np.dot(a, ainv), np.eye(2))
+    array([ True])
+    >>> np.allclose(np.dot(ainv, a), np.eye(2))
+    array([ True])
+
+    Inverses of several matrices can be computed at once:
+    >>> a = np.array([[[1., 2.], [3., 4.]], [[1, 3], [3, 5]]])
+    >>> np.linalg.inv(a)
+    array([[[-2.  ,  1.  ],
+            [ 1.5 , -0.5 ]],
+
+          [[-1.25,  0.75],
+            [ 0.75, -0.25]]])
+
     """
 
-    x1_desc = dpnp.get_dpnp_descriptor(input, copy_when_nondefault_queue=False)
-    if x1_desc:
-        if (
-            x1_desc.ndim == 2
-            and x1_desc.shape[0] == x1_desc.shape[1]
-            and x1_desc.shape[0] >= 2
-        ):
-            return dpnp_inv(x1_desc).get_pyobj()
+    dpnp.check_supported_arrays_type(a)
+    check_stacked_2d(a)
+    check_stacked_square(a)
 
-    return call_origin(numpy.linalg.inv, input)
+    return dpnp_inv(a)
 
 
 def matrix_power(input, count):
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 40159ac02bc..f2632b5b6a4 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -37,6 +37,7 @@
     "dpnp_cholesky",
     "dpnp_det",
     "dpnp_eigh",
+    "dpnp_inv",
     "dpnp_slogdet",
     "dpnp_solve",
 ]
@@ -96,6 +97,33 @@ def _calculate_determinant_sign(ipiv, diag, res_type, n):
     return sign.astype(res_type)
 
 
+def _check_lapack_dev_info(dev_info, error_msg=None):
+    """
+    Check `dev_info` from OneMKL LAPACK routines, raising an error for failures.
+
+    Parameters
+    ----------
+    dev_info : list of ints
+        Each element of the list indicates the status of OneMKL LAPACK routine calls.
+        A non-zero value signifies a failure.
+
+    error_message : str, optional
+        Custom error message for detected LAPACK errors.
+        Default: `Singular matrix`
+
+    Raises
+    ------
+    dpnp.linalg.LinAlgError
+        On non-zero elements in dev_info, indicating LAPACK errors.
+
+    """
+
+    if any(dev_info):
+        error_msg = error_msg or "Singular matrix"
+
+        raise dpnp.linalg.LinAlgError(error_msg)
+
+
 def _real_type(dtype, device=None):
     """
     Returns the real data type corresponding to a given dpnp data type.
@@ -740,6 +768,145 @@ def dpnp_eigh(a, UPLO):
         return w, out_v
 
 
+def dpnp_inv_batched(a, res_type):
+    """
+    dpnp_inv_batched(a, res_type)
+
+    Return the inverses of each matrix in a batch of matrices `a`.
+
+    The inverse of a matrix is such that if it is multiplied by the original matrix,
+    it results in the identity matrix. This function computes the inverses of a batch
+    of square matrices.
+    """
+
+    orig_shape = a.shape
+    # get 3d input arrays by reshape
+    a = a.reshape(-1, orig_shape[-2], orig_shape[-1])
+    batch_size = a.shape[0]
+    a_usm_arr = dpnp.get_usm_ndarray(a)
+    a_sycl_queue = a.sycl_queue
+    a_usm_type = a.usm_type
+    n = a.shape[1]
+
+    # oneMKL LAPACK getri_batch overwrites `a`
+    a_h = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type)
+    ipiv_h = dpnp.empty(
+        (batch_size, n),
+        dtype=dpnp.int64,
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
+    dev_info = [0] * batch_size
+
+    # use DPCTL tensor function to fill the matrix array
+    # with content from the input array `a`
+    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr, dst=a_h.get_array(), sycl_queue=a.sycl_queue
+    )
+
+    ipiv_stride = n
+    a_stride = a_h.strides[0]
+
+    # Call the LAPACK extension function _getrf_batch
+    # to perform LU decomposition of a batch of general matrices
+    ht_getrf_ev, getrf_ev = li._getrf_batch(
+        a_sycl_queue,
+        a_h.get_array(),
+        ipiv_h.get_array(),
+        dev_info,
+        n,
+        a_stride,
+        ipiv_stride,
+        batch_size,
+        [a_copy_ev],
+    )
+
+    _check_lapack_dev_info(dev_info)
+
+    # Call the LAPACK extension function _getri_batch
+    # to compute the inverse of a batch of matrices using the results
+    # from the LU decomposition performed by _getrf_batch
+    ht_getri_ev, _ = li._getri_batch(
+        a_sycl_queue,
+        a_h.get_array(),
+        ipiv_h.get_array(),
+        dev_info,
+        n,
+        a_stride,
+        ipiv_stride,
+        batch_size,
+        [getrf_ev],
+    )
+
+    _check_lapack_dev_info(dev_info)
+
+    ht_getri_ev.wait()
+    ht_getrf_ev.wait()
+    a_ht_copy_ev.wait()
+
+    return a_h.reshape(orig_shape)
+
+
+def dpnp_inv(a):
+    """
+    dpnp_inv(a)
+
+    Return the inverse of `a` matrix.
+
+    The inverse of a matrix is such that if it is multiplied by the original matrix,
+    it results in the identity matrix. This function computes the inverse of a single
+    square matrix.
+
+    """
+
+    res_type = _common_type(a)
+    if a.size == 0:
+        return dpnp.empty_like(a, dtype=res_type)
+
+    if a.ndim >= 3:
+        return dpnp_inv_batched(a, res_type)
+
+    a_usm_arr = dpnp.get_usm_ndarray(a)
+    a_sycl_queue = a.sycl_queue
+    a_usm_type = a.usm_type
+
+    a_order = "C" if a.flags.c_contiguous else "F"
+    a_shape = a.shape
+
+    # oneMKL LAPACK gesv overwrites `a` and assumes fortran-like array as input.
+    # To use C-contiguous arrays, we transpose them before passing to gesv.
+    # This transposition is effective because the input array `a` is square.
+    a_f = dpnp.empty_like(a, order=a_order, dtype=res_type)
+
+    # use DPCTL tensor function to fill the coefficient matrix array
+    # with content from the input array `a`
+    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr, dst=a_f.get_array(), sycl_queue=a_sycl_queue
+    )
+
+    b_f = dpnp.eye(
+        a_shape[0],
+        dtype=res_type,
+        order=a_order,
+        sycl_queue=a_sycl_queue,
+        usm_type=a_usm_type,
+    )
+
+    if a_order == "F":
+        ht_lapack_ev, _ = li._gesv(
+            a_sycl_queue, a_f.get_array(), b_f.get_array(), [a_copy_ev]
+        )
+    else:
+        ht_lapack_ev, _ = li._gesv(
+            a_sycl_queue, a_f.T.get_array(), b_f.T.get_array(), [a_copy_ev]
+        )
+
+    ht_lapack_ev.wait()
+    a_ht_copy_ev.wait()
+
+    return b_f
+
+
 def dpnp_solve(a, b):
     """
     dpnp_solve(a, b)
diff --git a/tests/test_linalg.py b/tests/test_linalg.py
index 5f38421c6ec..5ea536c2887 100644
--- a/tests/test_linalg.py
+++ b/tests/test_linalg.py
@@ -435,18 +435,128 @@ def test_eigvals(type):
         assert_allclose(expected, result, atol=0.5)
 
 
-@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True))
-@pytest.mark.parametrize(
-    "array",
-    [[[1.0, 2.0], [3.0, 4.0]], [[0, 1, 2], [3, 2, -1], [4, -2, 3]]],
-    ids=["[[1., 2.], [3., 4.]]", "[[0, 1, 2], [3, 2, -1], [4, -2, 3]]"],
-)
-def test_inv(type, array):
-    a = numpy.array(array, dtype=type)
-    ia = inp.array(a)
-    result = inp.linalg.inv(ia)
-    expected = numpy.linalg.inv(a)
-    assert_allclose(expected, result, rtol=1e-06)
+class TestInv:
+    @pytest.mark.parametrize(
+        "array",
+        [
+            [[1, 2], [3, 4]],
+            [[[1, 2], [3, 4]], [[1, 2], [2, 1]], [[1, 3], [3, 1]]],
+            [
+                [[[1, 2], [3, 4]], [[1, 2], [2, 1]]],
+                [[[1, 3], [3, 1]], [[0, 1], [1, 3]]],
+            ],
+        ],
+        ids=[
+            "2D_array",
+            "3D_array",
+            "4D_array",
+        ],
+    )
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_inv(self, array, dtype):
+        a = numpy.array(array, dtype=dtype)
+        ia = inp.array(a)
+        result = inp.linalg.inv(ia)
+        expected = numpy.linalg.inv(a)
+        assert_dtype_allclose(result, expected)
+
+    def test_inv_strides(self):
+        a_np = numpy.array(
+            [
+                [2, 3, 1, 4, 5],
+                [5, 6, 7, 8, 9],
+                [9, 7, 7, 2, 3],
+                [1, 4, 5, 1, 8],
+                [8, 9, 8, 5, 3],
+            ]
+        )
+
+        a_dp = inp.array(a_np)
+
+        # positive strides
+        expected = numpy.linalg.inv(a_np[::2, ::2])
+        result = inp.linalg.inv(a_dp[::2, ::2])
+        assert_allclose(expected, result, rtol=1e-3, atol=1e-4)
+
+        # negative strides
+        expected = numpy.linalg.inv(a_np[::-2, ::-2])
+        result = inp.linalg.inv(a_dp[::-2, ::-2])
+        assert_allclose(expected, result, rtol=1e-3, atol=1e-4)
+
+    @pytest.mark.parametrize(
+        "shape",
+        [
+            (0, 0),
+            (3, 0, 0),
+            (0, 2, 2),
+        ],
+        ids=[
+            "(0, 0)",
+            "(3, 0, 0)",
+            "(0, 2, 2)",
+        ],
+    )
+    def test_inv_empty(self, shape):
+        a = numpy.empty(shape)
+        ia = inp.array(a)
+        result = inp.linalg.inv(ia)
+        expected = numpy.linalg.inv(a)
+        assert_dtype_allclose(result, expected)
+
+    # TODO: remove skipif when MKLD-16626 is resolved
+    @pytest.mark.skipif(is_cpu_device(), reason="MKLD-16626")
+    @pytest.mark.parametrize(
+        "matrix",
+        [
+            [[1, 2], [2, 4]],
+            [[0, 0], [0, 0]],
+            [[1, 1], [1, 1]],
+            [[2, 4], [1, 2]],
+            [[1, 2], [0, 0]],
+            [[1, 0], [2, 0]],
+        ],
+        ids=[
+            "Linearly dependent rows",
+            "Zero matrix",
+            "Identical rows",
+            "Linearly dependent columns",
+            "Zero row",
+            "Zero column",
+        ],
+    )
+    def test_inv_singular_matrix(self, matrix):
+        a_np = numpy.array(matrix, dtype="float32")
+        a_dp = inp.array(a_np)
+
+        assert_raises(numpy.linalg.LinAlgError, numpy.linalg.inv, a_np)
+        assert_raises(inp.linalg.LinAlgError, inp.linalg.inv, a_dp)
+
+    # TODO: remove skipif when MKLD-16626 is resolved
+    # _getrf_batch does not raise an error with singular matrices.
+    @pytest.mark.skip("MKLD-16626")
+    def test_inv_singular_matrix_3D(self):
+        a_np = numpy.array(
+            [[[1, 2], [3, 4]], [[1, 2], [1, 2]], [[1, 3], [3, 1]]]
+        )
+        a_dp = inp.array(a_np)
+
+        assert_raises(numpy.linalg.LinAlgError, numpy.linalg.inv, a_np)
+        assert_raises(inp.linalg.LinAlgError, inp.linalg.inv, a_dp)
+
+    def test_inv_errors(self):
+        a_dp = inp.array([[1, 2], [2, 5]], dtype="float32")
+
+        # unsupported type
+        a_np = inp.asnumpy(a_dp)
+        assert_raises(TypeError, inp.linalg.inv, a_np)
+
+        # a.ndim < 2
+        a_dp_ndim_1 = a_dp.flatten()
+        assert_raises(inp.linalg.LinAlgError, inp.linalg.inv, a_dp_ndim_1)
+
+        # a is not square
+        a_dp = inp.ones((2, 3))
+        assert_raises(inp.linalg.LinAlgError, inp.linalg.inv, a_dp)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index ee247aa5414..4b0d21eae40 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -1136,21 +1136,42 @@ def test_eigvals(device):
     assert_sycl_queue_equal(result_queue, expected_queue)
 
 
+@pytest.mark.parametrize(
+    "shape, is_empty",
+    [
+        ((2, 2), False),
+        ((3, 2, 2), False),
+        ((0, 0), True),
+        ((0, 2, 2), True),
+    ],
+    ids=[
+        "(2, 2)",
+        "(3, 2, 2)",
+        "(0, 0)",
+        "(0, 2, 2)",
+    ],
+)
 @pytest.mark.parametrize(
     "device",
     valid_devices,
     ids=[device.filter_string for device in valid_devices],
 )
-def test_inv(device):
-    data = [[1.0, 2.0], [3.0, 4.0]]
-    numpy_data = numpy.array(data)
-    dpnp_data = dpnp.array(data, device=device)
+def test_inv(shape, is_empty, device):
+    if is_empty:
+        numpy_x = numpy.empty(shape, dtype=dpnp.default_float_type(device))
+    else:
+        count_elem = numpy.prod(shape)
+        numpy_x = numpy.arange(
+            1, count_elem + 1, dtype=dpnp.default_float_type()
+        ).reshape(shape)
 
-    result = dpnp.linalg.inv(dpnp_data)
-    expected = numpy.linalg.inv(numpy_data)
-    assert_allclose(expected, result)
+    dpnp_x = dpnp.array(numpy_x, device=device)
 
-    expected_queue = dpnp_data.get_array().sycl_queue
+    result = dpnp.linalg.inv(dpnp_x)
+    expected = numpy.linalg.inv(numpy_x)
+    assert_dtype_allclose(result, expected)
+
+    expected_queue = dpnp_x.get_array().sycl_queue
     result_queue = result.get_array().sycl_queue
 
     assert_sycl_queue_equal(result_queue, expected_queue)
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index 554f939b208..ada68ebfa6c 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -710,3 +710,33 @@ def test_det(shape, is_empty, usm_type):
     det = dp.linalg.det(x)
 
     assert x.usm_type == det.usm_type
+
+
+@pytest.mark.parametrize("usm_type", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize(
+    "shape, is_empty",
+    [
+        ((2, 2), False),
+        ((3, 2, 2), False),
+        ((0, 0), True),
+        ((0, 2, 2), True),
+    ],
+    ids=[
+        "(2, 2)",
+        "(3, 2, 2)",
+        "(0, 0)",
+        "(0, 2, 2)",
+    ],
+)
+def test_inv(shape, is_empty, usm_type):
+    if is_empty:
+        x = dp.empty(shape, dtype=dp.default_float_type(), usm_type=usm_type)
+    else:
+        count_elem = numpy.prod(shape)
+        x = dp.arange(
+            1, count_elem + 1, dtype=dp.default_float_type(), usm_type=usm_type
+        ).reshape(shape)
+
+    result = dp.linalg.inv(x)
+
+    assert x.usm_type == result.usm_type
diff --git a/tests/third_party/cupy/linalg_tests/test_solve.py b/tests/third_party/cupy/linalg_tests/test_solve.py
index 6194cf6b8ac..b31082c8e84 100644
--- a/tests/third_party/cupy/linalg_tests/test_solve.py
+++ b/tests/third_party/cupy/linalg_tests/test_solve.py
@@ -4,8 +4,13 @@
 import pytest
 
 import dpnp as cupy
-from tests.helper import has_support_aspect64
+from tests.helper import (
+    assert_dtype_allclose,
+    has_support_aspect64,
+    is_cpu_device,
+)
 from tests.third_party.cupy import testing
+from tests.third_party.cupy.testing import condition
 
 
 @testing.parameterize(
@@ -38,8 +43,8 @@ def check_x(self, a_shape, b_shape, xp, dtype):
         a_copy = a.copy()
         b_copy = b.copy()
         result = xp.linalg.solve(a, b)
-        numpy.testing.assert_array_equal(a_copy, a)
-        numpy.testing.assert_array_equal(b_copy, b)
+        testing.assert_array_equal(a_copy, a)
+        testing.assert_array_equal(b_copy, b)
         return result
 
     def test_solve(self):
@@ -88,3 +93,76 @@ def test_invalid_shape(self):
         self.check_shape((2, 3, 3), (3,), value_errors)
         self.check_shape((3, 3), (0,), value_errors)
         self.check_shape((0, 3, 4), (3,), linalg_errors)
+
+
+@testing.parameterize(
+    *testing.product(
+        {
+            "order": ["C", "F"],
+        }
+    )
+)
+class TestInv(unittest.TestCase):
+    @testing.for_dtypes("ifdFD")
+    @condition.retry(10)
+    def check_x(self, a_shape, dtype):
+        a_cpu = numpy.random.randint(0, 10, size=a_shape)
+        a_cpu = a_cpu.astype(dtype, order=self.order)
+        a_gpu = cupy.asarray(a_cpu, order=self.order)
+        a_gpu_copy = a_gpu.copy()
+        result_cpu = numpy.linalg.inv(a_cpu)
+        result_gpu = cupy.linalg.inv(a_gpu)
+
+        assert_dtype_allclose(result_gpu, result_cpu)
+        testing.assert_array_equal(a_gpu_copy, a_gpu)
+
+    def check_shape(self, a_shape):
+        a = cupy.random.rand(*a_shape)
+        with self.assertRaises(
+            (numpy.linalg.LinAlgError, cupy.linalg.LinAlgError)
+        ):
+            cupy.linalg.inv(a)
+
+    def test_inv(self):
+        self.check_x((3, 3))
+        self.check_x((4, 4))
+        self.check_x((5, 5))
+        self.check_x((2, 5, 5))
+        self.check_x((3, 4, 4))
+        self.check_x((4, 2, 3, 3))
+        self.check_x((0, 0))
+        self.check_x((3, 0, 0))
+        self.check_x((2, 0, 3, 4, 4))
+
+    def test_invalid_shape(self):
+        self.check_shape((2, 3))
+        self.check_shape((4, 1))
+        self.check_shape((4, 3, 2))
+        self.check_shape((2, 4, 3))
+        self.check_shape((2, 0))
+        self.check_shape((0, 2, 3))
+
+
+class TestInvInvalid(unittest.TestCase):
+    # TODO: remove skipif when MKLD-16626 is resolved
+    # _gesv does not raise an error with singular matrices on CPU.
+    @pytest.mark.skipif(is_cpu_device(), reason="MKLD-16626")
+    @testing.for_dtypes("ifdFD")
+    def test_inv(self, dtype):
+        for xp in (numpy, cupy):
+            a = xp.array([[1, 2], [2, 4]]).astype(dtype)
+            with pytest.raises(
+                (numpy.linalg.LinAlgError, cupy.linalg.LinAlgError)
+            ):
+                xp.linalg.inv(a)
+
+    # TODO: remove skipif when MKLD-16626 is resolved
+    # _getrf_batch does not raise an error with singular matrices.
+    @pytest.mark.skip("MKLD-16626")
+    @testing.for_dtypes("ifdFD")
+    def test_batched_inv(self, dtype):
+        for xp in (numpy, cupy):
+            a = xp.array([[[1, 2], [2, 4]]]).astype(dtype)
+            assert a.ndim >= 3  # CuPy internally uses a batched function.
+            with pytest.raises(xp.linalg.LinAlgError):
+                xp.linalg.inv(a)