Tweaks to matmul and gemm kernels

ndgrigorian · ndgrigorian · commit 39cf672709ce · 2024-01-08T10:42:11.000-08:00
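Fixes a missing `rhs_indexer` call in `GemmFunctorThreadNM` (the gemm functor that threads along the `n` and `m` dimensions): `rhs` was being read at the raw offset `g_s * b_st0 + g_j * b_st1` instead of passing that offset through the indexer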

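Fixes `matmul` broadcasting, which was calling `broadcast_to` in some unnecessary cases: operands were compared against `res_shape` rather than their own `x1_broadcast_shape` / `x2_broadcast_shape`, and the temporary buffers `buf1` / `buf2` were broadcast unconditionally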
diff --git a/dpctl/tensor/_linear_algebra_functions.py b/dpctl/tensor/_linear_algebra_functions.py
@@ -823,9 +823,9 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                     sycl_queue=exec_q,
                     order=order,
                 )
-        if x1.shape != res_shape:
+        if x1.shape != x1_broadcast_shape:
             x1 = dpt.broadcast_to(x1, x1_broadcast_shape)
-        if x2.shape != res_shape:
+        if x2.shape != x2_broadcast_shape:
             x2 = dpt.broadcast_to(x2, x2_broadcast_shape)
         ht_dot_ev, binary_ev = tli._dot(
             x1=x1,
@@ -875,9 +875,10 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                     order=order,
                 )
 
-        if x1.shape != res_shape:
+        if x1.shape != x1_broadcast_shape:
             x1 = dpt.broadcast_to(x1, x1_broadcast_shape)
-        buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
+        if buf2.shape != x2_broadcast_shape:
+            buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
         ht_dot_ev, binary_ev = tli._dot(
             x1=x1,
             x2=buf2,
@@ -929,8 +930,9 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                     order=order,
                 )
 
-        buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
-        if x2.shape != res_shape:
+        if buf1.shape != x1_broadcast_shape:
+            buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
+        if x2.shape != x2_broadcast_shape:
             x2 = dpt.broadcast_to(x2, x2_broadcast_shape)
         ht_dot_ev, binary_ev = tli._dot(
             x1=buf1,
@@ -994,8 +996,10 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                 order=order,
             )
 
-    buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
-    buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
+    if buf1.shape != x1_broadcast_shape:
+        buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
+    if buf2.shape != x2_broadcast_shape:
+        buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
     ht_, _ = tli._dot(
         x1=buf1,
         x2=buf2,
diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
@@ -533,7 +533,8 @@ class GemmFunctorThreadNM
                 size_t g_j = g_j0 + lane_id;
                 vec[lane_id] =
                     (g_j < m && g_s < k)
-                        ? static_cast<resT>(rhs[g_s * b_st0 + g_j * b_st1])
+                        ? static_cast<resT>(
+                              rhs[rhs_indexer(g_s * b_st0 + g_j * b_st1)])
                         : resT(0);
             }
 
diff --git a/dpctl/tests/test_usm_ndarray_linalg.py b/dpctl/tests/test_usm_ndarray_linalg.py
@@ -72,7 +72,7 @@ def test_matmul_simple(dtype):
     q = get_queue_or_skip()
     skip_if_dtype_not_supported(dtype, q)
 
-    n, m = 100, 17
+    n, m = 235, 17
     m1 = dpt.ones((m, n), dtype=dtype)
     m2 = dpt.ones((n, m), dtype=dtype)
 

Original file line number	Diff line number	Diff line change
`@@ -533,7 +533,8 @@ class GemmFunctorThreadNM`
`533`	`533`	`size_t g_j = g_j0 + lane_id;`
`534`	`534`	`vec[lane_id] =`
`535`	`535`	`(g_j < m && g_s < k)`
`536`		`- ? static_cast<resT>(rhs[g_s * b_st0 + g_j * b_st1])`
	`536`	`+ ? static_cast<resT>(`
	`537`	`+ rhs[rhs_indexer(g_s * b_st0 + g_j * b_st1)])`
`537`	`538`	`: resT(0);`
`538`	`539`	`}`
`539`	`540`