Skip to content

Bgemm for arm64 #5287

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 9 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ In chronological order:
* Ye Tao <ye.tao@arm.com>
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1
* [2025-02-27] Add sbgemv_n_neon kernel
* [2025-05-17] Impl prototype of BGEMM inferface

* Abhishek Kumar <https://github.com/abhishek-iitmadras>
* [2025-04-22] Optimise dot kernel for NEOVERSE V1
* [2025-04-22] Optimise dot kernel for NEOVERSE V1
6 changes: 5 additions & 1 deletion Makefile.system
Original file line number Diff line number Diff line change
Expand Up @@ -1544,6 +1544,9 @@ ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS
endif

ifeq ($(BUILD_BFLOAT16_ONLY), 1)
CCOMMON_OPT += -DBUILD_BFLOAT16_ONLY
endif
ifeq ($(BUILD_BFLOAT16), 1)
CCOMMON_OPT += -DBUILD_BFLOAT16
endif
Expand Down Expand Up @@ -1888,6 +1891,7 @@ export FUNCTION_PROFILE
export TARGET_CORE
export NO_AVX512
export NO_AVX2
export BUILD_BFLOAT16_ONLY
export BUILD_BFLOAT16
export NO_LSX
export NO_LASX
Expand All @@ -1912,7 +1916,7 @@ export ZGEMM3M_UNROLL_M
export ZGEMM3M_UNROLL_N
export XGEMM3M_UNROLL_M
export XGEMM3M_UNROLL_N

# Todo: add bgemm unroll factors

ifdef USE_CUDA
export CUDADIR
Expand Down
4 changes: 3 additions & 1 deletion Makefile.tail
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX))

HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX))

BLASOBJS = $(SBEXTOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS)
BLASOBJS = $(SBEXTOBJS) $(BBLASOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS)
BLASOBJS_P = $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P)

ifdef EXPRECISION
Expand All @@ -24,6 +24,7 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
endif

$(BBLASOBJS) : override CFLAGS += -DBFLOAT16_ONLY -UDOUBLE -UCOMPLEX -UBFLOAT16 -USMALL_MATRIX_OPT
$(SBBLASOBJS) $(SBBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
Expand All @@ -42,6 +43,7 @@ $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(SBEXTOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)


libs :: $(BLASOBJS) $(COMMONOBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^

Expand Down
31 changes: 31 additions & 0 deletions cblas.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,31 @@
/***************************************************************************
* Copyright (c) 2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#ifndef CBLAS_H
#define CBLAS_H

Expand Down Expand Up @@ -446,6 +474,9 @@ void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum C
void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

void cblas_bgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, bfloat16 *C, OPENBLAS_CONST blasint ldc);

#ifdef __cplusplus
}
#endif /* __cplusplus */
Expand Down
8 changes: 8 additions & 0 deletions common.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
Expand Down Expand Up @@ -306,6 +307,13 @@ typedef int blasint;
#define SIZE 8
#define BASE_SHIFT 3
#define ZBASE_SHIFT 4
#elif defined(BFLOAT16_ONLY)
Copy link
Contributor

@annop-w annop-w May 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think we need to introduce BFLOAT16_ONLY build flag.
The type of FLOAT is the only difference. Can we simplyl use XFLOAT instead of FLOAT for C matrix, for example, in kernel/arm64/bgemm_beta.c ?

#define IFLOAT bfloat16
#define XFLOAT IFLOAT
#define FLOAT bfloat16
#define SIZE 2
#define BASE_SHIFT 1
#define ZBASE_SHIFT 2
#elif defined(BFLOAT16)
#define IFLOAT bfloat16
#define XFLOAT IFLOAT
Expand Down
86 changes: 86 additions & 0 deletions common_b.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/***************************************************************************
* Copyright (c) 2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#ifndef COMMON_B_H
#define COMMON_B_H

// for now, only support DYNAMIC_ARCH = 0 case.
#ifndef DYNAMIC_ARCH
#define BGEMM_ONCOPY bgemm_oncopy
#define BGEMM_OTCOPY bgemm_otcopy
#define BGEMM_INCOPY bgemm_incopy
#define BGEMM_ITCOPY bgemm_itcopy

#define BGEMM_BETA bgemm_beta
#define BGEMM_KERNEL bgemm_kernel

#else

#define BGEMM_ONCOPY gotoblas -> bgemm_oncopy
#define BGEMM_OTCOPY gotoblas -> bgemm_otcopy
#define BGEMM_INCOPY gotoblas -> bgemm_incopy
#define BGEMM_ITCOPY gotoblas -> bgemm_itcopy
#define BGEMM_BETA gotoblas -> bgemm_beta
#define BGEMM_KERNEL gotoblas -> bgemm_kernel

#endif

#define BGEMM_NN bgemm_nn
#define BGEMM_CN bgemm_tn
#define BGEMM_TN bgemm_tn
#define BGEMM_NC bgemm_nt
#define BGEMM_NT bgemm_nt
#define BGEMM_CC bgemm_tt
#define BGEMM_CT bgemm_tt
#define BGEMM_TC bgemm_tt
#define BGEMM_TT bgemm_tt
#define BGEMM_NR bgemm_nn
#define BGEMM_TR bgemm_tn
#define BGEMM_CR bgemm_tn
#define BGEMM_RN bgemm_nn
#define BGEMM_RT bgemm_nt
#define BGEMM_RC bgemm_nt
#define BGEMM_RR bgemm_nn

#define BGEMM_THREAD_NN bgemm_thread_nn
#define BGEMM_THREAD_CN bgemm_thread_tn
#define BGEMM_THREAD_TN bgemm_thread_tn
#define BGEMM_THREAD_NC bgemm_thread_nt
#define BGEMM_THREAD_NT bgemm_thread_nt
#define BGEMM_THREAD_CC bgemm_thread_tt
#define BGEMM_THREAD_CT bgemm_thread_tt
#define BGEMM_THREAD_TC bgemm_thread_tt
#define BGEMM_THREAD_TT bgemm_thread_tt
#define BGEMM_THREAD_NR bgemm_thread_nn
#define BGEMM_THREAD_TR bgemm_thread_tn
#define BGEMM_THREAD_CR bgemm_thread_tn
#define BGEMM_THREAD_RN bgemm_thread_nn
#define BGEMM_THREAD_RT bgemm_thread_nt
#define BGEMM_THREAD_RC bgemm_thread_nt
#define BGEMM_THREAD_RR bgemm_thread_nn
#endif
4 changes: 3 additions & 1 deletion common_interface.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
Expand Down Expand Up @@ -480,7 +481,8 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

/* Level 3 routines */

void BLASFUNC(bgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
bfloat16 *, blasint *, bfloat16 *, blasint *, float *, bfloat16 *, blasint *);
void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
Expand Down
21 changes: 20 additions & 1 deletion common_level3.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K,

int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);


int bgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
Expand All @@ -78,6 +79,12 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
#endif

// add bgemm copy functions
int bgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int bgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int bgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int bgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);

int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
Expand Down Expand Up @@ -505,6 +512,8 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl
int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);

// add bgemm kernel
int bgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
Expand Down Expand Up @@ -657,6 +666,11 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float
int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);

int bgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);

int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
Expand Down Expand Up @@ -754,6 +768,11 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON
int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
#endif

int bgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);

int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
Expand Down
51 changes: 50 additions & 1 deletion common_macro.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
Expand Down Expand Up @@ -39,6 +40,7 @@
#ifndef COMMON_MACRO
#define COMMON_MACRO

#include "common_b.h"
#include "common_sb.h"
#include "common_s.h"
#include "common_d.h"
Expand Down Expand Up @@ -657,8 +659,52 @@
#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT

#elif defined(BFLOAT16)
#elif defined(BFLOAT16_ONLY)
#define GEMM_BETA BGEMM_BETA
#define GEMM_KERNEL_N BGEMM_KERNEL
#define GEMM_KERNEL_L BGEMM_KERNEL
#define GEMM_KERNEL_R BGEMM_KERNEL
#define GEMM_KERNEL_B BGEMM_KERNEL

#define GEMM_NN BGEMM_NN
#define GEMM_CN BGEMM_TN
#define GEMM_TN BGEMM_TN
#define GEMM_NC BGEMM_NT
#define GEMM_NT BGEMM_NT
#define GEMM_CC BGEMM_TT
#define GEMM_CT BGEMM_TT
#define GEMM_TC BGEMM_TT
#define GEMM_TT BGEMM_TT
#define GEMM_NR BGEMM_NN
#define GEMM_TR BGEMM_TN
#define GEMM_CR BGEMM_TN
#define GEMM_RN BGEMM_NN
#define GEMM_RT BGEMM_NT
#define GEMM_RC BGEMM_NT
#define GEMM_RR BGEMM_NN
#define GEMM_ONCOPY BGEMM_ONCOPY
#define GEMM_OTCOPY BGEMM_OTCOPY
#define GEMM_INCOPY BGEMM_INCOPY
#define GEMM_ITCOPY BGEMM_ITCOPY

#define GEMM_THREAD_NN BGEMM_THREAD_NN
#define GEMM_THREAD_CN BGEMM_THREAD_TN
#define GEMM_THREAD_TN BGEMM_THREAD_TN
#define GEMM_THREAD_NC BGEMM_THREAD_NT
#define GEMM_THREAD_NT BGEMM_THREAD_NT
#define GEMM_THREAD_CC BGEMM_THREAD_TT
#define GEMM_THREAD_CT BGEMM_THREAD_TT
#define GEMM_THREAD_TC BGEMM_THREAD_TT
#define GEMM_THREAD_TT BGEMM_THREAD_TT
#define GEMM_THREAD_NR BGEMM_THREAD_NN
#define GEMM_THREAD_TR BGEMM_THREAD_TN
#define GEMM_THREAD_CR BGEMM_THREAD_TN
#define GEMM_THREAD_RN BGEMM_THREAD_NN
#define GEMM_THREAD_RT BGEMM_THREAD_NT
#define GEMM_THREAD_RC BGEMM_THREAD_NT
#define GEMM_THREAD_RR BGEMM_THREAD_NN

#elif defined(BFLOAT16)
#define D_TO_BF16_K SBDTOBF16_K
#define D_BF16_TO_K DBF16TOD_K
#define S_TO_BF16_K SBSTOBF16_K
Expand Down Expand Up @@ -2618,6 +2664,9 @@
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) || defined(ARCH_ALPHA))
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG bgemm_p;
extern BLASLONG bgemm_q;
extern BLASLONG bgemm_r;
extern BLASLONG sbgemm_p;
extern BLASLONG sbgemm_q;
extern BLASLONG sbgemm_r;
Expand Down
Loading
Loading