Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
060cf0b
Add double-allocation guard and formatting tweaks to buffer.cpp
dbsanfte Sep 23, 2025
cfa482c
Merge buffer double-allocation guard from llaminar
dbsanfte Sep 23, 2025
035da16
Refactor alignment assertions and improve error messages in memory pool
dbsanfte Sep 24, 2025
a516444
Implement move semantics and enhance copy operations in Mapper class,…
dbsanfte Sep 25, 2025
13ed177
Fix validation tolerance: use relative error instead of absolute
dbsanfte Oct 7, 2025
a3cf149
WIP: Add BFloat16 support - Phase 1 (type definition and basic GEMM)
dbsanfte Oct 19, 2025
5fb0b88
Phase 2: Add BF16 template instantiations across COSMA
dbsanfte Oct 19, 2025
1cc9f76
Phase 4: Add BF16 MPI communication support
dbsanfte Oct 19, 2025
a4ac241
Phase 5: Add Intel MKL native BF16 GEMM support
dbsanfte Oct 19, 2025
beb46d5
Phase 6: Unify bfloat16 types and add GEMM wrapper
dbsanfte Oct 19, 2025
49a3b24
Add comprehensive BFloat16 support to COSMA
dbsanfte Oct 19, 2025
fa07545
Update COSTA submodule to include BF16 support
dbsanfte Oct 19, 2025
2bee5a2
Phase 1: GPU BF16 Type System Integration (COMPLETE)
dbsanfte Oct 19, 2025
c23d986
Phase 2: Update COSMA to use Tiled-MM fork with BF16 support
dbsanfte Oct 19, 2025
063fe52
Phase 2: Add GPU-side BF16 conversion infrastructure
dbsanfte Oct 19, 2025
dc88f1e
Document GPU BF16 conversion kernel implementation
dbsanfte Oct 19, 2025
79aa22c
Phase 4: Add COSMA GPU bfloat16 template instantiation
dbsanfte Oct 19, 2025
f8ca749
Add Phase 4 completion documentation and project summary
dbsanfte Oct 19, 2025
5bf3367
Add OpenBLAS native BF16 support with CPU feature detection
dbsanfte Oct 19, 2025
b36a9a5
Add OpenBLAS native BF16 implementation summary
dbsanfte Oct 19, 2025
02f0d0f
Add Tiled-MM upstream PR summary
dbsanfte Oct 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
[submodule "libs/Tiled-MM"]
path = libs/Tiled-MM
url = https://github.com/eth-cscs/Tiled-MM.git
url = https://github.com/dbsanfte/Tiled-MM.git
branch = feature/bf16-support
[submodule "libs/COSTA"]
path = libs/COSTA
url = https://github.com/eth-cscs/COSTA
url = https://github.com/dbsanfte/COSTA
[submodule "libs/cxxopts"]
path = libs/cxxopts
url = https://github.com/jarro2783/cxxopts
59 changes: 55 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ endif ()
set(COSTA_WITH_PROFILING ${COSMA_WITH_PROFILING} CACHE INTERNAL "")
set(COSTA_SCALAPACK ${COSMA_SCALAPACK} CACHE INTERNAL "")

# Use local COSTA submodule (forked with bfloat16 support)
FetchContent_Declare(
costa
GIT_REPOSITORY https://github.com/eth-cscs/costa.git
GIT_TAG 03847e66f05ad4a1eb371b85be628e218ce46f11 # v2.2.3
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/libs/COSTA
FIND_PACKAGE_ARGS NAMES costa
)
# the joy of fetch_content. if we build costa and cosma together
Expand All @@ -114,10 +114,12 @@ FetchContent_MakeAvailable(costa)
# these are only GPU-backends
if (COSMA_GPU_BACKEND MATCHES "CUDA|ROCM")
set(TILEDMM_GPU_BACKEND ${COSMA_GPU_BACKEND} CACHE INTERNAL "")

# Use fork with BF16 support
FetchContent_Declare(
Tiled-MM
GIT_REPOSITORY https://github.com/eth-cscs/Tiled-MM.git
GIT_TAG 0eb75179e670a04c649b50ae5e91bb71b43e4d06 # v2.3.2
GIT_REPOSITORY https://github.com/dbsanfte/Tiled-MM.git
GIT_TAG feature/bf16-support # BF16 support branch
FIND_PACKAGE_ARGS NAMES tiled-MM
)
FetchContent_MakeAvailable(Tiled-MM)
Expand All @@ -134,6 +136,55 @@ if (COSMA_GPU_BACKEND MATCHES "CUDA|ROCM")
message("Tiled-mm target not found")
endif ()

# Check if GPU backend supports BFloat16.
# check_gpu_bf16_support() is expected to set COSMA_GPU_HAS_BF16_SUPPORT;
# the module must live on CMAKE_MODULE_PATH for include() to resolve it.
include(check_gpu_bf16_support)
check_gpu_bf16_support()

# Pass BF16 support flag to Tiled-MM: mirror the detection result into an
# INTERNAL cache variable and a compile definition consumers inherit.
if(COSMA_GPU_HAS_BF16_SUPPORT)
set(TILED_MM_HAS_BF16_SUPPORT ON CACHE INTERNAL "Enable BF16 support in Tiled-MM")
# NOTE(review): this mutates a target defined by the Tiled-MM subproject from
# outside its defining file, and CMake errors out if Tiled-MM::Tiled-MM is an
# ALIAS target (aliases cannot be modified) — confirm it is an imported/real
# target, or set the definition on the underlying target instead.
target_compile_definitions(Tiled-MM::Tiled-MM INTERFACE TILED_MM_HAS_BF16_SUPPORT)
message(STATUS "Tiled-MM BF16 support: ENABLED")
else()
set(TILED_MM_HAS_BF16_SUPPORT OFF CACHE INTERNAL "Enable BF16 support in Tiled-MM")
message(STATUS "Tiled-MM BF16 support: DISABLED")
endif()

# Closes the COSMA_GPU_BACKEND MATCHES "CUDA|ROCM" guard opened above.
endif()

# CPU BFloat16 Support Detection (for OpenBLAS native BF16)
# This detects if the CPU has AVX512_BF16 instructions and if OpenBLAS
# supports native BF16 GEMM operations (cblas_sbgemm)
if(COSMA_BLAS_VENDOR MATCHES "OPENBLAS")
message(STATUS "Configuring OpenBLAS with BF16 support detection...")

# Check CPU capabilities for BF16.
# check_cpu_bf16_support() is expected to set COSMA_CPU_HAS_BF16 and,
# optionally, COSMA_CPU_BF16_FLAGS (extra compiler flags used below).
include(check_cpu_bf16_support)
check_cpu_bf16_support()

# Fetch/build OpenBLAS from source with BF16 support.
# The included module is expected to set OPENBLAS_HAS_BF16_SUPPORT.
include(fetch_openblas_bf16)

# Configure COSMA with OpenBLAS BF16 capabilities: native sbgemm is only
# enabled when BOTH the CPU and the OpenBLAS build support it.
if(COSMA_CPU_HAS_BF16 AND OPENBLAS_HAS_BF16_SUPPORT)
# NOTE(review): CACHE BOOL without FORCE will not overwrite a value already
# in the cache from a previous configure — verify re-configuration after a
# hardware/toolchain change picks up the new result.
set(COSMA_OPENBLAS_HAS_BF16_NATIVE ON CACHE BOOL "OpenBLAS has native BF16 GEMM support")
# NOTE(review): assumes the `cosma` target is already defined at this point
# in the top-level CMakeLists — confirm this section runs after the target
# is created, otherwise target_compile_definitions fails at configure time.
target_compile_definitions(cosma PRIVATE COSMA_OPENBLAS_HAS_BF16_NATIVE)

# Add CPU BF16 compiler flags if needed (e.g. -mavx512bf16, as reported by
# the detection module; unquoted expansion splits the list into arguments).
if(COSMA_CPU_BF16_FLAGS)
target_compile_options(cosma PRIVATE ${COSMA_CPU_BF16_FLAGS})
endif()

message(STATUS "OpenBLAS native BF16 GEMM: ENABLED (CPU has AVX512_BF16)")
else()
set(COSMA_OPENBLAS_HAS_BF16_NATIVE OFF CACHE BOOL "OpenBLAS native BF16 support")

# Report which of the two prerequisites failed. If both failed, only the
# CPU message is printed (the elseif is never reached).
if(NOT COSMA_CPU_HAS_BF16)
message(STATUS "OpenBLAS native BF16 GEMM: DISABLED (CPU lacks AVX512_BF16)")
elseif(NOT OPENBLAS_HAS_BF16_SUPPORT)
message(STATUS "OpenBLAS native BF16 GEMM: DISABLED (OpenBLAS version too old)")
endif()
endif()
endif()

if (COSMA_WITH_PROFILING)
Expand Down
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,10 @@ The paper and other materials on COSMA are available under the following link:
## Features

- **[NEW] Multi-GPU Systems Support:** COSMA is now able to take advantage of fast GPU-to-GPU interconnects either through the use of NCCL/RCCL libraries or by using the GPU-aware MPI. Both, NVIDIA and AMD GPUs are supported.
- **[NEW] BFloat16 Support:** COSMA now supports BFloat16 (BF16) reduced precision arithmetic for AI/ML workloads, enabling memory-efficient distributed matrix multiplication with automatic precision handling.
- **ScaLAPACK API Support:** it is enough to link to COSMA, without changing the code and all `p?gemm` calls will use ScaLAPACK wrappers provided by COSMA.
- **C/Fortran Interface:** written in `C++`, but provides `C` and `Fortran` interfaces.
- **Custom Types:** fully templatized types.
- **Custom Types:** fully templatized types including support for `float`, `double`, complex types (`zfloat`, `zdouble`), and **BFloat16** (`bfloat16`).
- **GPU acceleration:** supports both **NVIDIA** and **AMD** GPUs.
- **Supported BLAS (CPU) backends:** MKL, LibSci, NETLIB, BLIS, ATLAS.
- **Custom Data Layout Support:** natively uses its own blocked data layout of matrices, but supports arbitrary grid-like data layout of matrices.
Expand Down Expand Up @@ -273,10 +274,20 @@ The overview of all supported options is given below:
step. The third parameter is an integer which defines the divisor. This
parameter can be omitted. In that case the default strategy will be used. An example of a possible value for the upper example: `--steps=sm2,pn2,pk2`.
- `-r (--n_rep)` (optional, default: `2`): the number of repetitions.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat` and `zdouble`. The last two correspond to complex numbers.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat`, `zdouble`, and `bfloat16`. The `bfloat16` type enables reduced-precision arithmetic for AI/ML workloads. Complex types are `zfloat` and `zdouble`.
- `--test` (optional): if present, the result of COSMA will be verified with the result of the available SCALAPACK.
- `-h (--help) (optional)`: print available options.

**Example: Testing BFloat16 matrix multiplication:**
```bash
# BFloat16 matrix multiplication with verification
mpirun -np 4 ./build/miniapp/cosma_miniapp -m 2000 -n 2000 -k 2000 -t bfloat16 --test -r 5

# Large-scale BFloat16 multiplication without verification (performance testing)
mpirun -np 16 ./build/miniapp/cosma_miniapp -m 10000 -n 10000 -k 10000 -t bfloat16 -r 2
```
**Note:** BFloat16 provides approximately the same dynamic range as FP32 but uses only 16 bits per element, reducing memory bandwidth requirements by 50% compared to single precision. This is particularly beneficial for large-scale distributed matrix operations in AI/ML workloads.

### COSMA pxgemm wrapper

COSMA also contains a wrapper for ScaLAPACK `pxgemm` calls which offers scalapack interface (pxgemm functions with exactly the same signatures as ScaLAPACK). Running these functions will take care of transforming the matrices between ScaLAPACK and COSMA data layout, perform the multiplication using COSMA algorithm and transform the result back to the specified ScaLAPACK data layout.
Expand Down Expand Up @@ -311,7 +322,7 @@ The overview of all supported options is given below:
- `--alpha` (optional, default: 1): alpha parameter in `C = alpha*A*B + beta*C`.
- `--beta` (optional, default: 0): beta parameter in `C = alpha*A*B + beta*C`.
- `-r (--n_rep)` (optional, default: 2): number of repetitions.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat` and `zdouble`. The last two correspond to complex numbers.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat`, `zdouble`, and `bfloat16`. The `bfloat16` type enables reduced-precision arithmetic.
- `--test` (optional): if present, the result of COSMA will be verified with the result of the available SCALAPACK.
- `--algorithm` (optional, default: `both`): defines which algorithm (`cosma`, `scalapack` or `both`) to run.
- `-h (--help) (optional)`: print available options.
Expand Down
Loading