Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
060cf0b
Add double-allocation guard and formatting tweaks to buffer.cpp
dbsanfte Sep 23, 2025
cfa482c
Merge buffer double-allocation guard from llaminar
dbsanfte Sep 23, 2025
035da16
Refactor alignment assertions and improve error messages in memory pool
dbsanfte Sep 24, 2025
a516444
Implement move semantics and enhance copy operations in Mapper class,…
dbsanfte Sep 25, 2025
13ed177
Fix validation tolerance: use relative error instead of absolute
dbsanfte Oct 7, 2025
a3cf149
WIP: Add BFloat16 support - Phase 1 (type definition and basic GEMM)
dbsanfte Oct 19, 2025
5fb0b88
Phase 2: Add BF16 template instantiations across COSMA
dbsanfte Oct 19, 2025
1cc9f76
Phase 4: Add BF16 MPI communication support
dbsanfte Oct 19, 2025
a4ac241
Phase 5: Add Intel MKL native BF16 GEMM support
dbsanfte Oct 19, 2025
beb46d5
Phase 6: Unify bfloat16 types and add GEMM wrapper
dbsanfte Oct 19, 2025
49a3b24
Add comprehensive BFloat16 support to COSMA
dbsanfte Oct 19, 2025
fa07545
Update COSTA submodule to include BF16 support
dbsanfte Oct 19, 2025
2bee5a2
Phase 1: GPU BF16 Type System Integration (COMPLETE)
dbsanfte Oct 19, 2025
c23d986
Phase 2: Update COSMA to use Tiled-MM fork with BF16 support
dbsanfte Oct 19, 2025
063fe52
Phase 2: Add GPU-side BF16 conversion infrastructure
dbsanfte Oct 19, 2025
dc88f1e
Document GPU BF16 conversion kernel implementation
dbsanfte Oct 19, 2025
79aa22c
Phase 4: Add COSMA GPU bfloat16 template instantiation
dbsanfte Oct 19, 2025
f8ca749
Add Phase 4 completion documentation and project summary
dbsanfte Oct 19, 2025
5bf3367
Add OpenBLAS native BF16 support with CPU feature detection
dbsanfte Oct 19, 2025
b36a9a5
Add OpenBLAS native BF16 implementation summary
dbsanfte Oct 19, 2025
02f0d0f
Add Tiled-MM upstream PR summary
dbsanfte Oct 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
[submodule "libs/Tiled-MM"]
path = libs/Tiled-MM
url = https://github.com/eth-cscs/Tiled-MM.git
url = https://github.com/dbsanfte/Tiled-MM.git
branch = feature/bf16-support
[submodule "libs/COSTA"]
path = libs/COSTA
url = https://github.com/eth-cscs/COSTA
url = https://github.com/dbsanfte/COSTA
[submodule "libs/cxxopts"]
path = libs/cxxopts
url = https://github.com/jarro2783/cxxopts
59 changes: 55 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ endif ()
set(COSTA_WITH_PROFILING ${COSMA_WITH_PROFILING} CACHE INTERNAL "")
set(COSTA_SCALAPACK ${COSMA_SCALAPACK} CACHE INTERNAL "")

# Use local COSTA submodule (forked with bfloat16 support)
FetchContent_Declare(
costa
GIT_REPOSITORY https://github.com/eth-cscs/costa.git
GIT_TAG 03847e66f05ad4a1eb371b85be628e218ce46f11 # v2.2.3
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/libs/COSTA
FIND_PACKAGE_ARGS NAMES costa
)
# the joy of fetch_content. if we build costa and cosma together
Expand All @@ -114,10 +114,12 @@ FetchContent_MakeAvailable(costa)
# these are only GPU-backends
if (COSMA_GPU_BACKEND MATCHES "CUDA|ROCM")
set(TILEDMM_GPU_BACKEND ${COSMA_GPU_BACKEND} CACHE INTERNAL "")

# Use fork with BF16 support
FetchContent_Declare(
Tiled-MM
GIT_REPOSITORY https://github.com/eth-cscs/Tiled-MM.git
GIT_TAG 0eb75179e670a04c649b50ae5e91bb71b43e4d06 # v2.3.2
GIT_REPOSITORY https://github.com/dbsanfte/Tiled-MM.git
GIT_TAG feature/bf16-support # BF16 support branch
FIND_PACKAGE_ARGS NAMES tiled-MM
)
FetchContent_MakeAvailable(Tiled-MM)
Expand All @@ -134,6 +136,55 @@ if (COSMA_GPU_BACKEND MATCHES "CUDA|ROCM")
message("Tiled-mm target not found")
endif ()

# Check if GPU backend supports BFloat16.
# check_gpu_bf16_support() is expected to set COSMA_GPU_HAS_BF16_SUPPORT;
# the module must live on CMAKE_MODULE_PATH for include() to resolve it.
include(check_gpu_bf16_support)
check_gpu_bf16_support()

# Pass BF16 support flag to Tiled-MM: mirror the detection result into an
# INTERNAL cache variable and a compile definition consumers inherit.
if(COSMA_GPU_HAS_BF16_SUPPORT)
set(TILED_MM_HAS_BF16_SUPPORT ON CACHE INTERNAL "Enable BF16 support in Tiled-MM")
# NOTE(review): this mutates a target defined by the Tiled-MM subproject from
# outside its defining file, and CMake errors out if Tiled-MM::Tiled-MM is an
# ALIAS target (aliases cannot be modified) — confirm it is an imported/real
# target, or set the definition on the underlying target instead.
target_compile_definitions(Tiled-MM::Tiled-MM INTERFACE TILED_MM_HAS_BF16_SUPPORT)
message(STATUS "Tiled-MM BF16 support: ENABLED")
else()
set(TILED_MM_HAS_BF16_SUPPORT OFF CACHE INTERNAL "Enable BF16 support in Tiled-MM")
message(STATUS "Tiled-MM BF16 support: DISABLED")
endif()

# Closes the COSMA_GPU_BACKEND MATCHES "CUDA|ROCM" guard opened above.
endif()

# CPU BFloat16 Support Detection (for OpenBLAS native BF16)
# This detects if the CPU has AVX512_BF16 instructions and if OpenBLAS
# supports native BF16 GEMM operations (cblas_sbgemm)
if(COSMA_BLAS_VENDOR MATCHES "OPENBLAS")
message(STATUS "Configuring OpenBLAS with BF16 support detection...")

# Check CPU capabilities for BF16.
# check_cpu_bf16_support() is expected to set COSMA_CPU_HAS_BF16 and,
# optionally, COSMA_CPU_BF16_FLAGS (extra compiler flags used below).
include(check_cpu_bf16_support)
check_cpu_bf16_support()

# Fetch/build OpenBLAS from source with BF16 support.
# The included module is expected to set OPENBLAS_HAS_BF16_SUPPORT.
include(fetch_openblas_bf16)

# Configure COSMA with OpenBLAS BF16 capabilities: native sbgemm is only
# enabled when BOTH the CPU and the OpenBLAS build support it.
if(COSMA_CPU_HAS_BF16 AND OPENBLAS_HAS_BF16_SUPPORT)
# NOTE(review): CACHE BOOL without FORCE will not overwrite a value already
# in the cache from a previous configure — verify re-configuration after a
# hardware/toolchain change picks up the new result.
set(COSMA_OPENBLAS_HAS_BF16_NATIVE ON CACHE BOOL "OpenBLAS has native BF16 GEMM support")
# NOTE(review): assumes the `cosma` target is already defined at this point
# in the top-level CMakeLists — confirm this section runs after the target
# is created, otherwise target_compile_definitions fails at configure time.
target_compile_definitions(cosma PRIVATE COSMA_OPENBLAS_HAS_BF16_NATIVE)

# Add CPU BF16 compiler flags if needed (e.g. -mavx512bf16, as reported by
# the detection module; unquoted expansion splits the list into arguments).
if(COSMA_CPU_BF16_FLAGS)
target_compile_options(cosma PRIVATE ${COSMA_CPU_BF16_FLAGS})
endif()

message(STATUS "OpenBLAS native BF16 GEMM: ENABLED (CPU has AVX512_BF16)")
else()
set(COSMA_OPENBLAS_HAS_BF16_NATIVE OFF CACHE BOOL "OpenBLAS native BF16 support")

# Report which of the two prerequisites failed. If both failed, only the
# CPU message is printed (the elseif is never reached).
if(NOT COSMA_CPU_HAS_BF16)
message(STATUS "OpenBLAS native BF16 GEMM: DISABLED (CPU lacks AVX512_BF16)")
elseif(NOT OPENBLAS_HAS_BF16_SUPPORT)
message(STATUS "OpenBLAS native BF16 GEMM: DISABLED (OpenBLAS version too old)")
endif()
endif()
endif()

if (COSMA_WITH_PROFILING)
Expand Down
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,10 @@ The paper and other materials on COSMA are available under the following link:
## Features

- **[NEW] Multi-GPU Systems Support:** COSMA is now able to take advantage of fast GPU-to-GPU interconnects either through the use of NCCL/RCCL libraries or by using the GPU-aware MPI. Both, NVIDIA and AMD GPUs are supported.
- **[NEW] BFloat16 Support:** COSMA now supports BFloat16 (BF16) reduced precision arithmetic for AI/ML workloads, enabling memory-efficient distributed matrix multiplication with automatic precision handling.
- **ScaLAPACK API Support:** it is enough to link to COSMA, without changing the code and all `p?gemm` calls will use ScaLAPACK wrappers provided by COSMA.
- **C/Fortran Interface:** written in `C++`, but provides `C` and `Fortran` interfaces.
- **Custom Types:** fully templatized types.
- **Custom Types:** fully templatized types including support for `float`, `double`, complex types (`zfloat`, `zdouble`), and **BFloat16** (`bfloat16`).
- **GPU acceleration:** supports both **NVIDIA** and **AMD** GPUs.
- **Supported BLAS (CPU) backends:** MKL, LibSci, NETLIB, BLIS, ATLAS.
- **Custom Data Layout Support:** natively uses its own blocked data layout of matrices, but supports arbitrary grid-like data layout of matrices.
Expand Down Expand Up @@ -273,10 +274,20 @@ The overview of all supported options is given below:
step. The third parameter is an integer which defines the divisor. This
parameter can be omitted. In that case the default strategy will be used. An example of a possible value for the upper example: `--steps=sm2,pn2,pk2`.
- `-r (--n_rep)` (optional, default: `2`): the number of repetitions.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat` and `zdouble`. The last two correspond to complex numbers.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat`, `zdouble`, and `bfloat16`. The `bfloat16` type enables reduced-precision arithmetic for AI/ML workloads. Complex types are `zfloat` and `zdouble`.
- `--test` (optional): if present, the result of COSMA will be verified with the result of the available SCALAPACK.
- `-h (--help) (optional)`: print available options.

**Example: Testing BFloat16 matrix multiplication:**
```bash
# BFloat16 matrix multiplication with verification
mpirun -np 4 ./build/miniapp/cosma_miniapp -m 2000 -n 2000 -k 2000 -t bfloat16 --test -r 5

# Large-scale BFloat16 multiplication without verification (performance testing)
mpirun -np 16 ./build/miniapp/cosma_miniapp -m 10000 -n 10000 -k 10000 -t bfloat16 -r 2
```
**Note:** BFloat16 provides approximately the same dynamic range as FP32 but uses only 16 bits per element, reducing memory bandwidth requirements by 50% compared to single precision. This is particularly beneficial for large-scale distributed matrix operations in AI/ML workloads.

### COSMA pxgemm wrapper

COSMA also contains a wrapper for ScaLAPACK `pxgemm` calls which offers scalapack interface (pxgemm functions with exactly the same signatures as ScaLAPACK). Running these functions will take care of transforming the matrices between ScaLAPACK and COSMA data layout, perform the multiplication using COSMA algorithm and transform the result back to the specified ScaLAPACK data layout.
Expand Down Expand Up @@ -311,7 +322,7 @@ The overview of all supported options is given below:
- `--alpha` (optional, default: 1): alpha parameter in `C = alpha*A*B + beta*C`.
- `--beta` (optional, default: 0): beta parameter in `C = alpha*A*B + beta*C`.
- `-r (--n_rep)` (optional, default: 2): number of repetitions.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat` and `zdouble`. The last two correspond to complex numbers.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat`, `zdouble`, and `bfloat16`. The `bfloat16` type enables reduced-precision arithmetic.
- `--test` (optional): if present, the result of COSMA will be verified with the result of the available SCALAPACK.
- `--algorithm` (optional, default: `both`): defines which algorithm (`cosma`, `scalapack` or `both`) to run.
- `-h (--help) (optional)`: print available options.
Expand Down
Loading