Merge branch 'develop' into Threading_Callback

OpenMathLib · Apr 21, 2024 · ddcd7d6 · ddcd7d6
2 parents 7102367 + de465ff
commit ddcd7d6
Show file tree

Hide file tree

Showing 2,261 changed files with 9,629 additions and 37,262 deletions.
diff --git a/.cirrus.yml b/.cirrus.yml
@@ -156,7 +156,7 @@ FreeBSD_task:
     image_family: freebsd-13-2
   install_script:
   - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc 
-  - ln -s /usr/local/lib/gcc12/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
+  - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
   compile_script:
   - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1
 

diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml
@@ -84,6 +84,7 @@ jobs:
         run: |
           export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
           qemu-riscv64 ./utest/openblas_utest
+          qemu-riscv64 ./utest/openblas_utest_ext
           OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
           OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
           OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -0,0 +1,24 @@
+name: Publish docs via GitHub Pages
+on:
+  push:
+    branches:
+      - develop
+jobs:
+  build:
+    name: Deploy docs
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: "3.10"
+      - run: pip install mkdocs mkdocs-material
+      # mkdocs gh-deploy command only builds to the top-level, hence building then deploying ourselves
+      - run: mkdocs build
+      - name: Deploy docs
+        uses: peaceiris/actions-gh-pages@v3
+        if: ${{ github.ref == 'refs/heads/develop' }}
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./site
+          destination_dir: docs/
diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
@@ -77,6 +77,7 @@ jobs:
       - name: Test
         run: |
           qemu-loongarch64-static ./utest/openblas_utest
+          qemu-loongarch64-static ./utest/openblas_utest_ext
           OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
           OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
           OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1

diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml
@@ -80,6 +80,7 @@ jobs:
         run: |
           export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
           qemu-mips64el ./utest/openblas_utest
+          qemu-mips64el ./utest/openblas_utest_ext
           OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1
           OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1
           OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1

diff --git a/.gitignore b/.gitignore
@@ -51,43 +51,55 @@ utest/openblas_utest_ext
 ctest/xccblat1
 ctest/xccblat2
 ctest/xccblat3
+ctest/xccblat3_3m
 ctest/xdcblat1
 ctest/xdcblat2
 ctest/xdcblat3
+ctest/xdcblat3_3m
 ctest/xscblat1
 ctest/xscblat2
 ctest/xscblat3
+ctest/xscblat3_3m
 ctest/xzcblat1
 ctest/xzcblat2
 ctest/xzcblat3
+ctest/xzcblat3_3m
 exports/linktest.c
 exports/linux.def
 kernel/setparam_*.c
 kernel/kernel_*.h
 test/CBLAT2.SUMM
 test/CBLAT3.SUMM
+test/CBLAT3_3M.SUMM
 test/DBLAT2.SUMM
 test/DBLAT3.SUMM
+test/DBLAT3_3M.SUMM
 test/SBLAT2.SUMM
 test/SBLAT3.SUMM
+test/SBLAT3_3M.SUMM
 test/ZBLAT2.SUMM
 test/ZBLAT3.SUMM
+test/ZBLAT3_3M.SUMM
 test/SHBLAT3.SUMM
 test/SBBLAT3.SUMM
 test/cblat1
 test/cblat2
 test/cblat3
+test/cblat3_3m
 test/dblat1
 test/dblat2
 test/dblat3
+test/dblat3_3m
 test/sblat1
 test/sblat2
 test/sblat3
+test/sblat3_3m
 test/test_shgemm
 test/test_sbgemm
 test/zblat1
 test/zblat2
 test/zblat3
+test/zblat3_3m
 build
 build.*
 *.swp

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 26.dev)
+set(OpenBLAS_PATCH_VERSION 27.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 

diff --git a/Changelog.txt b/Changelog.txt
@@ -1,4 +1,104 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.27
+ 4-Apr-2024
+
+general:
+- added initial (generic) support for the CSKY architecture
+- capped the maximum number of threads used in GEMM, GETRF and POTRF to avoid creating
+  underutilized or idle threads
+- sped up multithreaded POTRF on all platforms
+- added extension openblas_set_num_threads_local() that returns the previous thread count
+- re-evaluated the SGEMV and DGEMV load thresholds to avoid activating multithreading 
+  for too small workloads
+- improved the fallback code used when the precompiled number of threads is exceeded,
+  and made it callable multiple times during the lifetime of an instance
+- added CBLAS interfaces for the BLAS extensions ?AMIN, ?AMAX, CAXPYC and ZAXPYC
+- fixed a potential buffer overflow in the interface to the GEMMT kernels
+- fixed use of incompatible pointer types in GEMMT and C/ZAXPBY as flagged by GCC-14
+- fixed unwanted case sensitivity of the character parameters in ?TRTRS
+- sped up the OpenMP thread management code
+- fixed sizing of logical variables in INTERFACE64 builds of the C version of LAPACK
+- fixed inclusion of new LAPACK and LAPACKE functions from LAPACK 3.11 in the shared library
+- added a testsuite for the BLAS extensions
+- modified the error thresholds for SGS/DGS functions in the LAPACK testsuite to suppress
+  spurious errors
+- added support for building the benchmark collection with CMAKE
+- added rewriting of linker options to avoid linking both libgomp and libomp in CMAKE builds
+  with OpenMP enabled that use clang with gfortran
+- fixed building on systems with uClibc
+- added support for calling ?NRM2 with a negative increment value on all architectures
+- added support for the LLVM18 version of the flang-new compiler
+- fixed handling of the OPENBLAS_LOOPS variable in several benchmarks
+- integrated fixes from the Reference-LAPACK project:
+  - Increased accuracy in C/ZLARFGP (Reference-LAPACK PR 981)
+
+x86:
+- fixed handling of NaN and Inf arguments in ZSCAL
+- fixed GEMM3M functions failing in CMAKE builds
+
+x86-64:
+- removed all instances of sched_yield() on Linux and BSD
+- fixed a potential deadlock in the thread server on MSWindows (introduced in 0.3.26)
+- fixed GEMM3M functions failing in CMAKE builds
+- fixed handling of NaN and Inf arguments in ZSCAL
+- added compiler checks for AVX512BF16 compatibility
+- fixed LLVM compiler options for Sapphire Rapids 
+- fixed cpu handling fallbacks for Sapphire Rapids with
+  disabled AVX2 in DYNAMIC_ARCH mode
+- fixed extensions SCSUM and DZSUM
+- improved GEMM performance for ZEN targets
+
+arm:
+- fixed handling of NaN and Inf arguments in ZSCAL
+
+arm64:
+- added initial support for the Cortex-A76 cpu
+- fixed handling of NaN and Inf arguments in ZSCAL
+- fixed default compiler options for gcc (-march and -mtune)
+- added support for ArmCompilerForLinux
+- added support for the NeoverseV2 cpu in DYNAMIC_ARCH builds
+- fixed mishandling of the INTERFACE64 option in CMAKE builds
+- corrected SCSUM kernels (erroneously duplicating SCASUM behaviour)  
+- added SVE-enabled kernels for CSUM/ZSUM
+- worked around an inaccuracy in the NRM2 kernels for NeoverseN1 and Apple M
+
+power:
+- improved performance of SGEMM on POWER8/9/10
+- improved performance of DGEMM on POWER10
+- added support for OpenMP builds with xlc/xlf on AIX
+- improved cpu autodetection for DYNAMIC_ARCH builds on older AIX
+- fixed cpu core counting on AIX
+- added support for building a shared library on AIX
+
+riscv64:
+- added support for the X280 cpu
+- added support for semi-generic RISCV models with vector length 128 or 256
+- added support for compiling with either RVV 0.7.1 or RVV 1.0 standard compilers
+- fixed handling of NaN and Inf arguments in ZSCAL
+- improved cpu model autodetection
+- fixed corner cases in ?AXPBY for C910V
+- fixed handling of zero increments in ?AXPY kernels for C910V
+
+loongarch64:
+- added optimized kernels for ?AMIN and ?AMAX
+- fixed handling of NaN and Inf arguments in ZSCAL
+- fixed handling of corner cases in ?AXPBY
+- fixed computation of SAMIN and DAMIN in LSX mode
+- fixed computation of ?ROT
+- added optimized SSYMV and DSYMV kernels for LSX and LASX mode
+- added optimized CGEMM and ZGEMM kernels for LSX and LASX mode
+- added optimized CGEMV and ZGEMV kernels
+
+mips:
+- fixed utilizing MSA on P5600 and related cpus (broken in 0.3.22)
+- fixed handling of NaN and Inf arguments in ZSCAL
+- fixed mishandling of the INTERFACE64 option in CMAKE builds
+
+zarch:
+- fixed handling of NaN and Inf arguments in ZSCAL
+- fixed calculation of ?SUM on Z13
+
 ====================================================================
 Version 0.3.26
  2-Jan-2024

diff --git a/Makefile.arm64 b/Makefile.arm64
@@ -58,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
 endif
 endif
 
+ifeq ($(CORE), CORTEXA76)
+CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
+endif
+endif
+
 ifeq ($(CORE), FT2000)
 CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
@@ -138,13 +145,13 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
 ifneq ($(OSNAME), Darwin)
 CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 else
-CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve+bf16 -mtune=cortex-a72
 endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 endif
 else
-CCOMMON_OPT += -march=armv8.5-a+sve
+CCOMMON_OPT += -march=armv8.5-a+sve+bf16
 ifneq ($(CROSS), 1)
 CCOMMON_OPT += -mtune=native
 endif
@@ -156,13 +163,13 @@ endif
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve+bf16 -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8-a+sve+bf16 -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
 endif

diff --git a/Makefile.rule b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.26.dev
+VERSION = 0.3.27.dev
 
 # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
 # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@@ -173,6 +173,10 @@ NO_AFFINITY = 1
 # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
 # BIGNUMA = 1
 
+# If you are compiling for an embedded system ("bare metal") such as the Cortex-M series
+# Note that you will have to provide implementations of malloc() and free() in this case
+# EMBEDDED = 1
+
 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
 # and OS. However, the performance is low.
 # NO_AVX = 1

diff --git a/Makefile.system b/Makefile.system
@@ -811,8 +811,12 @@ ifeq ($(ARCH), arm)
 NO_BINARY_MODE  = 1
 BINARY_DEFINED  = 1
 
+ifneq ($(EMBEDDED), 1)
 CCOMMON_OPT += -marm
 FCOMMON_OPT += -marm
+else
+CCOMMON_OPT += -DOS_EMBEDDED -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16
+endif
 
 # If softfp abi is mentioned on the command line, force it.
 ifeq ($(ARM_SOFTFP_ABI), 1)
@@ -1195,9 +1199,6 @@ endif
 else
 FCOMMON_OPT += -q32
 endif
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -openmp
-endif
 endif
 
 ifeq ($(F_COMPILER), PGI)

diff --git a/README.md b/README.md
@@ -167,6 +167,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **Cortex A57**: Optimized Level-3 and Level-2 functions
 - **Cortex A72**: same as A57 ( different cpu specifications)
 - **Cortex A73**: same as A57 (different cpu specifications)
+- **Cortex A76**: same as A57 (different cpu specifications)
 - **Falkor**: same as A57 (different cpu specifications)
 - **ThunderX**: Optimized some Level-1 functions
 - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
@@ -187,7 +188,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 
 - **AIX**: Dynamic architecture with OpenXL and OpenMP.
   ```sh
-  make CC=ibm-clang_r FC=xlf TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1
+  make CC=ibm-clang_r FC=xlf_r TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1
   ```
 
 #### IBM zEnterprise System
@@ -211,7 +212,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
 e.g.:
   ```sh
-make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
+    make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
     BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
     AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
     LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \

diff --git a/TargetList.txt b/TargetList.txt
@@ -93,6 +93,7 @@ CORTEXA53
 CORTEXA57
 CORTEXA72
 CORTEXA73
+CORTEXA76
 CORTEXA510
 CORTEXA710
 CORTEXX1

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
@@ -932,7 +932,7 @@ endif ()
     set(ZGEMM_UNROLL_M 4)
     set(ZGEMM_UNROLL_N 4)
     set(SYMV_P 16)
-  elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
+  elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73" OR "${TCORE}" STREQUAL "CORTEXA76")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_CODE_SIZE\t49152\n"
       "#define L1_CODE_LINESIZE\t64\n"