Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] rvv intrinsic 1.0+ #5642

Open
wants to merge 32 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/android.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ on:
- 'src/*'
- 'src/layer/*'
- 'src/layer/arm/**'
- 'src/layer/riscv/**'
- 'src/layer/x86/**'
- 'src/layer/vulkan/**'
pull_request:
Expand All @@ -20,6 +21,7 @@ on:
- 'src/*'
- 'src/layer/*'
- 'src/layer/arm/**'
- 'src/layer/riscv/**'
- 'src/layer/x86/**'
- 'src/layer/vulkan/**'
concurrency:
Expand Down Expand Up @@ -64,6 +66,11 @@ jobs:
mkdir build-x86_64 && cd build-x86_64
cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" ..
cmake --build . -j $(nproc)
- name: riscv64
run: |
mkdir build-riscv64 && cd build-riscv64
cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" ..
cmake --build . -j $(nproc)

- name: armeabi-v7a-shared
run: |
Expand All @@ -85,6 +92,11 @@ jobs:
mkdir build-x86_64-shared && cd build-x86_64-shared
cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" -DNCNN_SHARED_LIB=ON ..
cmake --build . -j $(nproc)
- name: riscv64-shared
run: |
mkdir build-riscv64-shared && cd build-riscv64-shared
cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" -DNCNN_SHARED_LIB=ON ..
cmake --build . -j $(nproc)

ndk-r16b:
runs-on: ubuntu-latest
Expand Down
63 changes: 38 additions & 25 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -420,40 +420,53 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")

if(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; float _v; size_t vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV)
check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m8_t _s, _w; float _v; size_t vl; _s = __riscv_vfmacc_vf_f32m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_V)

set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh")
check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZFH)
set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)

if(NOT NCNN_COMPILER_SUPPORT_RVV_ZFH)
set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16")
check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZVFH)
endif()
set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)

unset(CMAKE_REQUIRED_FLAGS)

if(NCNN_COMPILER_SUPPORT_RVV)
if(NCNN_COMPILER_SUPPORT_RISCV_V OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
option(NCNN_RVV "optimize risc-v platform with v extension" ON)
option(NCNN_RVV_CHECK_VFREDSUM "check compilter about support rvv-intrinsic" ON)
if(NCNN_RVV_CHECK_VFREDSUM)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/ncnn_check_rvv_vfredusum.cmake)
endif()
if(NOT (NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH))
message(WARNING "The compiler does not support risc-v zfh extension. Upgrading your toolchain is strongly recommended.")
endif()
option(NCNN_RVV_CHECK_PLAIN_SEGMENT "check compilter about rvv segment load/store interface" ON)
if(NCNN_RVV_CHECK_PLAIN_SEGMENT)
set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; size_t vl; float src[32]={.0f}; vlseg2e32_v_f32m1(&_s, &_w, src, vl); return 0; }" NCNN_COMPILER_USE_RVV_PLAIN_SEG)
unset(CMAKE_REQUIRED_FLAGS)
endif()
if(NOT NCNN_COMPILER_USE_RVV_PLAIN_SEG)
message(WARNING "The compiler uses tuple types for segment load/store. Upgrading your toolchain is strongly recommended.")
add_definitions(-D__rvv_tuple)
if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
if(NCNN_RVV)
option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON)
endif()
else()
message(WARNING "The compiler does not support zvfh extension. NCNN_ZVFH will be OFF.")
endif()

# option(NCNN_RVV_CHECK_VFREDSUM "check compilter about support rvv-intrinsic" ON)
# if(NCNN_RVV_CHECK_VFREDSUM)
# include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/ncnn_check_rvv_vfredusum.cmake)
# endif()
# if(NOT (NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH))
# message(WARNING "The compiler does not support risc-v zfh extension. Upgrading your toolchain is strongly recommended.")
# endif()
# option(NCNN_RVV_CHECK_PLAIN_SEGMENT "check compilter about rvv segment load/store interface" ON)
# if(NCNN_RVV_CHECK_PLAIN_SEGMENT)
# set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
# check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; size_t vl; float src[32]={.0f}; vlseg2e32_v_f32m1(&_s, &_w, src, vl); return 0; }" NCNN_COMPILER_USE_RVV_PLAIN_SEG)
# unset(CMAKE_REQUIRED_FLAGS)
# endif()
# if(NOT NCNN_COMPILER_USE_RVV_PLAIN_SEG)
# message(WARNING "The compiler uses tuple types for segment load/store. Upgrading your toolchain is strongly recommended.")
# add_definitions(-D__rvv_tuple)
# endif()
else()
message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.")
endif()

if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON)
else()
message(WARNING "The compiler does not support risc-v v extension. NCNN_RVV will be OFF.")
message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.")
endif()

endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
set(NCNN_TARGET_ARCH powerpc)
Expand Down
10 changes: 9 additions & 1 deletion build-android.cmd
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
:: Set android ndk root
@ECHO OFF
@SETLOCAL
@SET ANDROID_NDK=<your-ndk-root_path, such as"E:\android-ndk-r18b">
@SET ANDROID_NDK=<your-ndk-root_path, such as"E:\android-ndk-r27">

:: Set ninja.exe
:: @SET NINJA_EXE=<your-ninja-exe_path, such as"D:\android\sdk\cmake\3.10.2.4988404\bin\ninja.exe">
Expand Down Expand Up @@ -38,4 +38,12 @@ cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android riscv64
mkdir build-android-riscv64
pushd build-android-riscv64
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="riscv64" -DANDROID_PLATFORM=android-35 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

@ENDLOCAL
8 changes: 8 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ make -j4
make install
popd

##### android riscv64
mkdir -p build-android-riscv64
pushd build-android-riscv64
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="riscv64" -DANDROID_PLATFORM=android-35 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### linux of hisiv300 (forgot the chip name) toolchain with neon and openmp
mkdir -p build-hisiv300-linux
pushd build-hisiv300-linux
Expand Down
14 changes: 7 additions & 7 deletions cmake/ncnn_add_layer.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -364,13 +364,13 @@ macro(ncnn_add_layer class)

if(NCNN_TARGET_ARCH STREQUAL "riscv" AND CMAKE_SIZEOF_VOID_P EQUAL 8)
if(NCNN_RUNTIME_CPU AND NCNN_RVV)
if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh")
elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH)
ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16")
elseif(NCNN_COMPILER_SUPPORT_RVV)
ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv")
endif()
ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv")
endif()
if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
endif()
if(NCNN_ZVFH)
ncnn_add_arch_opt_source(${class} zvfh "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
endif()
endif()

Expand Down
14 changes: 14 additions & 0 deletions cmake/ncnn_generate_xtheadvector_source.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

# Generate a *_riscv_xtheadvector source variant from the plain riscv one.
#
# Invoked in cmake script mode; the caller must define on the command line:
#   SRC   - path of the original riscv source file to read
#   DST   - path of the generated xtheadvector source file to write
#   CLASS - layer class name (e.g. AbsVal)

file(READ ${SRC} source_data)

# derive the identifier casings used inside the source
string(TOUPPER ${CLASS} CLASS_UPPER)
string(TOLOWER ${CLASS} CLASS_LOWER)

# Use literal string(REPLACE) instead of REGEX REPLACE: every pattern here is
# plain text, and a regex would treat the '.' in "_riscv.h" as a wildcard and
# would misbehave if a class name ever contained a regex metacharacter.
string(REPLACE "LAYER_${CLASS_UPPER}_RISCV_H" "LAYER_${CLASS_UPPER}_RISCV_XTHEADVECTOR_H" source_data "${source_data}")
string(REPLACE "${CLASS}_riscv" "${CLASS}_riscv_xtheadvector" source_data "${source_data}")
string(REPLACE "#include \"${CLASS_LOWER}_riscv.h\"" "#include \"${CLASS_LOWER}_riscv_xtheadvector.h\"" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")
16 changes: 9 additions & 7 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -630,16 +630,18 @@ if(NCNN_TARGET_ARCH STREQUAL "loongarch")
endif()
endif()

if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906)
if(NCNN_TARGET_ARCH STREQUAL "riscv" AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT C906)
if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV)
if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh)
elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH)
target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16)
elseif(NCNN_COMPILER_SUPPORT_RVV)
target_compile_options(ncnn PRIVATE -march=rv64gcv)
set(RISCV_MARCH_FLAG "-march=rv64gcv")
if(NCNN_ZVFH)
set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zfh_zvfh")
target_compile_options(ncnn PRIVATE -D__fp16=_Float16)
endif()
elseif(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH)
set(RISCV_MARCH_FLAG "-march=rv64gc_zfh_xtheadvector")
target_compile_options(ncnn PRIVATE -D__fp16=_Float16)
endif()
target_compile_options(ncnn PRIVATE ${RISCV_MARCH_FLAG})
endif()

if(NCNN_PPC64LE_VSX)
Expand Down
32 changes: 32 additions & 0 deletions src/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2564,6 +2564,38 @@ int cpu_support_riscv_zfh()
#endif
}

// Returns non-zero when the runtime cpu is believed to support the riscv
// zvfh extension (vector half-precision float), zero otherwise.
// NOTE(review): this is an approximation — only the V and F hwcap bits are
// probed, and V + F does not strictly imply zvfh (see linked upstream issue).
int cpu_support_riscv_zvfh()
{
    try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __riscv
    // v + f does not imply zfh, but how to discover zvfh properly ?
    // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
    // '&' binds tighter than '&&', so both hwcap bits are tested here.
    return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F;
#else
    // not compiling for riscv: the hwcap bits are meaningless
    return 0;
#endif
#else
    // hwcap probing is only implemented for android/linux
    return 0;
#endif
}

// Returns non-zero when the runtime cpu is believed to support the T-Head
// xtheadvector extension, zero otherwise.
// NOTE(review): body and comment are copied verbatim from
// cpu_support_riscv_zvfh() — it probes the standard V and F hwcap bits.
// Whether kernels expose those bits for xtheadvector-only (pre-ratification
// vector) cores is unclear; TODO confirm this actually detects xtheadvector.
int cpu_support_riscv_xtheadvector()
{
    try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __riscv
    // v + f does not imply zfh, but how to discover zvfh properly ?
    // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
    return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F;
#else
    // not compiling for riscv: the hwcap bits are meaningless
    return 0;
#endif
#else
    // hwcap probing is only implemented for android/linux
    return 0;
#endif
}

int cpu_riscv_vlenb()
{
try_initialize_global_cpu_info();
Expand Down
4 changes: 4 additions & 0 deletions src/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ NCNN_EXPORT int cpu_support_loongson_mmi();
NCNN_EXPORT int cpu_support_riscv_v();
// zfh = riscv half-precision float
NCNN_EXPORT int cpu_support_riscv_zfh();
// zvfh = riscv vector half-precision float
NCNN_EXPORT int cpu_support_riscv_zvfh();
// xtheadvector = riscv xtheadvector
NCNN_EXPORT int cpu_support_riscv_xtheadvector();
// vlenb = riscv vector length in bytes
NCNN_EXPORT int cpu_riscv_vlenb();

Expand Down
7 changes: 7 additions & 0 deletions src/layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,13 @@ Layer* create_layer_cpu(int index)
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_RVV
#if NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
if (ncnn::cpu_support_riscv_xtheadvector())
{
layer_creator = layer_registry_xtheadvector[index].creator;
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
{
layer_creator = layer_registry_arch[index].creator;
}
Expand Down
2 changes: 1 addition & 1 deletion src/layer/noop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Noop::Noop()
{
support_inplace = true;
support_packing = true;
support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zfh();
support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zvfh();
support_bf16_storage = true;
}

Expand Down
66 changes: 66 additions & 0 deletions src/layer/riscv/absval_fp16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// Forward declaration of the zvfh-optimized kernel, compiled in a separate
// translation unit with zvfh enabled; only needed when this TU itself is
// built without zvfh and runtime cpu dispatch is on.
#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_zvfh
void absval_fp16_zvfh(Mat& bottom_top_blob, const Option& opt);
#endif

#if __riscv_zvfh
// |x| for an f16m8 vector: vfsgnjx injects sign(op1) XOR sign(op1) = 0 into
// op1, i.e. clears the sign bit — the rvv idiom for fabs.
static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t vl)
{
    return __riscv_vfsgnjx_vv_f16m8(op1, op1, vl);
}
#endif // __riscv_zvfh

// In-place absolute value over every fp16 element of bottom_top_blob.
// Dispatch: when this TU is built for runtime cpu detection without zvfh (and
// without xtheadvector), forward to the separately compiled zvfh kernel if
// the cpu supports it; otherwise run the inline rvv loop below, or — when the
// compiler has no zvfh at all — do nothing.
static void absval_fp16(Mat& bottom_top_blob, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_xtheadvector && !__riscv_zvfh
    if (ncnn::cpu_support_riscv_zvfh())
    {
        absval_fp16_zvfh(bottom_top_blob, opt);
        return;
    }
#endif

#if __riscv_zvfh
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int d = bottom_top_blob.d;
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;
    // elements per channel, counting packed lanes
    const int size = w * h * d * elempack;

    // channels are independent, so parallelize across them
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        // strip-mine the channel: vsetvl picks the largest vl <= n each pass
        int n = size;
        while (n > 0)
        {
            size_t vl = __riscv_vsetvl_e16m8(n);

            vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
            _p = __riscv_vfabs_v_f16m8_absval(_p, vl);
            __riscv_vse16_v_f16m8(ptr, _p, vl);

            ptr += vl;
            n -= vl;
        }
    }
#else
    // built without zvfh: nothing to do, silence unused-parameter warnings
    (void)bottom_top_blob;
    (void)opt;
#endif // __riscv_zvfh
}
Loading
Loading