Add a USE_F16C build switch for the x86 F16C instruction set (fast fp16 compute on CPU).

* 3rdparty/mshadow: bump the submodule pointer.
* CMakeLists.txt: new USE_F16C option. SUPPORT_F16C is forced to FALSE on MSVC;
  elsewhere it becomes TRUE only when the compiler accepts -mf16c and the CPU
  feature probe (/proc/cpuinfo on Linux, sysctl on Darwin) reports F16C.
* Makefile: when USE_F16C is unset or empty on a non-Windows host, set it to
  1 or 0 from the same CPU feature probe.
* amalgamation/Makefile: stop passing -DMSHADOW_USE_F16C=1 when USE_F16C is 1;
  -mf16c is still added, and -DMSHADOW_USE_F16C=0 is still passed otherwise.
* make/config.mk: USE_F16C is assigned with ?= so the USE_F16C=0 chosen for
  ARM/Power machines just above is not overwritten by the empty default.
* make/crosscompile.jetson.mk: comment the existing USE_F16C=0 setting.

diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 317fad64cc23..a8c650ce8a70 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 317fad64cc234c458e3f01ff47fffe3b8b3e5f63
+Subproject commit a8c650ce8a708608a282c4d1e251c57873a8db25
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 363237f909e5..05d8021c3677 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,7 @@ mxnet_option(USE_OPENCV "Build with OpenCV support" ON)
 mxnet_option(USE_OPENMP "Build with Openmp support" ON)
 mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path
 mxnet_option(USE_SSE "Build with x86 SSE instruction support" ON)
+mxnet_option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON
 mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC)
 mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
 mxnet_option(USE_MKLML_MKL "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
@@ -86,6 +87,10 @@ if(MSVC)
   add_definitions(-DNNVM_EXPORTS)
   add_definitions(-DDMLC_STRICT_CXX11)
   add_definitions(-DNOMINMAX)
+  set(SUPPORT_F16C FALSE)
+  if(USE_F16C)
+    message("F16C instruction set is not yet supported for MSVC")
+  endif()
   set(CMAKE_C_FLAGS "/MP")
   set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} /bigobj")
 else(MSVC)
@@ -102,6 +107,29 @@ else(MSVC)
   else()
     set(SUPPORT_MSSE2 FALSE)
   endif()
+  # Probes the build host's CPU; for cross compilation, turn USE_F16C off if the target device does not support it
+  if(USE_F16C)
+    check_cxx_compiler_flag("-mf16c" COMPILER_SUPPORT_MF16C)
+    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+      execute_process(COMMAND cat /proc/cpuinfo
+                      COMMAND grep flags
+                      COMMAND grep f16c
+                      OUTPUT_VARIABLE CPU_SUPPORT_F16C)
+    elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+      execute_process(COMMAND sysctl -a
+                      COMMAND grep machdep.cpu.features
+                      COMMAND grep F16C
+                      OUTPUT_VARIABLE CPU_SUPPORT_F16C)
+    endif()
+    if(NOT CPU_SUPPORT_F16C)
+      message("CPU does not support F16C instructions")
+    endif()
+    if(CPU_SUPPORT_F16C AND COMPILER_SUPPORT_MF16C)
+      set(SUPPORT_F16C TRUE)
+    endif()
+  else()
+    set(SUPPORT_F16C FALSE)
+  endif()
   set(CMAKE_C_FLAGS "-Wall -Wno-unknown-pragmas -fPIC -Wno-sign-compare")
   if ("${CMAKE_CXX_COMPILER_ID}" MATCHES ".*Clang$")
     set(CMAKE_C_FLAGS "-Wno-braced-scalar-init")
diff --git a/Makefile b/Makefile
index 043dcb9795e3..951b29b41cf4 100644
--- a/Makefile
+++ b/Makefile
@@ -187,6 +187,30 @@ ifeq ($(USE_CUDNN), 1)
 	LDFLAGS += -lcudnn
 endif
 
+# whether to use F16C instruction set extension for fast fp16 compute on CPU
+# if cross compiling you may want to explicitly turn it off if target system does not support it
+ifndef USE_F16C
+    ifneq ($(OS),Windows_NT)
+        detected_OS := $(shell uname -s)
+        ifeq ($(detected_OS),Darwin)
+            F16C_SUPP := $(shell sysctl -a | grep machdep.cpu.features | grep F16C)
+        endif
+        ifeq ($(detected_OS),Linux)
+            F16C_SUPP := $(shell cat /proc/cpuinfo | grep flags | grep f16c)
+        endif
+        ifneq ($(strip $(F16C_SUPP)),)
+            USE_F16C=1
+        else
+            USE_F16C=0
+        endif
+    endif
+    # if OS is Windows, check if your processor and compiler support F16C architecture.
+    # One way to check if processor supports it is to download the tool
+    # https://docs.microsoft.com/en-us/sysinternals/downloads/coreinfo.
+    # If coreinfo -c shows F16C and compiler supports it,
+    # then you can set USE_F16C=1 explicitly to leverage that capability
+endif
+
 # gperftools malloc library (tcmalloc)
 ifeq ($(USE_GPERFTOOLS), 1)
 # FIND_LIBNAME=tcmalloc_and_profiler
diff --git a/amalgamation/Makefile b/amalgamation/Makefile
index f7f3c001e197..f03a2b97fc44 100644
--- a/amalgamation/Makefile
+++ b/amalgamation/Makefile
@@ -55,7 +55,6 @@ CFLAGS=-std=c++11 -Wno-unknown-pragmas -Wall $(DEFS)
 # if architecture of the CPU supports F16C instruction set, enable USE_F16C for fast fp16 computation on CPU
 ifeq ($(USE_F16C), 1)
 	CFLAGS+=-mf16c
-	DEFS+=-DMSHADOW_USE_F16C=1
 else
 	DEFS+=-DMSHADOW_USE_F16C=0
 endif
diff --git a/make/config.mk b/make/config.mk
index 9eded6f50807..dd67c33cc9e5 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -132,10 +132,19 @@ endif
 ARCH := $(shell uname -a)
 ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
 	USE_SSE=0
+	USE_F16C=0
 else
 	USE_SSE=1
 endif
 
+#----------------------------
+# F16C instruction support for faster arithmetic of fp16 on CPU
+#----------------------------
+# For distributed training with fp16, this helps even if training on GPUs
+# If left empty, the Makefile checks CPU support and turns it on; ?= keeps the USE_F16C=0 set above for ARM/Power.
+# For cross compilation, please check support for F16C on target device and turn off if necessary.
+USE_F16C ?=
+
 #----------------------------
 # distributed computing
 #----------------------------
diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk
index 31a1398c1b75..acc9c4a5a8a4 100644
--- a/make/crosscompile.jetson.mk
+++ b/make/crosscompile.jetson.mk
@@ -132,7 +132,10 @@ endif
 # Settings for power and arm arch
 #----------------------------
 USE_SSE=0
+
+# Turn off F16C instruction set support
 USE_F16C=0
+
 #----------------------------
 # distributed computing
 #----------------------------