ATen ReduceOps (pytorch#5776)
pytorch#5481 was reverted due to a strange test bug. This PR attempts to fix that.

This diff adds vectorization to ATen. It uses Intel intrinsics to build a general vec256 class that represents 256-bit-wide types, which can then be treated like regular variables. Using those, it implements torch.sum() for the contiguous case. It uses Intel TBB for multithreading, which allows work stealing, and chunks the reduction operations based on an experimentally chosen value (_THRESHOLD). It uses cpuinfo to pick the right code path depending on the host's capabilities.
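
As a rough, hedged sketch of the approach (not the actual ATen code; `Vec256f`, `sum_contiguous`, and `kThreshold` are invented stand-ins for vec256, the sum kernel, and _THRESHOLD), combining an AVX wrapper with `tbb::parallel_reduce` might look like this:

```cpp
#include <immintrin.h>
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>
#include <cstdint>

// Minimal stand-in for ATen's vec256 idea: eight packed floats that can be
// treated like a regular variable.
struct Vec256f {
  __m256 v;
  Vec256f(__m256 x) : v(x) {}
  static Vec256f load(const float* p) { return _mm256_loadu_ps(p); }
  Vec256f operator+(Vec256f o) const { return _mm256_add_ps(v, o.v); }
  float horizontal_sum() const {
    alignas(32) float b[8];
    _mm256_store_ps(b, v);
    return b[0] + b[1] + b[2] + b[3] + b[4] + b[5] + b[6] + b[7];
  }
};

// Chunked, work-stealing sum over a contiguous float buffer.
float sum_contiguous(const float* data, int64_t n) {
  constexpr int64_t kThreshold = 32768;  // stand-in for _THRESHOLD
  return tbb::parallel_reduce(
      tbb::blocked_range<int64_t>(0, n, kThreshold), 0.0f,
      [&](const tbb::blocked_range<int64_t>& r, float acc) {
        int64_t i = r.begin();
        Vec256f lanes(_mm256_setzero_ps());
        for (; i + 8 <= r.end(); i += 8)   // 8 floats per 256-bit register
          lanes = lanes + Vec256f::load(data + i);
        acc += lanes.horizontal_sum();
        for (; i < r.end(); ++i)           // scalar tail
          acc += data[i];
        return acc;
      },
      [](float a, float b) { return a + b; });
}
```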

The kernels are implemented under native/cpu. Each .cpp file is compiled three times: with -mavx, with -mavx2, and with no additional architecture flags. A macro is used to append AVX, AVX2 or NONE to the function name, so the header then needs to declare each function three times, once per capability. This could be improved by changing the CMake file a bit, or possibly by generating the source code with a Python script, etc. A hedged sketch of the naming scheme follows.
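
The exact macro spellings below are assumptions; the build defines one CPUCAPABILITY* macro per compiled copy of each kernel file, as in the CMake changes further down:

```cpp
#include <cstdint>

// Each kernel .cpp is compiled three times: plainly, with -mavx, and with
// -mavx2. The build defines exactly one of CPUCAPABILITYDEFAULT /
// CPUCAPABILITYAVX / CPUCAPABILITYAVX2 per copy; this macro suffixes the
// function name accordingly (NONE for the default build).
#if defined(CPUCAPABILITYAVX2)
  #define CAPABILITY_SUFFIX(name) name##AVX2
#elif defined(CPUCAPABILITYAVX)
  #define CAPABILITY_SUFFIX(name) name##AVX
#else
  #define CAPABILITY_SUFFIX(name) name##NONE
#endif

// Each compiled copy exports sumNONE, sumAVX, or sumAVX2; a header declares
// all three, and a runtime check via cpuinfo picks the best available one.
float CAPABILITY_SUFFIX(sum)(const float* data, int64_t n) {
  float acc = 0.0f;
  for (int64_t i = 0; i < n; ++i)
    acc += data[i];  // auto-vectorized according to the per-copy flags
  return acc;
}
```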

For the non-contiguous case this falls back to the current implementation within TH. For CUDA it entirely defaults to the implementation within THC.
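
In other words, the dispatch looks roughly like the following sketch (all names here are invented for illustration; the real plumbing goes through the _sum/_sumall entries added to Declarations.cwrap below):

```cpp
// Free-standing sketch of the dispatch behavior (all names invented):
// the vectorized path only handles contiguous CPU input; everything else
// keeps using the pre-existing TH (CPU) and THC (CUDA) reductions.
struct TensorDesc {
  const float* data;
  long long numel;
  bool contiguous;
  bool cuda;
};

float sum_vectorized_cpu(const TensorDesc&);  // e.g. the TBB/AVX kernel above
float th_sumall(const TensorDesc&);           // stand-in for the TH fallback
float thc_sumall(const TensorDesc&);          // stand-in for the THC fallback

float dispatch_sum(const TensorDesc& t) {
  if (t.cuda)
    return thc_sumall(t);          // CUDA: defer to THC entirely
  if (t.contiguous)
    return sum_vectorized_cpu(t);  // contiguous CPU: new vectorized kernel
  return th_sumall(t);             // non-contiguous CPU: TH fallback
}
```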

There probably needs to be a bit of a debate around the design decisions here: the additional dependencies, the parallelization strategy, clarity, etc. The numerical results also diverge from NumPy with larger tensors, which is expected, since we are summing, for example, 8 numbers at a time and then adding the result to the running sum, instead of adding each number one by one. But there might be something to be said for accumulating into a double for floats, about the acceptable degree of divergence, the behavior with respect to CUDA, etc.
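
A toy illustration of that divergence (plain C++, not the actual kernel): keeping eight running partial sums and combining them at the end rounds differently than a strict left-to-right sum, because floating-point addition is not associative.

```cpp
#include <cstdio>

// Both loops sum exactly the same values, but the eight-lane version (the
// accumulation pattern a 256-bit float kernel implies) rounds differently
// from the strict sequential sum.
int main() {
  const int n = 1 << 20;           // divisible by 8
  static float data[1 << 20];
  for (int i = 0; i < n; ++i) data[i] = 1.0f / (i + 1);

  float seq = 0.0f;                // strict left-to-right accumulation
  for (int i = 0; i < n; ++i) seq += data[i];

  float lanes[8] = {0};            // eight running partials, as with AVX
  for (int i = 0; i < n; i += 8)
    for (int j = 0; j < 8; ++j) lanes[j] += data[i + j];
  float vec = 0.0f;
  for (int j = 0; j < 8; ++j) vec += lanes[j];

  std::printf("sequential=%.8f lanes=%.8f diff=%g\n", seq, vec, seq - vec);
}
```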

I wrote a [small Python script](https://github.com/cpuhrsch/benchmark/blob/sumall/benchmarks/sum_bench.py) to compare the results with NumPy, both numerically and on timing. I ran this script to create timings on both master and this branch.

Here is the command for 1 core:
`OMP_NUM_THREADS=1 taskset -c 0 python sum_bench.py --enable_numpy 200`

Here is the command for all cores:
`python sum_bench.py --enable_numpy 200`

Here are the results of each:

[Master, 1 core](https://paste.fedoraproject.org/paste/Nho9JzHpPVK9av8a6mByjQ)

[This branch, 1 core](https://paste.fedoraproject.org/paste/6xLHkYvcVJx9z~5MoHxN4w)

[Master, all cores](https://paste.fedoraproject.org/paste/5l3V1d5zGqvJcMXIUteMRw)

[This branch, all cores](https://paste.fedoraproject.org/paste/J4RuDU-0Drz0aZwtphQwEA)

To test, the command is:
`python sum_bench.py --test 200`

[This branch, test results](https://paste.fedoraproject.org/paste/kTEoUC~oWgXA6XWMAfNfNw)

For this test we look at the average absolute value of the differences. This does not take the relative magnitude of the numbers into account; the numbers are sampled from a standard normal distribution.
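Concretely, the metric is the mean of |sum_pytorch - sum_numpy| across trials; a minimal sketch of that comparison (hypothetical, mirroring the linked script rather than quoting it):

```cpp
#include <cmath>
#include <cstddef>

// Error metric sketch: average absolute difference between two sets of
// per-trial sums (e.g. PyTorch's vs. NumPy's). Note this is an absolute,
// not relative, measure of divergence.
double mean_abs_diff(const double* a, const double* b, std::size_t trials) {
  double total = 0.0;
  for (std::size_t i = 0; i < trials; ++i)
    total += std::fabs(a[i] - b[i]);
  return total / static_cast<double>(trials);
}
```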

In terms of performance, this diff should bring PyTorch on par with NumPy and usually exceed it by 1.5 to 2x.
cpuhrsch authored and colesbury committed Mar 15, 2018
1 parent 42ba8c1 commit 5fa3aac
Showing 21 changed files with 1,193 additions and 7 deletions.
7 changes: 7 additions & 0 deletions .gitmodules
@@ -7,3 +7,10 @@
[submodule "torch/lib/nanopb"]
path = torch/lib/nanopb
url = https://github.com/nanopb/nanopb.git
[submodule "aten/src/ATen/cpu/cpuinfo"]
path = aten/src/ATen/cpu/cpuinfo
url = https://github.com/Maratyszcza/cpuinfo
[submodule "aten/src/ATen/cpu/tbb/tbb_remote"]
path = aten/src/ATen/cpu/tbb/tbb_remote
url = https://github.com/01org/tbb
branch = tbb_2018
5 changes: 5 additions & 0 deletions aten/CMakeLists.txt
@@ -32,6 +32,11 @@ ENDIF(NOT MSVC)

INCLUDE(CheckCXXSourceCompiles)

# windef.h will define max/min macros if NOMINMAX is not defined
IF(MSVC)
add_definitions(/DNOMINMAX)
ENDIF(MSVC)

#Check if certain std functions are supported. Sometimes
#_GLIBCXX_USE_C99 macro is not defined and some functions are missing.
CHECK_CXX_SOURCE_COMPILES("
55 changes: 53 additions & 2 deletions aten/src/ATen/CMakeLists.txt
@@ -45,7 +45,34 @@ IF(NOT MSVC AND NOT "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/THAtomic.c PROPERTIES COMPILE_FLAGS "-fno-openmp")
SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/THAllocator.c PROPERTIES COMPILE_FLAGS "-fno-openmp")
ENDIF()
########################

FILE(GLOB cpu_kernel_cpp_in RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/cpu/*.cpp")

list(APPEND CPU_CAPABILITY_NAMES "DEFAULT" "AVX" "AVX2")
IF(MSVC)
LIST(APPEND CPU_CAPABILITY_FLAGS "${MSVC_OPT_FLAG}" "${MSVC_OPT_FLAG}/arch:AVX" "${MSVC_OPT_FLAG}/arch:AVX2")
ELSE(MSVC)
LIST(APPEND CPU_CAPABILITY_FLAGS "-O3" "-O3 -mavx" "-O3 -mavx2")
ENDIF(MSVC)

list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES)
math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1")

FOREACH(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
FOREACH(IMPL ${cpu_kernel_cpp_in})
LIST(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY)
SET(NEW_IMPL ${CMAKE_CURRENT_BINARY_DIR}/${IMPL}.${CPU_CAPABILITY}.cpp)
CONFIGURE_FILE(${IMPL} ${NEW_IMPL} COPYONLY)
SET(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp}) # Create list of copies
LIST(GET CPU_CAPABILITY_FLAGS ${i} FLAGS)
IF(MSVC)
SET(MACRO_FLAG "/DCPUCAPABILITY${CPU_CAPABILITY}")
ELSE(MSVC)
SET(MACRO_FLAG "-DCPUCAPABILITY${CPU_CAPABILITY}")
ENDIF(MSVC)
SET_SOURCE_FILES_PROPERTIES(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${MACRO_FLAG}")
ENDFOREACH()
ENDFOREACH()

################################################################################
# Helper functions
@@ -152,7 +179,8 @@ ADD_CUSTOM_TARGET(aten_files_are_generated
)


SET(all_cpp ${base_cpp} ${native_cpp} ${native_cudnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS})
SET(all_cpp ${base_cpp} ${native_cpp} ${native_cudnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp})


INCLUDE_DIRECTORIES(${ATen_CPU_INCLUDE})
IF(NOT NO_CUDA)
@@ -181,6 +209,17 @@ ELSE()
ENDIF()
ADD_DEPENDENCIES(ATen aten_files_are_generated)

set(TBB_ROOT_DIR "${PROJECT_SOURCE_DIR}/src/ATen/cpu/tbb/tbb_remote")
set(TBB_BUILD_STATIC ON CACHE BOOL " " FORCE)
set(TBB_BUILD_SHARED OFF CACHE BOOL " " FORCE)
set(TBB_BUILD_TBBMALLOC OFF CACHE BOOL " " FORCE)
set(TBB_BUILD_TBBMALLOC_PROXY OFF CACHE BOOL " " FORCE)
set(TBB_BUILD_TESTS OFF CACHE BOOL " " FORCE)
add_subdirectory(${PROJECT_SOURCE_DIR}/src/ATen/cpu/tbb)
set_property(TARGET tbb_static tbb_def_files PROPERTY FOLDER "dependencies")
target_include_directories(tbb_static PUBLIC ${TBB_ROOT_DIR}/include)
target_link_libraries(ATen tbb_static)

SET_TARGET_PROPERTIES(ATen PROPERTIES VERSION 1 SOVERSION 1)

if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1")
@@ -252,6 +291,18 @@ if (NNPACK_FOUND)
target_link_libraries(ATen ${NNPACK_LIBRARIES})
endif(NNPACK_FOUND)


# ---[ Configure cpuinfo
IF(NOT TARGET cpuinfo)
SET(CPUINFO_BUILD_TOOLS OFF CACHE BOOL "")
SET(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "")
SET(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "")
SET(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "")
ADD_SUBDIRECTORY("cpu/cpuinfo")
ENDIF()
TARGET_LINK_LIBRARIES(ATen cpuinfo)


IF(CUDA_FOUND)
TARGET_LINK_LIBRARIES(ATen
${CUDA_LIBRARIES}
18 changes: 16 additions & 2 deletions aten/src/ATen/Declarations.cwrap
@@ -2312,7 +2312,7 @@
- THTensor* self
]]
[[
name: sum
name: _sumall
variants:
- method
- function
@@ -2321,6 +2321,13 @@
return: accreal
arguments:
- THTensor* self
]]
[[
name: _sum
variants:
- method
- function
options:
- cname: sum
return: argument 0
scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1)
@@ -2334,7 +2341,7 @@
default: "false"
]]
[[
name: prod
name: _prodall
variants:
- method
- function
@@ -2343,6 +2350,13 @@
return: accreal
arguments:
- THTensor* self
]]
[[
name: _prod
variants:
- method
- function
options:
- cname: prod
return: argument 0
scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1)
1 change: 1 addition & 0 deletions aten/src/ATen/cpu/cpuinfo
Submodule cpuinfo added at 3a4ecd
