Changed make to support more gpu archs, multiple toolkits, reduce lib size. (apache#6588)

DickJC123 · piiswrong · commit 5e3f3e6a0cc6 · 2017-06-07T10:12:32.000-07:00
* Updated make to support more gpu archs, tolerate multiple toolkit versions, and reduce lib size.

* Moved the CUDA_ARCH setting into the Makefile and removed it from all make/*.mk files.
diff --git a/Makefile b/Makefile
@@ -19,7 +19,7 @@ ifndef NNVM_PATH
 endif
 
 ifndef DLPACK_PATH
-  	DLPACK_PATH = $(ROOTDIR)/dlpack
+	DLPACK_PATH = $(ROOTDIR)/dlpack
 endif
 
 ifneq ($(USE_OPENMP), 1)
@@ -58,7 +58,7 @@ LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS)
 ifeq ($(DEBUG), 1)
 	NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 else
-	NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+	NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 endif
 
 # CFLAGS for profiler
@@ -129,6 +129,35 @@ ifneq ($(USE_CUDA_PATH), NONE)
 	NVCC=$(USE_CUDA_PATH)/bin/nvcc
 endif
 
+# Sets 'CUDA_ARCH', which determines the GPU architectures supported
+# by the compiled kernels.  Users can edit the KNOWN_CUDA_ARCHS list below
+# to remove archs they don't wish to support to speed compilation, or they
+# can pre-set the CUDA_ARCH args in config.mk for full control.
+#
+# For archs in this list, nvcc will create a fat-binary that includes the
+# binaries (SASS) for every listed architecture that the installed version of
+# the cuda toolkit supports, plus the assembly (PTX) for the most recent such
+# architecture.  If these kernels are then run on a newer-architecture GPU, the
+# updated driver will JIT-compile the included PTX for that GPU.
+ifeq ($(USE_CUDA), 1)
+ifeq ($(origin CUDA_ARCH), undefined)
+	KNOWN_CUDA_ARCHS := 30 35 50 52 60 61
+	# Probe nvcc with an empty input (/dev/null) once per known arch; for each arch
+	# nvcc accepts, emit a -gencode arg that puts that arch's SASS in the fat binary.
+	CUDA_ARCH := $(foreach arch,$(KNOWN_CUDA_ARCHS), \
+                  $(shell $(NVCC) -arch=sm_$(arch) -E --x cu /dev/null >/dev/null 2>&1 && \
+                          echo -gencode arch=compute_$(arch),code=sm_$(arch)))
+	# Rewrite only the final "code=sm_NN" (sed anchors on end of line; '$$' is make's
+	# escape for '$') as "code=[sm_NN,compute_NN]" so the fat binary also carries the
+	# PTX of the last accepted arch (list is ascending) for forward compatibility.
+	CUDA_ARCH := $(shell echo $(CUDA_ARCH) | sed 's/sm_\([0-9]*\)$$/[sm_\1,compute_\1]/')
+	# Append the fat-binary compression option only if this nvcc accepts it.
+	COMPRESS := --fatbin-options -compress-all
+	CUDA_ARCH += $(shell $(NVCC) -cuda $(COMPRESS) --x cu /dev/null -o /dev/null >/dev/null 2>&1 && \
+	                     echo $(COMPRESS))
+endif
+endif
+
 # ps-lite
 PS_PATH=$(ROOTDIR)/ps-lite
 DEPS_PATH=$(shell pwd)/deps
diff --git a/make/config.mk b/make/config.mk
@@ -54,13 +54,6 @@ USE_CUDA_PATH = NONE
 # whether use CuDNN R3 library
 USE_CUDNN = 0
 
-# CUDA architecture setting: going with all of them.
-# For CUDA < 6.0, comment the *_50 lines for compatibility.
-CUDA_ARCH := -gencode arch=compute_30,code=sm_30 \
-		-gencode arch=compute_35,code=sm_35 \
-		-gencode arch=compute_50,code=sm_50 \
-		-gencode arch=compute_50,code=compute_50
-
 # whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
 USE_NVRTC = 0
 
diff --git a/make/osx.mk b/make/osx.mk
@@ -48,13 +48,6 @@ USE_CUDA = 0
 # USE_CUDA_PATH = /usr/local/cuda
 USE_CUDA_PATH = NONE
 
-# CUDA architecture setting: going with all of them.
-# For CUDA < 6.0, comment the *_50 lines for compatibility.
-CUDA_ARCH := -gencode arch=compute_30,code=sm_30 \
-		-gencode arch=compute_35,code=sm_35 \
-		-gencode arch=compute_50,code=sm_50 \
-		-gencode arch=compute_50,code=compute_50
-
 # whether use CUDNN R3 library
 USE_CUDNN = 0
 
diff --git a/make/pip_linux_cpu.mk b/make/pip_linux_cpu.mk
@@ -50,13 +50,6 @@ USE_CUDA_PATH = NONE
 # whether use CuDNN R3 library
 USE_CUDNN = 0
 
-# CUDA architecture setting: going with all of them.
-# For CUDA < 6.0, comment the *_50 lines for compatibility.
-CUDA_ARCH := -gencode arch=compute_30,code=sm_30 \
-		-gencode arch=compute_35,code=sm_35 \
-		-gencode arch=compute_50,code=sm_50 \
-		-gencode arch=compute_50,code=compute_50
-
 # whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
 USE_NVRTC = 0