@@ -19,7 +19,7 @@ ifndef NNVM_PATH
19
19
endif
20
20
21
21
# Default DLPACK_PATH to $(ROOTDIR)/dlpack when it is unset or empty.
ifndef DLPACK_PATH
22
- DLPACK_PATH = $(ROOTDIR)/dlpack
22
+ DLPACK_PATH = $(ROOTDIR)/dlpack
23
23
endif
24
24
25
25
ifneq ($(USE_OPENMP ) , 1)
@@ -58,7 +58,7 @@ LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS)
58
58
# Debug builds (DEBUG=1) pass -g -G -O0 to nvcc; all other builds pass -O3.
ifeq ($(DEBUG), 1)
59
59
NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
60
60
else
61
- NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
61
+ NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
62
62
endif
63
63
64
64
# CFLAGS for profiler
@@ -129,6 +129,35 @@ ifneq ($(USE_CUDA_PATH), NONE)
129
129
NVCC=$(USE_CUDA_PATH)/bin/nvcc
130
130
endif
131
131
132
+ # Sets 'CUDA_ARCH', which determines the GPU architectures supported
133
+ # by the compiled kernels. Users can edit the KNOWN_CUDA_ARCHS list below
134
+ # to remove archs they don't wish to support to speed compilation, or they
135
+ # can pre-set the CUDA_ARCH args in config.mk for full control.
136
+ #
137
+ # For archs in this list, nvcc will create a fat-binary that will include
138
+ # the binaries (SASS) for all architectures supported by the installed version
139
+ # of the cuda toolkit, plus the assembly (PTX) for the most recent such architecture.
140
+ # If these kernels are then run on a newer-architecture GPU, the binary will
141
+ # be JIT-compiled by the updated driver from the included PTX.
142
+ ifeq ($(USE_CUDA), 1)
143
+ ifeq ($(origin CUDA_ARCH), undefined)
+ # CUDA_ARCH has not been defined anywhere (makefile, command line or
+ # environment), so derive it by probing the installed nvcc.
144
+ KNOWN_CUDA_ARCHS := 30 35 50 52 60 61
145
+ # Run nvcc on a zero-length file to check architecture-level support.
146
+ # Create args to include SASS in the fat binary for supported levels.
147
+ CUDA_ARCH := $(foreach arch,$(KNOWN_CUDA_ARCHS), \
148
+ $(shell $(NVCC) -arch=sm_$(arch) -E --x cu /dev/null >/dev/null 2>&1 && \
149
+ echo -gencode arch=compute_$(arch),code=sm_$(arch)))
150
+ # Convert a trailing "code=sm_NN" to "code=[sm_NN,compute_NN]" to also
151
+ # include the PTX of the most recent arch in the fat-binaries for
152
+ # forward compatibility with newer GPUs. ($$ passes a literal '$' to sed.)
153
+ CUDA_ARCH := $(shell echo $(CUDA_ARCH) | sed 's/sm_\([0-9]*\)$$/[sm_\1,compute_\1]/')
154
+ # Add fat binary compression if supported by nvcc: the flags are appended
+ # only when a trial nvcc invocation using them exits successfully.
155
+ COMPRESS := --fatbin-options -compress-all
156
+ CUDA_ARCH += $(shell $(NVCC) -cuda $(COMPRESS) --x cu /dev/null -o /dev/null >/dev/null 2>&1 && \
157
+ echo $(COMPRESS))
158
+ endif
159
+ endif
160
+
132
161
# ps-lite
133
162
# Location of the ps-lite sources under the repository root.
PS_PATH = $(ROOTDIR)/ps-lite
134
163
# Dependency directory, resolved relative to the directory make is run from.
DEPS_PATH = $(shell pwd)/deps
0 commit comments