Merge pull request NVIDIA#24 from NVIDIA/cutlass_1.1

Cutlass 1.1
pdh930105 · Sep 20, 2018 · d85f6a1 · d85f6a1
2 parents cf0301e + 0826572
commit d85f6a1
Show file tree

Hide file tree

Showing 1,315 changed files with 92,843 additions and 16,834 deletions.
diff --git a/changelog.md → CHANGELOG.md b/changelog.md → CHANGELOG.md
@@ -1,6 +1,22 @@
 # NVIDIA CUTLASS Changelog
 
-## [1.0.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.0.1) (2018-06-11)
+
+## 1.1.0 (2018-09-19)
+  * Turing Features
+    * WMMA GEMM targeting TensorCores - INT8, INT4, 1-bit
+  * Batched Strided GEMM
+  * Threadblock rasterization strategies
+    * Improved performance for adverse problem sizes and data layouts
+  * Extended CUTLASS Core components
+    * Tensor views support arbitrary matrix and tensor layouts
+    * Zip iterators for structuring multiple data streams
+  * Enhanced CUTLASS utilities
+    * Reference code for tensor operations in host and device code
+    * Added HostMatrix<> for simplified matrix creation
+  * Examples
+    * Basic GEMM, tensor views, CUTLASS utilities, batched GEMM, WMMA GEMM
+
+## 1.0.1 (2018-06-11)
 
   * Intra-threadblock reduction added for small threadblock tile sizes
     * sgemm_64x128x16, sgemm_128x128x16, sgemm_128x64x16, sgemm_128x32x16, sgemm_64x64x16, sgemm_64x32x16

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -55,11 +55,21 @@ endif()
 find_package(CUDA)
 find_package(Doxygen QUIET)
 
+###################################################################################################
+#
+# Configure CMake variables
+#
+###################################################################################################
+
+find_library(CUBLAS_LIBRARY cublas HINTS
+                                   ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+                                   ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
+
 # By default we want to build in Release mode to ensure that we're getting best performance
 if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES))
   set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose build level" FORCE)
   # We do support Debug or Release builds
-  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release")
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "RelWithDebInfo" "Release")
 endif()
 
 if(WIN32)
@@ -68,27 +78,59 @@ if(WIN32)
 endif()
 
 if (WIN32)
-  # Enable more warnings and treat as errors
-  string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX")
+    # Enable more warnings and treat as errors
+    string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX")
 
-  # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
-  string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict")
+    # Disable MSVC warning C4819 (source contains characters not representable in the current code page); needed because /WX above turns warnings into errors
+    string(APPEND NVCC_FLAGS " -Xcompiler /wd4819")
 
-  # Verbose option
-  if (${CUTLASS_NVCC_VERBOSE})
-      string(APPEND NVCC_FLAGS " -v")
-  endif()
+    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
+    string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict")
+
+    # Verbose option: pass -v to NVCC when CUTLASS_NVCC_VERBOSE is set (tested by name so if() does not dereference its value a second time)
+    if (CUTLASS_NVCC_VERBOSE)
+        string(APPEND NVCC_FLAGS " -v")
+    endif()
-endif(WIN32)
+endif()
 
-# Configure CUDA options
-set(CUTLASS_NVCC_ARCHS             "50;60;61;70"  CACHE STRING "The SM architectures to build code for.")
-set(CUTLASS_NVCC_KEEP              OFF            CACHE BOOL "Keep intermediate files generated by NVCC.")
+set(CUTLASS_NVCC_ARCHS "50;60;61;70;75" CACHE STRING "The SM architectures to build code for.")
+set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.")
+set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
+set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")
+
+#
+# NOTE: running with asan and CUDA requires the following environment variable:
+#
+#  ASAN_OPTIONS=protect_shadow_gap=0:replace_intrin=0:detect_leaks=0
+#
+# without the above environment setting, an error like the following may be generated:
+#
+#  *** Error: Could not detect active GPU device ID [out of memory]
+#  ...
+#  ==9149==ERROR: LeakSanitizer: detected memory leaks
+#  ...
+#
+if(ENABLE_ASAN)  # https://github.com/google/sanitizers/wiki/AddressSanitizer
+  string(APPEND NVCC_FLAGS " --compiler-options -fsanitize=address --compiler-options -fno-omit-frame-pointer")
+  string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=address")
+endif()
 
+###################################################################################################
+#
+# Configure CUDA build options
+#
+###################################################################################################
+
+# Set NVCC arguments
 foreach(ARCH ${CUTLASS_NVCC_ARCHS})
-  string(APPEND NVCC_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+  if(CUTLASS_NVCC_EMBED_CUBIN)
+    string(APPEND NVCC_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+  endif()
+  if(CUTLASS_NVCC_EMBED_PTX)
+    string(APPEND NVCC_FLAGS " -gencode arch=compute_${ARCH},code=compute_${ARCH}")
+  endif()
 endforeach()
 
-
 if (CUTLASS_NVCC_KEEP)
     string(APPEND NVCC_FLAGS " -keep")
 endif()
@@ -99,23 +141,22 @@ else()
   string(APPEND NVCC_FLAGS " -lineinfo")
 endif()
 
-if (UNIX)
-  string(APPEND NVCC_FLAGS " -Xcompiler -Wconversion")
-endif()
-
 string(APPEND NVCC_FLAGS_DEBUG " -g")
+string(APPEND NVCC_FLAGS_RELWITHDEBINFO " -O3")
 string(APPEND NVCC_FLAGS_RELEASE " -O3")
 
 # define NDEBUG for release mode to disable assertions
 string(APPEND NVCC_FLAGS_RELEASE " -DNDEBUG")
 
 if (CUTLASS_NATIVE_CUDA)
   set(CMAKE_CUDA_FLAGS "${NVCC_FLAGS}")
-  set(CMAKE_CUDA_FLAGS_DEBUG "${NVCC_FLAGS_DEBUG}")
   set(CMAKE_CUDA_FLAGS_RELEASE "${NVCC_FLAGS_RELEASE}")
+  set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${NVCC_FLAGS_RELWITHDEBINFO}")
+  set(CMAKE_CUDA_FLAGS_DEBUG "${NVCC_FLAGS_DEBUG}")
 else()
   set(CUDA_NVCC_FLAGS ${NVCC_FLAGS})
   set(CUDA_NVCC_FLAGS_DEBUG ${NVCC_FLAGS_DEBUG})
+  set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ${NVCC_FLAGS_RELWITHDEBINFO})
   set(CUDA_NVCC_FLAGS_RELEASE ${NVCC_FLAGS_RELEASE})
 endif()
 
@@ -128,6 +169,11 @@ file(GLOB CUTLASS_GEMM RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/gemm/*.h)
 file(GLOB CUTLASS_UTIL RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/util/*.h)
 file(GLOB CUTLASS_DEVICE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/device/*.h)
 file(GLOB CUTLASS_CORE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/*.h)
+###################################################################################################
+#
+# Define build targets
+#
+###################################################################################################
 
 source_group("cutlass\\gemm" FILES ${CUTLASS_GEMM})
 source_group("cutlass\\util" FILES ${CUTLASS_UTIL})
@@ -156,9 +202,9 @@ add_custom_target(cutlass_ide SOURCES
 if (DOXYGEN_FOUND)
     # DOT is available. Enable graph generation in the documentation
     if (DOXYGEN_DOT_EXECUTABLE)
-        set(CUTLASS_ENABLE_DOXYGEN_DOT              ON            CACHE BOOL "Use dot to generate graphs in the doxygen documentation.")
+        set(CUTLASS_ENABLE_DOXYGEN_DOT ON CACHE BOOL "Use dot to generate graphs in the doxygen documentation.")
     else()
-        set(CUTLASS_ENABLE_DOXYGEN_DOT              OFF            CACHE BOOL "Use dot to generate graphs in the doxygen documentation." FORCE)
+        set(CUTLASS_ENABLE_DOXYGEN_DOT OFF CACHE BOOL "Use dot to generate graphs in the doxygen documentation." FORCE)
     endif()
 
     if (CUTLASS_ENABLE_DOXYGEN_DOT)
@@ -177,6 +223,5 @@ if (DOXYGEN_FOUND)
     )
 endif()
 
-
-#add_subdirectory(examples/gemm)
 add_subdirectory(tools)
+add_subdirectory(examples)