
cutlass 1.3.1 (NVIDIA#46)
CUTLASS 1.3.1 patch resolves failing tests with NVRTC.
TimmyLiu authored and kerrmudgeon committed Apr 19, 2019
1 parent 877bdca commit fe3438a
Showing 13 changed files with 133 additions and 79 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,8 @@
 # NVIDIA CUTLASS Changelog
 
+## [1.3.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.1) (2019-04-09)
+* Corrected NVRTC unit tests.
+
 ## [1.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.0) (2019-03-20)
 * Efficient GEMM kernel targeting Volta Tensor Cores via `mma.sync` instruction added in CUDA 10.1.
6 changes: 5 additions & 1 deletion README.md
@@ -2,7 +2,7 @@
 
 # CUTLASS 1.3
 
-_CUTLASS 1.3.0 - March 2019_
+_CUTLASS 1.3.1 - April 2019_
 
 CUTLASS is a collection of CUDA C++ template abstractions for implementing
 high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
@@ -28,6 +28,10 @@ CUTLASS 1.3 is described in the [CUTLASS Documentation](CUTLASS.md) and the acco
 We describe the structure of an efficient GEMM in our talk at the
 [GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).
 
+# What's New in CUTLASS 1.3.1
+_April 2019_
+* CUTLASS 1.3.1 corrected NVRTC unit tests.
+
 # What's New in CUTLASS 1.3
 _March 2019_
 * CUTLASS 1.3 includes an efficient GEMM implementation with the `mma.sync` instruction added in CUDA 10.1.
8 changes: 7 additions & 1 deletion cutlass/cutlass.h
@@ -34,7 +34,7 @@
 
 #define CUTLASS_MAJOR 1
 #define CUTLASS_MINOR 3
-#define CUTLASS_PATCH 0
+#define CUTLASS_PATCH 1
 #define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
 
 #ifdef __NVCC__
@@ -58,8 +58,13 @@
 
 // CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler.
 #if defined(__CUDA_ARCH__)
+#ifdef __NVCC__
 #define CUTLASS_PRAGMA_UNROLL #pragma unroll
 #define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1
+#elif defined(__CUDACC_RTC__)
+#define CUTLASS_PRAGMA_UNROLL _Pragma("unroll")
+#define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1")
+#endif
 
 #define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL

Expand All @@ -80,6 +85,7 @@ template <typename T>
struct DebugType {};

template <typename T>
CUTLASS_HOST_DEVICE
void DebugTypeFunc(T const& t) {
T::t;
}
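
Note on the pragma change above: NVCC accepts the directive-style `#define ... #pragma unroll`, but NVRTC does not, so the patch adds the C++11 `_Pragma` operator form under `__CUDACC_RTC__`. A minimal sketch of how the macro is consumed in device code (the `scale_fragment` helper is illustrative, not from the repository):

#include "cutlass/cutlass.h"  // for CUTLASS_DEVICE and CUTLASS_PRAGMA_UNROLL

// Expands to "#pragma unroll" under NVCC and to _Pragma("unroll") under
// NVRTC, so the same source unrolls this fixed-trip-count loop either way.
template <int kN>
CUTLASS_DEVICE void scale_fragment(float (&frag)[kN], float alpha) {
  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < kN; ++i) {
    frag[i] *= alpha;
  }
}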
61 changes: 49 additions & 12 deletions cutlass/gemm/gemm.h
@@ -33,7 +33,6 @@
 
 #include "cutlass/coord.h"
 #include "cutlass/util/platform.h"
-#include <cstdio>
 namespace cutlass {
 namespace gemm {

@@ -84,6 +83,7 @@ void gemm_kernel_nolb(typename Gemm_::Params params) {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+#if !defined(__CUDACC_RTC__)
 /// Partial specialization for launching the GEMM kernel with or without launch bounds
 template <typename Gemm, bool WithLaunchBounds>
 struct Launch {
@@ -152,7 +152,51 @@ struct Launch<Gemm, false> {
         smem_size,
         stream >>>(params);
   }
+
+  // Use device API to launch kernel
+  Launch(cudaError_t &result, CUfunction kernel,
+    typename Gemm::Params params, dim3 grid, dim3 block, CUstream stream = CU_STREAM_LEGACY) {
+    void* params_[] = {const_cast<void*>(reinterpret_cast<void const*>(&params))};
+
+    int smem_size = int(sizeof(typename Gemm::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+
+      result = cudaFuncSetAttribute(
+        kernel,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        smem_size
+      );
+
+      if (result != cudaSuccess) {
+        return;
+      }
+
+      result = cudaFuncSetAttribute(
+        kernel,
+        cudaFuncAttributePreferredSharedMemoryCarveout,
+        100);
+
+      if (result != cudaSuccess) {
+        return;
+      }
+    }
+
+    CUresult launch_result = cuLaunchKernel(
+      kernel,
+      grid.x, grid.y, grid.z,
+      block.x, block.y, block.z,
+      smem_size, stream, params_, 0);
+
+    if (launch_result != CUDA_SUCCESS) {
+      result = cudaErrorLaunchFailure;
+      return;
+    }
+
+    result = cudaSuccess;
+    return;
+  }
 };
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -188,20 +232,13 @@ struct Gemm {
   static __host__ cudaError_t launch(CUfunction kernel,
                                      Params const& params,
                                      CUstream stream = CU_STREAM_LEGACY) {
+    cudaError_t result;
 
-    // Launch the kernel.
-    void* params_[] = {const_cast<void*>(reinterpret_cast<void const*>(&params))};
-
-    CUresult result = cuLaunchKernel(
-        kernel,
-        params.grid.x, params.grid.y, params.grid.z,
-        params.block.x, params.block.y, params.block.z,
-        0, stream, params_, 0);
+    Launch<KernelClass, Traits::GemmConfig::kLaunchBounds>(
+        result, kernel, params, params.grid, params.block, stream);
 
-    if (result != CUDA_SUCCESS) {
-      return cudaErrorLaunchFailure;
-    }
-    return cudaSuccess;
+    return result;
   }
 
 #endif
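
The reworked `Gemm::launch` and the new `Launch` constructor both consume a `CUfunction`, which is what the NVRTC tests hold after JIT compilation. A sketch of that caller side, assuming `prog` is an already-compiled `nvrtcProgram`; error handling is elided and `get_kernel` is an illustrative helper, not repository code:

#include <cuda.h>    // CUDA driver API
#include <nvrtc.h>
#include <vector>

// Turn NVRTC output into the CUfunction that Gemm::launch expects.
CUfunction get_kernel(nvrtcProgram prog, const char *lowered_name) {
  size_t ptx_size = 0;
  nvrtcGetPTXSize(prog, &ptx_size);
  std::vector<char> ptx(ptx_size);
  nvrtcGetPTX(prog, ptx.data());

  CUmodule module;
  CUfunction kernel;
  cuModuleLoadDataEx(&module, ptx.data(), 0, nullptr, nullptr);
  // 'lowered_name' is the mangled kernel name, e.g. from nvrtcGetLoweredName.
  cuModuleGetFunction(&kernel, module, lowered_name);
  return kernel;
}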
1 change: 1 addition & 0 deletions cutlass/gemm/scalar_or_pointer.h
@@ -1,3 +1,4 @@
+
 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
4 changes: 4 additions & 0 deletions cutlass/layout/thread/transform.h
@@ -77,6 +77,7 @@ struct Copy {
   }
 };
 
+#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
 template <int rank>
 struct Copy<half, half, rank, cutlass::MatrixLayout::RowMajor, cutlass::MatrixLayout::RowMajor> {
   CUTLASS_DEVICE void copy(cutlass::TensorView<half, rank, cutlass::MatrixLayout::RowMajor> dst,
@@ -140,6 +141,7 @@ struct Copy<half, half, 2, cutlass::MatrixLayout::RowMajor, cutlass::MatrixLayou
     }
   }
 };
+#endif
 
 /// igemm swizzle
 /// Transform a fragment.
@@ -239,6 +241,7 @@ struct Transform {
   }
 };
 
+#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
 template <typename Shape, int Rank, typename DstLayout, typename SrcLayout>
 struct Transform<Shape, Rank, half, DstLayout, half, SrcLayout> {
   typedef Fragment<half, ShapeCount<Shape>::kCount> DstFragment;
@@ -266,6 +269,7 @@ struct Transform<Shape, Rank, half, DstLayout, half, SrcLayout> {
     Transformer.copy(dstView, srcView);
   }
 };
+#endif
 
 template <typename Shape, int Rank, typename DstLayout, typename SrcLayout>
 struct Transform<Shape, Rank, int8_t, DstLayout, int8_t, SrcLayout> {
2 changes: 0 additions & 2 deletions cutlass/tensor_view.h
@@ -36,8 +36,6 @@
 
 #pragma once
 
-#include <cmath>
-
 #include "cutlass/cutlass.h"
 #include "cutlass/tensor_ref.h"

1 change: 0 additions & 1 deletion cutlass/tile_iterator.h
@@ -34,7 +34,6 @@
 #include "cutlass/load_store.h"
 #include "cutlass/predicate_vector.h"
 #include "cutlass/vector.h"
-#include <cstdio>
 
 namespace cutlass {

40 changes: 0 additions & 40 deletions cutlass/util/performance_tuning.h

This file was deleted.

3 changes: 2 additions & 1 deletion cutlass/vector.h
@@ -88,6 +88,8 @@ union Vector {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
+
 template <>
 union Vector<half, 1> {
   /// The scalar type.
@@ -118,7 +120,6 @@ union Vector<half, 1> {
   }
 };
 
-#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
 
 template <int kLanes_>
 union Vector<half, kLanes_> {
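
Hoisting the guard above `Vector<half, 1>` keeps every `half` specialization out of NVRTC builds unless fp16 support is available. A sketch of how a harness might opt in, assuming `CUTLASS_NVRTC_HAS_FP16` is supplied as a `-D` compile option when `cuda_fp16.h` has been embedded (the helper and the chosen arch are illustrative assumptions):

#include <nvrtc.h>

// Enable the guarded half specializations only when fp16 headers are present.
nvrtcResult compile_with_fp16(nvrtcProgram prog) {
  const char *options[] = {
    "--gpu-architecture=compute_60",  // assumed target; any fp16-capable arch
    "-DCUTLASS_NVRTC_HAS_FP16",       // assumption: mirrors the CMake switch
  };
  return nvrtcCompileProgram(prog, 2, options);
}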
2 changes: 1 addition & 1 deletion tools/nvrtc/CMakeLists.txt
@@ -54,7 +54,7 @@ if (CUTLASS_NVRTC_ENABLE)
   string(APPEND NVRTC_INCLUDES_STRINGS "char const *kCutlassHeaders[] = {\n")
   string(APPEND NVRTC_INCLUDES_NAMES "char const *kCutlassHeaderNames[] = {\n")
 
-  add_nvrtc_headers(${CMAKE_SOURCE_DIR} "${CUTLASS_CORE};${CUTLASS_GEMM};${CUTLASS_UTIL};${CUTLASS_DEVICE}")
+  add_nvrtc_headers(${CMAKE_SOURCE_DIR} "${CUTLASS_CORE};${CUTLASS_GEMM};${CUTLASS_UTIL};${CUTLASS_DEVICE};${CUTLASS_ARCH};${CUTLASS_LAYOUT_THREAD}")
   message("${CMAKE_CURRENT_SOURCE_DIR}/")
   add_nvrtc_headers("${CMAKE_CURRENT_SOURCE_DIR}/stdlib" "assert.h;stdint.h")
   if(CUTLASS_NVRTC_HAS_CUDA_FP16)
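
The generated `kCutlassHeaders`/`kCutlassHeaderNames` arrays let the tests compile CUTLASS entirely from memory; the added `CUTLASS_ARCH` and `CUTLASS_LAYOUT_THREAD` groups cover headers the 1.3.1 tests now include. A sketch of how such arrays feed NVRTC (`kCutlassHeaderCount` and `create_program` are assumptions for illustration, not identifiers from this repository):

#include <nvrtc.h>

// In-memory headers let "#include <cutlass/...>" resolve inside NVRTC
// without any filesystem access.
nvrtcProgram create_program(const char *source) {
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog,
                     source,               // assumed: CUDA C++ test kernel source
                     "gemm_nvrtc.cu",
                     kCutlassHeaderCount,  // assumed: count of embedded headers
                     kCutlassHeaders,      // header contents generated above
                     kCutlassHeaderNames); // matching names for include lookup
  return prog;
}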
4 changes: 4 additions & 0 deletions tools/test/unit/gemm/gemm_nvrtc.cu
@@ -43,6 +43,8 @@ TEST(Dgemm_nvrtc_64x32x8, dgemm_nvrtc_64x32x8_nt) {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 610))
+
 TEST(Igemm__nvrtc_128x128x32, igemm_nvrtc_256x256x64_tt) {
   typedef cutlass::gemm::IgemmTraits<cutlass::MatrixLayout::kRowMajor,
                                      cutlass::MatrixLayout::kRowMajor,
@@ -52,6 +54,8 @@ TEST(Igemm__nvrtc_128x128x32, igemm_nvrtc_256x256x64_tt) {
   run_gemm_nvrtc<IgemmTraits>(gemm_traits, 256, 256, 64);
 }
 
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 TEST(Sgemm_nvrtc_128x128x8, sgemm_nvrtc_128x112x16_alpha2_beta1_nt) {
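
The new guard restricts the IGEMM test to SM 6.1 and newer, since IGEMM lowers to the `dp4a` instruction introduced with compute capability 6.1. A runtime analogue (a sketch, not code from the patch) that a host-side harness could use to skip unsupported devices:

#include <cuda_runtime.h>

// Skip the IGEMM path on devices that lack dp4a (compute capability < 6.1).
bool device_supports_igemm(int device) {
  cudaDeviceProp props;
  cudaGetDeviceProperties(&props, device);
  return props.major * 10 + props.minor >= 61;  // dp4a requires SM 6.1+
}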