
Spurious __syncthreads in the beginning of the kernel. #530

Open
thetheodor opened this issue Jun 20, 2018 · 2 comments

I noticed that in some cases the first statement after the variable definitions in the kernel is a __syncthreads(), which, if I am not mistaken, makes no sense: at that point no thread has written to shared memory yet, so there is nothing to synchronize (a minimal sketch is at the end of this report).

For example, in GroupNormalizationSingleKernel with input sizes C=1024, G=4, N=32, H=4, W=41 and options:

tc::CudaMappingOptions::makeNaiveMappingOptions()
    .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
    .outerScheduleAllowSkewing(true)
    .outerSchedulePositiveOrthant(true)
    .intraTileScheduleFusionStrategy(tc::FusionStrategy::Max)
    .intraTileScheduleAllowSkewing(true)
    .intraTileSchedulePositiveOrthant(true)
    .fixParametersBeforeScheduling(false)
    .tile(1)
    .unroll(148)
    .tileImperfectlyNested(false)
    .matchLibraryCalls(false)
    .mapToThreads(1, 126, 3)
    .mapToBlocks(118, 61, 193)
    .useSharedMemory(true)
    .usePrivateMemory(false)
    .unrollCopyShared(true)
    .useReadOnlyCache(true);

The generated code is:

template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}
#define if_then_else(cond,a,b) ((cond) ? (a) : (b))
// Can't include system dependencies with NVRTC
// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
// #include <cuda_fp16.h>

// Halide type handling
typedef char int8;
typedef short int16;
typedef int int32;
typedef long int64;
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long uint64;
// typedef half float16;
typedef float float32;
typedef double float64;

#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)

// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
#if __CUDACC_VER_MAJOR__ < 9
__device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}
#endif



namespace __tc {
template<typename T>
__device__ __forceinline__ T ldg(const T* ptr) {
#if __CUDA_ARCH__ >= 350
  return __ldg(ptr);
#else
  return *ptr;
#endif
}
} // namespace __tc
__global__ __launch_bounds__(4) void group_normalization_single_kernel_256_4_34_32_41(int32 D, int32 G, int32 H, int32 N, int32 W, float32* pO, float32* psum, float32* psumSquares, const float32* pI, const float32* pgamma, const float32* pbeta) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*O)[4][256][34][41] = reinterpret_cast<float32 (*)[4][256][34][41]>(pO);
  float32 (*sum)[4] = reinterpret_cast<float32 (*)[4]>(psum);
  float32 (*sumSquares)[4] = reinterpret_cast<float32 (*)[4]>(psumSquares);
  const float32 (*I)[4][256][34][41] = reinterpret_cast<const float32 (*)[4][256][34][41]>(pI);
  const float32 (*gamma)[256] = reinterpret_cast<const float32 (*)[256]>(pgamma);
  const float32 (*beta)[256] = reinterpret_cast<const float32 (*)[256]>(pbeta);
  __shared__ float32 _sumSquares_0[1][5];
  __shared__ float32 _sum_0[1][5];
  __shared__ float32 _beta_0[4][257];
  __shared__ float32 _gamma_0[4][257];
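  // <-- the next barrier is the spurious one: no thread has written to
  // shared memory yet, so there is nothing for it to order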
  __syncthreads();
  if (t1 == 0) {
    _sumSquares_0[0][0] = sumSquares[b0][0];
    _sumSquares_0[0][1] = sumSquares[b0][1];
    _sumSquares_0[0][2] = sumSquares[b0][2];
    _sumSquares_0[0][3] = sumSquares[b0][3];
    _sum_0[0][0] = sum[b0][0];
    _sum_0[0][1] = sum[b0][1];
    _sum_0[0][2] = sum[b0][2];
    _sum_0[0][3] = sum[b0][3];
  }
  for (int c3 = 0; c3 <= 255; c3 += 1) {
    _gamma_0[t1][c3] = __tc::ldg(&gamma[t1][c3]);
  }
  for (int c3 = 0; c3 <= 255; c3 += 1) {
    _beta_0[t1][c3] = __tc::ldg(&beta[t1][c3]);
  }
  __syncthreads();
  if (t1 == 0) {
    for (int c4 = 0; c4 <= 3; c4 += 1) {
      _sum_0[0][c4] = 0.000000f;
      for (int c5 = 0; c5 <= 255; c5 += 1) {
        for (int c6 = 0; c6 <= 33; c6 += 1) {
          for (int c7 = 0; c7 <= 40; c7 += 1) {
            _sum_0[0][c4] = (_sum_0[0][c4] + __tc::ldg(&I[b0][c4][c5][c6][c7]));
            if (c5 == 0 && c6 == 0 && c7 == 0) {
              _sumSquares_0[0][c4] = 0.000000f;
            }
            _sumSquares_0[0][c4] = (_sumSquares_0[0][c4] + (__tc::ldg(&I[b0][c4][c5][c6][c7])*__tc::ldg(&I[b0][c4][c5][c6][c7])));
          }
        }
      }
      for (int c5 = 256; c5 <= 511; c5 += 1) {
        for (int c6 = 0; c6 <= 33; c6 += 1) {
          for (int c7 = 0; c7 <= 40; c7 += 1) {
            O[b0][c4][(c5 - 256)][c6][c7] = (((_gamma_0[c4][c5 - 256]*(__tc::ldg(&I[b0][c4][(c5 - 256)][c6][c7]) - (_sum_0[0][c4]/float32(((256*34)*41)))))*rsqrt((((_sumSquares_0[0][c4]/float32(((256*34)*41))) - (((_sum_0[0][c4]*_sum_0[0][c4])/float32(((256*34)*41)))/float32(((256*34)*41)))) + 0.000010f))) + _beta_0[c4][c5 - 256]);
          }
        }
      }
    }
  }
  __syncthreads();
  if (t1 == 0) {
    sum[b0][0] = _sum_0[0][0];
    sum[b0][1] = _sum_0[0][1];
    sum[b0][2] = _sum_0[0][2];
    sum[b0][3] = _sum_0[0][3];
    sumSquares[b0][0] = _sumSquares_0[0][0];
    sumSquares[b0][1] = _sumSquares_0[0][1];
    sumSquares[b0][2] = _sumSquares_0[0][2];
    sumSquares[b0][3] = _sumSquares_0[0][3];
  }
  __syncthreads();
}

I've also observed this with grouped convolution.
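To make this concrete, the pattern boils down to the following (a minimal hand-written sketch, not TC output; the kernel name and the 32-thread block size are made up):

__global__ void leading_barrier_example(const float* in, float* out) {
  __shared__ float buf[32];
  // Spurious: no thread has written to buf yet, so this barrier orders nothing.
  __syncthreads();
  buf[threadIdx.x] = in[threadIdx.x];
  // Needed: orders the writes above against the cross-thread reads below.
  __syncthreads();
  out[threadIdx.x] = buf[(threadIdx.x + 1) % 32];
}

The first barrier can be dropped without changing the kernel's semantics; the second cannot.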

ftynse (Contributor) commented Jun 20, 2018

Does it cause slowdowns?

I know exactly where it comes from and have no intention of fixing this in the near future unless removing the barrier turns out to improve performance.
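For anyone who wants to check, one way to estimate the cost of a single extra barrier is a bare microbenchmark (a sketch, not TC code; it compares an empty kernel against one that only executes __syncthreads(), so the difference between the two approximates the barrier cost):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void no_barrier() {}
__global__ void one_barrier() { __syncthreads(); }

// Average launch time over many runs; the many-launch loop amortizes noise.
float timeKernel(void (*k)(), int n = 10000) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  for (int i = 0; i < n; ++i) k<<<1, 128>>>();
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float ms;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms / n;
}

int main() {
  timeKernel(no_barrier, 100);  // warm-up
  printf("no barrier:  %f ms/launch\n", timeKernel(no_barrier));
  printf("one barrier: %f ms/launch\n", timeKernel(one_barrier));
  return 0;
}

Note this mostly measures launch overhead; a more faithful test would time the generated kernel above with and without the leading barrier removed by hand.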

thetheodor (Author) commented:

I didn't measure.
