
__syncthreads() needed for reduction? #13


Description

@zjin-lcf

For the following code in miniFE, the comments claim that __syncthreads() is not needed within a warp. However, I think the __syncthreads() calls are actually needed to produce correct sums: I got incorrect results when omitting them. Could you reproduce the issue? Thank you for your comments.

template<typename Vector>
__global__ void dot_kernel(const Vector x, const Vector y, typename TypeTraits<typename Vector::ScalarType>::magnitude_type *d) {

  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
  const int BLOCK_SIZE=512;

  magnitude sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<x.n;idx+=gridDim.x*blockDim.x) {
    sum+=x.coefs[idx]*y.coefs[idx];
  }

  //Do a shared memory reduction on the dot product
  __shared__ volatile magnitude red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  //__syncthreads(); if(threadIdx.x<512) {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256)  {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128)  {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64)   {sum+=red[threadIdx.x+64];  red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32)   {sum+=red[threadIdx.x+32];  red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
                   if(threadIdx.x<16)   {sum+=red[threadIdx.x+16];  red[threadIdx.x]=sum;}
                   if(threadIdx.x<8)    {sum+=red[threadIdx.x+8];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<4)    {sum+=red[threadIdx.x+4];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<2)    {sum+=red[threadIdx.x+2];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<1)    {sum+=red[threadIdx.x+1];}

  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}

template<typename Scalar>
__global__ void dot_final_reduce_kernel(Scalar *d) {
  const int BLOCK_SIZE=1024;
  Scalar sum=d[threadIdx.x];
  __shared__ volatile Scalar red[BLOCK_SIZE];

  red[threadIdx.x]=sum;
  __syncthreads(); if(threadIdx.x<512)  {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256)  {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128)  {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64)   {sum+=red[threadIdx.x+64];  red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32)   {sum+=red[threadIdx.x+32];  red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
                   if(threadIdx.x<16)   {sum+=red[threadIdx.x+16];  red[threadIdx.x]=sum;}
                   if(threadIdx.x<8)    {sum+=red[threadIdx.x+8];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<4)    {sum+=red[threadIdx.x+4];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<2)    {sum+=red[threadIdx.x+2];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<1)    {sum+=red[threadIdx.x+1];}

  //save final dot product at the front
                   if(threadIdx.x==0) d[0]=sum;
}
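For context, on Volta and later GPUs independent thread scheduling means the threads of a warp are no longer guaranteed to execute in lockstep, so the warp-synchronous tail above can read stale values even with the volatile qualifier. Below is a minimal sketch of one possible fix that inserts explicit __syncwarp() calls between the last five steps; the kernel name dot_kernel_syncwarp is illustrative, and the Vector/TypeTraits types are the same miniFE types used above:

template<typename Vector>
__global__ void dot_kernel_syncwarp(const Vector x, const Vector y, typename TypeTraits<typename Vector::ScalarType>::magnitude_type *d) {

  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
  const int BLOCK_SIZE=512;

  magnitude sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<x.n;idx+=gridDim.x*blockDim.x) {
    sum+=x.coefs[idx]*y.coefs[idx];
  }

  //Shared memory reduction, as above, down to the last warp
  __shared__ volatile magnitude red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  __syncthreads(); if(threadIdx.x<256)  {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128)  {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64)   {sum+=red[threadIdx.x+64];  red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32)   {sum+=red[threadIdx.x+32];  red[threadIdx.x]=sum;}
  //Within the last warp, re-converge the lanes and order the shared-memory
  //accesses explicitly instead of relying on implicit warp-synchronous execution
  __syncwarp();    if(threadIdx.x<16)   {sum+=red[threadIdx.x+16];  red[threadIdx.x]=sum;}
  __syncwarp();    if(threadIdx.x<8)    {sum+=red[threadIdx.x+8];   red[threadIdx.x]=sum;}
  __syncwarp();    if(threadIdx.x<4)    {sum+=red[threadIdx.x+4];   red[threadIdx.x]=sum;}
  __syncwarp();    if(threadIdx.x<2)    {sum+=red[threadIdx.x+2];   red[threadIdx.x]=sum;}
  __syncwarp();    if(threadIdx.x<1)    {sum+=red[threadIdx.x+1];}

  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}

The standalone program below is a reproducer for the same pattern using plain int arrays and 256-thread blocks.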
#define BLOCK_SIZE  256

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

__global__ void dot_kernel(const int n, const int* x, const int* y, int *d) {

  int sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<n;idx+=gridDim.x*blockDim.x) {
    sum+=x[idx]*y[idx];
  }

  //Do a shared memory reduction on the dot product
  __shared__ int red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  #pragma unroll
  for (int s = 128; s > 0; s = s/2) {   // incorrect results when __syncthreads() is omitted within a warp
     __syncthreads();
     if(threadIdx.x<s)  {sum+=red[threadIdx.x+s]; red[threadIdx.x]=sum;}
  }

  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}

__global__ void final(int *d) {
  int sum=d[threadIdx.x];
  __shared__ int red[BLOCK_SIZE];

  red[threadIdx.x]=sum;
  #pragma unroll
  for (int n = 128; n > 0; n = n/2) {    
     __syncthreads();
     if(threadIdx.x<n)  {sum+=red[threadIdx.x+n]; red[threadIdx.x]=sum;}
  }
  //save final dot product at the front
  if(threadIdx.x==0) d[0]=sum;
}

#define LEN 1025
int main() {
  int a[LEN];
  int b[LEN];
  srand(2);
  int sum = 0;
  int d_sum = 0;

// sum on the host
  for (int i = 0; i < LEN; i++) {
    a[i] = rand() % 3;
    b[i] = rand() % 3;
    sum += a[i]*b[i];
  }

// sum on the device
  int *da, *db;
  int *dr;
  const int n = LEN;
  cudaMalloc((void**)&da, sizeof(int)*LEN);
  cudaMalloc((void**)&db, sizeof(int)*LEN);
  cudaMalloc((void**)&dr, sizeof(int)*256);
  // zero the partial sums: only (n+255)/256 blocks write dr, but final() reads all 256 entries
  cudaMemset(dr, 0, sizeof(int)*256);
  cudaMemcpy(da, a, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  cudaMemcpy(db, b, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  dot_kernel<<<(n+255)/256, 256>>>(n, da, db, dr);
  final<<<1, 256>>>(dr);
  cudaMemcpy(&d_sum, dr, sizeof(int), cudaMemcpyDeviceToHost);
  printf("%d %d\n", sum, d_sum);
  cudaFree(da);
  cudaFree(db);
  cudaFree(dr);
  return 0;
}
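An alternative that sidesteps the shared-memory warp-synchronous tail entirely is a warp shuffle reduction. The sketch below (the kernel name dot_kernel_shfl and the warp_sums buffer are illustrative, not part of the reproducer) assumes 256-thread blocks like the program above; __shfl_down_sync requires CUDA 9 or later.

__global__ void dot_kernel_shfl(const int n, const int* x, const int* y, int *d) {

  int sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<n;idx+=gridDim.x*blockDim.x) {
    sum+=x[idx]*y[idx];
  }

  //Reduce within each warp using register shuffles (no shared memory, no races)
  for (int offset = 16; offset > 0; offset >>= 1)
    sum += __shfl_down_sync(0xffffffff, sum, offset);

  //One partial sum per warp goes to shared memory
  __shared__ int warp_sums[BLOCK_SIZE/32];
  const int lane = threadIdx.x % 32;
  const int warp = threadIdx.x / 32;
  if (lane == 0) warp_sums[warp] = sum;
  __syncthreads();

  //The first warp reduces the per-warp sums
  if (warp == 0) {
    sum = (lane < BLOCK_SIZE/32) ? warp_sums[lane] : 0;
    for (int offset = 16; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(0xffffffff, sum, offset);
    if (lane == 0) d[blockIdx.x] = sum;
  }
}

The final reduction over the per-block partials could keep the existing final() kernel unchanged.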
