|  | 
| 9 | 9 | // NTX must be >=2 so r is filled properly | 
| 10 | 10 | template <typename T, Int NTX, Int NTY> | 
| 11 | 11 | __global__ void AveragePooling_fp(T *input_features, T *output_features, | 
| 12 |  | -				  Int nPlanes, Int input_stride, | 
| 13 |  | -				  Int output_stride, Int *rules, Int nHot, | 
| 14 |  | -				  T alpha) { | 
|  | 12 | +                                  Int nPlanes, Int input_stride, | 
|  | 13 | +                                  Int output_stride, Int *rules, Int nHot, | 
|  | 14 | +                                  T alpha) { | 
| 15 | 15 |   __shared__ Int r[NTY * 2]; | 
| 16 | 16 |   for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { | 
| 17 | 17 |     { | 
| 18 | 18 |       Int i = threadIdx.x + NTX * threadIdx.y; | 
| 19 | 19 |       if (i < NTY * 2 and i < 2 * (nHot - n)) | 
| 20 |  | -	r[i] = rules[2 * n + i]; | 
|  | 20 | +        r[i] = rules[2 * n + i]; | 
| 21 | 21 |     } | 
| 22 | 22 |     __syncthreads(); | 
| 23 | 23 |     if (n + threadIdx.y < nHot) { | 
| 24 | 24 |       Int i = r[2 * threadIdx.y] * input_stride; | 
| 25 | 25 |       Int o = r[2 * threadIdx.y + 1] * output_stride; | 
| 26 | 26 |       for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) | 
| 27 |  | -	output_features[o + plane]+= alpha * input_features[i + plane]; | 
| 28 |  | -	// atomicAdd(&output_features[o + plane], | 
| 29 |  | -	//           alpha * input_features[i + plane]); | 
|  | 27 | +        output_features[o + plane] += alpha * input_features[i + plane]; | 
|  | 28 | +      // atomicAdd(&output_features[o + plane], | 
|  | 29 | +      //           alpha * input_features[i + plane]); | 
| 30 | 30 |     } | 
| 31 | 31 |     __syncthreads(); | 
| 32 | 32 |   } | 
| 33 | 33 | } | 
| 34 | 34 | 
 | 
| 35 | 35 | template <typename T> | 
| 36 | 36 | void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features, | 
| 37 |  | -				     Int nPlanes, Int input_stride, | 
| 38 |  | -				     Int output_stride, RuleBook _rules, | 
| 39 |  | -				     Int filterVolume) { | 
| 40 |  | -  RULEBOOKITERATOR((AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>( | 
| 41 |  | -      input_features, output_features, nPlanes, input_stride, output_stride, | 
| 42 |  | -      rbB, nHotB, 1.0 / filterVolume)); | 
| 43 |  | -		   , ) | 
|  | 37 | +                                     Int nPlanes, Int input_stride, | 
|  | 38 | +                                     Int output_stride, RuleBook _rules, | 
|  | 39 | +                                     Int filterVolume) { | 
|  | 40 | +  auto application = [&](Int *rbB, Int nHotB, cudaStream_t &stream) -> void { | 
|  | 41 | +    AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>( | 
|  | 42 | +        input_features, output_features, nPlanes, input_stride, output_stride, | 
|  | 43 | +        rbB, nHotB, 1.0 / filterVolume); | 
|  | 44 | +  }; | 
|  | 45 | + | 
|  | 46 | +  iterateRuleBookSeq(_rules, application); | 
| 44 | 47 | } | 
| 45 | 48 | template <typename T, Int NTX, Int NTY> | 
| 46 | 49 | __global__ void AveragePooling_bp(T *d_input_features, T *d_output_features, | 
| 47 |  | -				  Int nPlanes, Int input_stride, | 
| 48 |  | -				  Int output_stride, Int *rules, Int nHot, | 
| 49 |  | -				  T alpha) { | 
|  | 50 | +                                  Int nPlanes, Int input_stride, | 
|  | 51 | +                                  Int output_stride, Int *rules, Int nHot, | 
|  | 52 | +                                  T alpha) { | 
| 50 | 53 |   __shared__ Int r[NTY * 2]; | 
| 51 | 54 |   for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { | 
| 52 | 55 |     { | 
| 53 | 56 |       Int i = threadIdx.x + NTX * threadIdx.y; | 
| 54 | 57 |       if (i < NTY * 2 and i < 2 * (nHot - n)) | 
| 55 |  | -	r[i] = rules[2 * n + i]; | 
|  | 58 | +        r[i] = rules[2 * n + i]; | 
| 56 | 59 |     } | 
| 57 | 60 |     __syncthreads(); | 
| 58 | 61 |     if (n + threadIdx.y < nHot) { | 
| 59 | 62 |       Int i = r[2 * threadIdx.y] * input_stride; | 
| 60 | 63 |       Int o = r[2 * threadIdx.y + 1] * output_stride; | 
| 61 | 64 |       for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) | 
| 62 |  | -	d_input_features[i + plane] += alpha * d_output_features[o + plane]; | 
|  | 65 | +        d_input_features[i + plane] += alpha * d_output_features[o + plane]; | 
| 63 | 66 |     } | 
| 64 | 67 |     __syncthreads(); | 
| 65 | 68 |   } | 
| 66 | 69 | } | 
| 67 | 70 | 
 | 
| 68 | 71 | template <typename T> | 
| 69 | 72 | void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features, | 
| 70 |  | -				      Int nPlanes, Int input_stride, | 
| 71 |  | -				      Int output_stride, RuleBook _rules, | 
| 72 |  | -				      Int filterVolume) { | 
| 73 |  | -  RULEBOOKITERATOR((AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>( | 
| 74 |  | -      d_input_features, d_output_features, nPlanes, input_stride, output_stride, | 
| 75 |  | -      rbB, nHotB, 1.0 / filterVolume)); | 
| 76 |  | -		   , ) | 
| 77 |  | -} | 
| 78 |  | - | 
| 79 |  | - | 
| 80 |  | - | 
| 81 |  | - | 
| 82 |  | - | 
| 83 |  | - | 
| 84 |  | - | 
| 85 |  | - | 
| 86 |  | - | 
|  | 73 | +                                      Int nPlanes, Int input_stride, | 
|  | 74 | +                                      Int output_stride, RuleBook _rules, | 
|  | 75 | +                                      Int filterVolume) { | 
| 87 | 76 | 
 | 
|  | 77 | +  auto application = [&](Int *rbB, Int nHotB, cudaStream_t &stream) -> void { | 
|  | 78 | +    AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>( | 
|  | 79 | +      d_input_features, d_output_features, nPlanes, input_stride, output_stride, | 
|  | 80 | +      rbB, nHotB, 1.0 / filterVolume); | 
|  | 81 | +  }; | 
| 88 | 82 | 
 | 
|  | 83 | +  iterateRuleBookSeq(_rules, application); | 
|  | 84 | +} | 
| 89 | 85 | 
 | 
| 90 | 86 | // NTX must be >=2 so r is filled properly | 
| 91 | 87 | template <typename T, Int NTX, Int NTY> | 
| 92 |  | -__global__ void CopyFeaturesHelper_fp(T *input_features, T *output_features, Int * rules, | 
| 93 |  | -				  Int nPlanes,  Int nHot) { | 
|  | 88 | +__global__ void CopyFeaturesHelper_fp(T *input_features, T *output_features, | 
|  | 89 | +                                      Int *rules, Int nPlanes, Int nHot) { | 
| 94 | 90 |   __shared__ Int r[NTY * 2]; | 
| 95 | 91 |   for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { | 
| 96 | 92 |     { | 
| 97 | 93 |       Int i = threadIdx.x + NTX * threadIdx.y; | 
| 98 | 94 |       if (i < NTY * 2 and i < 2 * (nHot - n)) | 
| 99 |  | -	r[i] = rules[2 * n + i]; | 
|  | 95 | +        r[i] = rules[2 * n + i]; | 
| 100 | 96 |     } | 
| 101 | 97 |     __syncthreads(); | 
| 102 | 98 |     if (n + threadIdx.y < nHot) { | 
| 103 |  | -      Int i = r[2 * threadIdx.y+1] * nPlanes; | 
| 104 |  | -      Int o = r[2 * threadIdx.y ] * nPlanes; | 
|  | 99 | +      Int i = r[2 * threadIdx.y + 1] * nPlanes; | 
|  | 100 | +      Int o = r[2 * threadIdx.y] * nPlanes; | 
| 105 | 101 |       for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) | 
| 106 |  | -	output_features[o + plane]= input_features[i + plane]; | 
|  | 102 | +        output_features[o + plane] = input_features[i + plane]; | 
| 107 | 103 |     } | 
| 108 | 104 |     __syncthreads(); | 
| 109 | 105 |   } | 
| 110 | 106 | } | 
| 111 | 107 | 
 | 
| 112 | 108 | template <typename T> | 
| 113 |  | -void cuda_CopyFeaturesHelper_ForwardPass(T *input_features, T *output_features, Int* rules, | 
| 114 |  | -				     Int nPlanes, Int nHot) { | 
| 115 |  | -CopyFeaturesHelper_fp<T, 32, 32><<<32, dim3(32, 32)>>>( | 
| 116 |  | -      input_features, output_features, rules, nPlanes, | 
| 117 |  | -     nHot); | 
|  | 109 | +void cuda_CopyFeaturesHelper_ForwardPass(T *input_features, T *output_features, | 
|  | 110 | +                                         Int *rules, Int nPlanes, Int nHot) { | 
|  | 111 | +  CopyFeaturesHelper_fp<T, 32, 32><<<32, dim3(32, 32)>>>( | 
|  | 112 | +      input_features, output_features, rules, nPlanes, nHot); | 
| 118 | 113 | } | 
| 119 | 114 | template <typename T, Int NTX, Int NTY> | 
| 120 |  | -__global__ void CopyFeaturesHelper_bp(T *d_input_features, T *d_output_features, Int* rules, | 
| 121 |  | -				  Int nPlanes,Int nHot) { | 
|  | 115 | +__global__ void CopyFeaturesHelper_bp(T *d_input_features, T *d_output_features, | 
|  | 116 | +                                      Int *rules, Int nPlanes, Int nHot) { | 
| 122 | 117 |   __shared__ Int r[NTY * 2]; | 
| 123 | 118 |   for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { | 
| 124 | 119 |     { | 
| 125 | 120 |       Int i = threadIdx.x + NTX * threadIdx.y; | 
| 126 | 121 |       if (i < NTY * 2 and i < 2 * (nHot - n)) | 
| 127 |  | -	r[i] = rules[2 * n + i]; | 
|  | 122 | +        r[i] = rules[2 * n + i]; | 
| 128 | 123 |     } | 
| 129 | 124 |     __syncthreads(); | 
| 130 | 125 |     if (n + threadIdx.y < nHot) { | 
| 131 |  | -      Int i = r[2 * threadIdx.y+1] * nPlanes; | 
|  | 126 | +      Int i = r[2 * threadIdx.y + 1] * nPlanes; | 
| 132 | 127 |       Int o = r[2 * threadIdx.y] * nPlanes; | 
| 133 | 128 |       for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) | 
| 134 |  | -	d_input_features[i + plane] = d_output_features[o + plane]; | 
|  | 129 | +        d_input_features[i + plane] = d_output_features[o + plane]; | 
| 135 | 130 |     } | 
| 136 | 131 |     __syncthreads(); | 
| 137 | 132 |   } | 
| 138 | 133 | } | 
| 139 | 134 | 
 | 
| 140 | 135 | template <typename T> | 
| 141 |  | -void cuda_CopyFeaturesHelper_BackwardPass(T *d_input_features, T *d_output_features, | 
| 142 |  | -				      Int* rules, Int nPlanes, Int nHot) { | 
| 143 |  | -CopyFeaturesHelper_bp<T, 32, 32><<<32, dim3(32, 32)>>>( | 
|  | 136 | +void cuda_CopyFeaturesHelper_BackwardPass(T *d_input_features, | 
|  | 137 | +                                          T *d_output_features, Int *rules, | 
|  | 138 | +                                          Int nPlanes, Int nHot) { | 
|  | 139 | +  CopyFeaturesHelper_bp<T, 32, 32><<<32, dim3(32, 32)>>>( | 
| 144 | 140 |       d_input_features, d_output_features, rules, nPlanes, nHot); | 
| 145 | 141 | } | 
0 commit comments