Skip to content

Commit 5a0163c

Browse files
committed
Merge commit '80caca4edbc415abb2f0695fb2565e6b46c410a8'
2 parents 18d6579 + 80caca4 commit 5a0163c

File tree

1 file changed

+9
-9
lines changed

1 file changed

+9
-9
lines changed

torch/lib/THC/THCApply.cuh

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -109,16 +109,16 @@ inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
109109
return false;
110110
}
111111

112-
// Assume a reasonable number of SMs if no state is available
113-
int numSM =
114-
state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15;
115-
116-
// 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
117-
// which seems to be a good sweetspot for latency hiding
118-
grid = dim3(min((long long) THCCeilDiv(totalElements,
119-
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
120-
4LL * numSM));
112+
if(THCState_getCurrentDeviceProperties(state)->major < 3){
113+
grid = dim3(min((long long) THCCeilDiv(totalElements,
114+
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), (long long) 64*1024-1));
115+
return true;
116+
}
117+
118+
grid = dim3((long long) THCCeilDiv(totalElements,
119+
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK) );
121120
return true;
121+
122122
}
123123

124124
template <typename TensorTypeA,

0 commit comments

Comments
 (0)