Skip to content

cudev: Add __shfl_down implementation for long long and unsigned long long on devices of CC < 7.0 #3963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: 4.x
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,12 +334,28 @@ __device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warp

/**
 * Shuffle-down overload for a signed 64-bit value.
 *
 * When compiling device code with __CUDA_ARCH__ below 700, the value is
 * split into its low and high 32-bit halves, each half is passed through the
 * 32-bit __shfl_down with the same delta and width, and the two results are
 * recombined. In every other case the call is forwarded to __shfl_down
 * unchanged.
 *
 * NOTE(review): the split path presumably exists because a 64-bit
 * __shfl_down is not usable on those targets; confirm against the CUDA
 * toolkit versions this header supports.
 *
 * @param val   value contributed by the calling lane
 * @param delta lane offset forwarded to __shfl_down
 * @param width segment width forwarded to __shfl_down (defaults to warpSize)
 * @return      the 64-bit value produced by the shuffle
 */
__device__ __forceinline__ signed long long shfl_down(signed long long val, uint delta, int width = warpSize)
{
#if !defined __CUDA_ARCH__ || __CUDA_ARCH__ >= 700
    return __shfl_down(val, delta, width);
#else
    const unsigned long long bits = static_cast<unsigned long long>(val);

    // Low half first, then high half: same order as two independent 32-bit shuffles.
    int lo = static_cast<int>(static_cast<unsigned int>(bits));
    int hi = static_cast<int>(static_cast<unsigned int>(bits >> 32));

    lo = __shfl_down(lo, delta, width);
    hi = __shfl_down(hi, delta, width);

    // Go through unsigned so the low half is not sign-extended into the high half.
    const unsigned long long merged =
        (static_cast<unsigned long long>(static_cast<unsigned int>(hi)) << 32) |
        static_cast<unsigned long long>(static_cast<unsigned int>(lo));

    return static_cast<signed long long>(merged);
#endif
}

/**
 * Shuffle-down overload for an unsigned 64-bit value.
 *
 * When compiling device code with __CUDA_ARCH__ below 700, the value is
 * viewed as two 32-bit words through a union with uint2; each word is passed
 * through the 32-bit __shfl_down (as int) with the same delta and width, and
 * the reassembled 64-bit value is returned. In every other case the call is
 * forwarded to __shfl_down unchanged.
 *
 * The previous unconditional `return (unsigned long long) __shfl_down(...)`
 * at the top of the body has been removed: it made both preprocessor
 * branches below unreachable, so the split fallback never took effect.
 *
 * @param val   value contributed by the calling lane
 * @param delta lane offset forwarded to __shfl_down
 * @param width segment width forwarded to __shfl_down (defaults to warpSize)
 * @return      the 64-bit value produced by the shuffle
 */
__device__ __forceinline__ unsigned long long shfl_down(unsigned long long val, uint delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ < 700
    union { unsigned long long ull; uint2 u2; } u;
    u.ull = val;
    // Shuffle each 32-bit word separately; the int cast selects the 32-bit overload.
    u.u2.x = __shfl_down(static_cast<int>(u.u2.x), delta, width);
    u.u2.y = __shfl_down(static_cast<int>(u.u2.y), delta, width);
    return u.ull;
#else
    return __shfl_down(val, delta, width);
#endif
}

__device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize)
Expand Down