Skip to content

Commit

Permalink
Fixed primitieves
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander Khokhlov committed Jul 26, 2018
1 parent 18ed668 commit 0144348
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 126 deletions.
105 changes: 10 additions & 95 deletions CLW/CL/CLW.cl
Original file line number Diff line number Diff line change
Expand Up @@ -1545,41 +1545,11 @@ inline void atomic_min_int(volatile __global int* addr, int value)
atomic_min(addr, value);
}

// --------------------- HELPERS ------------------------

#define DEFINE_ASSIGN_OPERATOR(type)\
inline void assign_##type(__local type* addr, type value)\
{\
*addr = value;\
}

inline void assign_float3(__local float3* addr, float3 value)
{
(*addr).xyz = value.xyz;
}

inline int divide_int(int dividend, int divider)
{
return dividend / (divider != 0 ? divider : 1);
}

inline float divide_float(float dividend, float divider)
{
return dividend / (fabs(divider) > epsilon ? divider : 1.f);
}

inline float3 divide_float3(float3 dividend, float3 divider)
{
return (float3)(divide_float(dividend.x, divider.x),
divide_float(dividend.y, divider.y),
divide_float(dividend.z, divider.z));
}

// --------------------- REDUCTION ------------------------

#define DEFINE_REDUCTION(bin_op, type)\
__kernel void reduction_##bin_op##_##type(__global type* buffer,\
int buf_count,\
int count,\
__local type* shared_mem,\
__global type* out)\
{\
Expand All @@ -1588,27 +1558,21 @@ __kernel void reduction_##bin_op##_##type(__global type* buffer,\
int local_id = get_local_id(0);\
int group_size = get_local_size(0);\
\
if (global_id < buf_count)\
if (global_id < count)\
{\
assign_##type(shared_mem + local_id, buffer[global_id]);\
*(shared_mem + local_id) = buffer[global_id];\
}\
else\
{\
assign_##type(shared_mem + local_id, neutral_##bin_op##_##type);\
}\
\
if (global_id == 0)\
{\
*out = neutral_##bin_op##_##type;\
*(shared_mem + local_id) = neutral_##bin_op##_##type;\
}\
\
barrier(CLK_LOCAL_MEM_FENCE);\
for (int i = group_size / 2; i > 0; i >>= 1)\
{\
if (local_id < i)\
{\
assign_##type(shared_mem + local_id,\
bin_op(shared_mem[local_id], shared_mem[local_id + i]));\
*(shared_mem + local_id) = bin_op(shared_mem[local_id], shared_mem[local_id + i]);\
}\
barrier(CLK_LOCAL_MEM_FENCE);\
}\
Expand All @@ -1624,60 +1588,14 @@ __kernel void reduction_##bin_op##_##type(__global type* buffer,\
#define DEFINE_BUFFER_NORMALIZATION(type)\
__kernel void buffer_normalization_##type(__global type* input,\
__global type* output,\
int buffer_count,\
__local type* shared_mem,\
__global type* auxiliary_buf)\
int count,\
type max,\
type min)\
{\
int global_id = get_global_id(0);\
int group_id = get_group_id(0);\
int local_id = get_local_id(0);\
int group_size = get_local_size(0);\
\
__local type* min_buffer = shared_mem;\
__local type* max_buffer = shared_mem + group_size;\
\
if (global_id < buffer_count)\
if (global_id < count)\
{\
min_buffer[local_id] = input[global_id];\
max_buffer[local_id] = input[global_id];\
}\
else\
{\
min_buffer[local_id] = neutral_min_##type;\
max_buffer[local_id] = neutral_max_##type;\
}\
\
if (global_id == 0)\
{\
auxiliary_buf[0] = neutral_min_##type;\
auxiliary_buf[1] = neutral_max_##type;\
}\
\
barrier(CLK_LOCAL_MEM_FENCE);\
\
for (int i = group_size / 2; i > 0; i >>= 1)\
{\
if (local_id < i)\
{\
assign_##type(min_buffer + local_id,\
min(min_buffer[local_id], min_buffer[local_id + i]));\
assign_##type(max_buffer + local_id,\
max(max_buffer[local_id], max_buffer[local_id + i]));\
}\
barrier(CLK_LOCAL_MEM_FENCE);\
}\
\
if (local_id == 0)\
{\
atomic_min_##type(auxiliary_buf, min_buffer[0]);\
atomic_max_##type(auxiliary_buf + 1, max_buffer[0]);\
}\
\
barrier(CLK_LOCAL_MEM_FENCE);\
type diff = auxiliary_buf[1] - auxiliary_buf[0];\
if (global_id < buffer_count)\
{\
output[global_id] = divide_##type(input[global_id], diff);\
output[global_id] = input[global_id] / (max - min);\
}\
}

Expand All @@ -1687,9 +1605,6 @@ DEFINE_ATOMIC(max)
DEFINE_ATOMIC_FLOAT3(min)
DEFINE_ATOMIC_FLOAT3(max)

DEFINE_ASSIGN_OPERATOR(int)
DEFINE_ASSIGN_OPERATOR(float)

DEFINE_REDUCTION(min, int)
DEFINE_REDUCTION(min, float)
DEFINE_REDUCTION(min, float3)
Expand Down
173 changes: 144 additions & 29 deletions CLW/CLWParallelPrimitives.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ THE SOFTWARE.
#define NUM_SEG_SCAN_ELEMS_PER_WI 1
#define NUM_SCAN_ELEMS_PER_WG (WG_SIZE * NUM_SCAN_ELEMS_PER_WI)
#define NUM_SEG_SCAN_ELEMS_PER_WG (WG_SIZE * NUM_SEG_SCAN_ELEMS_PER_WI)
#define NORMALIZATION_CACHE (2)

CLWParallelPrimitives::CLWParallelPrimitives(CLWContext context, char const* buildopts)
: context_(context)
Expand Down Expand Up @@ -921,65 +920,181 @@ void CLWParallelPrimitives::ReclaimTempBuffer(std::map<size_t, CLWBuffer<T> > co
collection[buffer.GetElementCount()] = buffer;
}

CLWEvent CLWParallelPrimitives::Copy(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
{
int ELEMS_PER_WI = 4;
int GROUP_BLOCK_SIZE = (WG_SIZE * ELEMS_PER_WI);
int NUM_BLOCKS = (numElems + GROUP_BLOCK_SIZE - 1) / GROUP_BLOCK_SIZE;

CLWKernel copyKernel = program_.GetKernel("copy");

copyKernel.SetArg(0, input);
copyKernel.SetArg(1, numElems);
copyKernel.SetArg(2, output);

return context_.Launch1D(0, NUM_BLOCKS * WG_SIZE, WG_SIZE, copyKernel);
}

const float epsilon = 0.001f;

template <class T>
CLWEvent CLWParallelPrimitives::Normalize(const char* kernelName, unsigned int deviceIdx, CLWBuffer<T> input, CLWBuffer<T> output, int numElems, int groupSize, CLWBuffer<T> cache)
T CLWParallelPrimitives::GetMaxNum()
{
return std::numeric_limits<T>::max();
}


template <>
cl_float3 CLWParallelPrimitives::GetMaxNum<cl_float3>()
{
cl_float3 val;

val.s[0] = std::numeric_limits<float>::max();
val.s[1] = std::numeric_limits<float>::max();
val.s[2] = std::numeric_limits<float>::max();

return val;
}

template <class T>
T CLWParallelPrimitives::GetMinNum()
{
return std::numeric_limits<T>::min();
}

template <>
cl_float3 CLWParallelPrimitives::GetMinNum<cl_float3>()
{
cl_float3 val;

val.s[0] = std::numeric_limits<float>::min();
val.s[1] = std::numeric_limits<float>::min();
val.s[2] = std::numeric_limits<float>::min();

return val;
}

template <class T>
CLWEvent CLWParallelPrimitives::Reduction(const char* kernelName,
unsigned int deviceIdx,
CLWBuffer<T> input,
int numElems,
CLWBuffer<T> out)
{
assert(groupSize);
assert(input.GetElementCount() >= numElems);
assert(output.GetElementCount() >= numElems);

int ELEMS_PER_WI = 4;
int GROUP_BLOCK_SIZE = (groupSize * ELEMS_PER_WI);
int NUM_BLOCKS = (numElems + GROUP_BLOCK_SIZE - 1) / GROUP_BLOCK_SIZE;
int NUM_BLOCKS = (int)((numElems + WG_SIZE - 1) / WG_SIZE);

CLWKernel normalizeKernel = program_.GetKernel(kernelName);
CLWKernel reductionKernel = program_.GetKernel(kernelName);

int argc = 0;

normalizeKernel.SetArg(argc++, input);
normalizeKernel.SetArg(argc++, output);
normalizeKernel.SetArg(argc++, numElems);
normalizeKernel.SetArg(argc++, SharedMemory(2 * sizeof(T) * groupSize));
normalizeKernel.SetArg(argc++, cache);
reductionKernel.SetArg(argc++, input);
reductionKernel.SetArg(argc++, numElems);
reductionKernel.SetArg(argc++, SharedMemory(sizeof(T) * WG_SIZE));
reductionKernel.SetArg(argc++, out);

return context_.Launch1D(deviceIdx, NUM_BLOCKS * groupSize, groupSize, normalizeKernel);
return context_.Launch1D(deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, reductionKernel);
}

CLWEvent CLWParallelPrimitives::Copy(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
template <class T>
CLWEvent CLWParallelPrimitives::Normalize(const char* normalizeKernelName,
const char* minReductionKernelName,
const char* maxReductionKernelName,
unsigned int deviceIdx,
CLWBuffer<T> input,
CLWBuffer<T> output,
int numElems,
CLWBuffer<T> cache)
{
int ELEMS_PER_WI = 4;
int GROUP_BLOCK_SIZE = (WG_SIZE * ELEMS_PER_WI);
int NUM_BLOCKS = (numElems + GROUP_BLOCK_SIZE - 1) / GROUP_BLOCK_SIZE;
assert(input.GetElementCount() >= numElems);
assert(output.GetElementCount() >= numElems);

CLWKernel copyKernel = program_.GetKernel("copy");
int NUM_BLOCKS = (int)((numElems + WG_SIZE - 1) / WG_SIZE);

copyKernel.SetArg(0, input);
copyKernel.SetArg(1, numElems);
copyKernel.SetArg(2, output);
T min = GetMaxNum<T>();
T max = GetMinNum<T>();

return context_.Launch1D(0, NUM_BLOCKS * WG_SIZE, WG_SIZE, copyKernel);
context_.WriteBuffer<T>(deviceIdx, cache, &min, 1);

Reduction(minReductionKernelName,
0,
input,
numElems,
cache).Wait();

context_.ReadBuffer<T>(deviceIdx, cache, &min, 1).Wait();
context_.WriteBuffer<T>(deviceIdx, cache, &max, 1).Wait();

Reduction(maxReductionKernelName,
0,
input,
numElems,
cache).Wait();

context_.ReadBuffer<T>(deviceIdx, cache, &max, 1).Wait();

// launch normalization kernel
CLWKernel normalizeKernel = program_.GetKernel(normalizeKernelName);

int argc = 0;

normalizeKernel.SetArg(argc++, input);
normalizeKernel.SetArg(argc++, output);
normalizeKernel.SetArg(argc++, numElems);
normalizeKernel.SetArg(argc++, max);
normalizeKernel.SetArg(argc++, min);

return context_.Launch1D(deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, normalizeKernel);
}

CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
{
CLWBuffer<cl_int> cache = GetTempIntBuffer(NORMALIZATION_CACHE);
CLWEvent event = Normalize<cl_int>("buffer_normalization_int", deviceIdx, input, output, numElems, WG_SIZE, cache);
CLWBuffer<cl_int> cache = GetTempIntBuffer(1);

CLWEvent event = Normalize("buffer_normalization_int",
"reduction_min_int",
"reduction_max_int",
deviceIdx,
input,
output,
numElems,
cache);

ReclaimTempIntBuffer(cache);
return event;
}

CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float> input, CLWBuffer<cl_float> output, int numElems)
{
CLWBuffer<cl_float> cache = GetTempFloatBuffer(NORMALIZATION_CACHE);
CLWEvent event = Normalize<cl_float>("buffer_normalization_float", deviceIdx, input, output, numElems, WG_SIZE, cache);
CLWBuffer<cl_float> cache = GetTempFloatBuffer(1);

CLWEvent event = Normalize("buffer_normalization_float",
"reduction_min_float",
"reduction_max_float",
deviceIdx,
input,
output,
numElems,
cache);

ReclaimTempFloatBuffer(cache);
return event;
}

CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float3> input, CLWBuffer<cl_float3> output, int numElems)
{
CLWBuffer<cl_float3> cache = GetTempBuffer<cl_float3>(float3_BufferCache_, NORMALIZATION_CACHE);
CLWEvent event = Normalize<cl_float3>("buffer_normalization_float3", deviceIdx, input, output, numElems, WG_SIZE, cache);
CLWBuffer<cl_float3> cache = GetTempBuffer<cl_float3>(float3_BufferCache_, 1);

CLWEvent event = Normalize("buffer_normalization_float3",
"reduction_min_float3",
"reduction_max_float3",
deviceIdx,
input,
output,
numElems,
cache);

ReclaimTempBuffer(float3_BufferCache_, cache);
return event;
}
18 changes: 16 additions & 2 deletions CLW/CLWParallelPrimitives.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,26 @@ class CLWParallelPrimitives
void ReclaimTempBuffer(std::map<size_t, CLWBuffer<T>> collection, CLWBuffer<T> buffer);

template <class T>
CLWEvent Normalize(const char* kernelName,
CLWEvent Reduction(const char* kernelName,
unsigned int deviceIdx,
CLWBuffer<T> input,
int numElems,
CLWBuffer<T> out);

template <class T>
T GetMaxNum();

template <class T>
T GetMinNum();

template <class T>
CLWEvent Normalize(const char* normalizeKernelName,
const char* minReductionKernelName,
const char* maxReductionKernelName,
unsigned int deviceIdx,
CLWBuffer<T> input,
CLWBuffer<T> output,
int numElems,
int groupSize,
CLWBuffer<T> cache);

CLWContext context_;
Expand Down

0 comments on commit 0144348

Please sign in to comment.