From 5271d77d31c8fa799285b826124235ea185c989d Mon Sep 17 00:00:00 2001 From: Moritz Lehmann Date: Mon, 25 Apr 2022 08:49:32 +0200 Subject: [PATCH] Added Memory host<->device transfer functions for 1D/2D/3D grid domains --- README.md | 4 +-- src/opencl.hpp | 78 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5e47ca0..a10703f 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Works in Windows, Linux and Android with C++17. - easy option to generate PTX assembly and save that in a `.ptx` file 2. create a `Memory` object with 1 line - one object for both host and device memory - - easy host <-> device memory transfer + - easy host <-> device memory transfer (also for 1D/2D/3D grid domains) - easy handling of multi-dimensional vectors - can also be used to only allocate memory on host or only allocate memory on device - automatically tracks total global memory usage of device when allocating/deleting memory @@ -136,7 +136,7 @@ int main() { const float intel = (float)(contains(to_lower(vendor), "intel"))*(is_gpu?8.0f:0.5f); // Intel integrated GPUs usually have 8 cores/CU, Intel CPUs (with HT) have 1/2 core/CU const float arm = (float)(contains(to_lower(vendor), "arm"))*(is_gpu?8.0f:1.0f); // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU const uint cores = to_uint((float)compute_units*(nvidia+amd+intel+arm)); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading) - const float tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s + const float tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device FP32 floating point performance in TeraFLOPs/s if(tflops>best_value) { best_value = tflops; best_i = i; diff --git a/src/opencl.hpp b/src/opencl.hpp index 8076181..5eb94cb 100644 --- a/src/opencl.hpp +++ b/src/opencl.hpp @@ -24,7 +24,7 @@ struct Device_Info { bool is_cpu=false, is_gpu=false; uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u; uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading) - float tflops=0.0f; // estimated device floating point performance in TeraFLOPs/s + float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s inline Device_Info(const cl::Device& cl_device) { this->cl_device = cl_device; // see https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html name = trim(cl_device.getInfo()); // device name @@ -381,6 +381,82 @@ template class Memory { if(safe_length>0ull) cl_queue.enqueueWriteBuffer(device_buffer, blocking, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset)); } } + inline void read_from_device_1d(const ulong x0, const ulong x1, const int dimension=-1, const bool blocking=true) { // read 1D domain from device, either for all vector dimensions (-1) or for a specified dimension + if(host_buffer_exists&&device_buffer_exists) { + const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u; + for(uint i=i0; i0ull) cl_queue.enqueueReadBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset)); + } + if(blocking) cl_queue.finish(); + } + } + inline void write_to_device_1d(const ulong x0, const ulong x1, const int dimension=-1, const bool blocking=true) { // write 1D domain to device, either for all vector dimensions (-1) or for a specified dimension + if(host_buffer_exists&&device_buffer_exists) { + const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u; + for(uint i=i0; i0ull) cl_queue.enqueueWriteBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset)); + } + if(blocking) cl_queue.finish(); + } + } + inline void read_from_device_2d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong Nx, const ulong Ny, const int dimension=-1, const bool blocking=true) { // read 2D domain from device, either for all vector dimensions (-1) or for a specified dimension + if(host_buffer_exists&&device_buffer_exists) { + for(uint y=y0; y0ull) cl_queue.enqueueReadBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset)); + } + } + if(blocking) cl_queue.finish(); + } + } + inline void write_to_device_2d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong Nx, const ulong Ny, const int dimension=-1, const bool blocking=true) { // write 2D domain to device, either for all vector dimensions (-1) or for a specified dimension + if(host_buffer_exists&&device_buffer_exists) { + for(uint y=y0; y0ull) cl_queue.enqueueWriteBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset)); + } + } + if(blocking) cl_queue.finish(); + } + } + inline void read_from_device_3d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong z0, const ulong z1, const ulong Nx, const ulong Ny, const ulong Nz, const int dimension=-1, const bool blocking=true) { // read 3D domain from device, either for all vector dimensions (-1) or for a specified dimension + if(host_buffer_exists&&device_buffer_exists) { + for(uint z=z0; z0ull) cl_queue.enqueueReadBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset)); + } + } + } + if(blocking) cl_queue.finish(); + } + } + inline void write_to_device_3d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong z0, const ulong z1, const ulong Nx, const ulong Ny, const ulong Nz, const int dimension=-1, const bool blocking=true) { // write 3D domain to device, either for all vector dimensions (-1) or for a specified dimension + if(host_buffer_exists&&device_buffer_exists) { + for(uint z=z0; z0ull) cl_queue.enqueueWriteBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset)); + } + } + } + if(blocking) cl_queue.finish(); + } + } inline void finish() { cl_queue.finish(); }