Skip to content

jaredhoberock/managed_allocator

Folders and files

Name
Last commit message
Last commit date

Latest commit

 

History

9 Commits
 
 
 
 
 
 

Repository files navigation

managed_allocator

A C++ allocator based on cudaMallocManaged().

To create a custom C++ allocator which allocates storage using cudaMallocManaged, we need to make a class and give it .allocate() and .deallocate() functions:

The .allocate() function calls cudaMallocManaged and throws an exception if it fails:

    // Allocates uninitialized storage for n objects of value_type in CUDA
    // managed (unified) memory, which is accessible from both host code and
    // device kernels.
    //
    // Throws thrust::system_error wrapping the CUDA error code if
    // cudaMallocManaged fails (e.g. out of device memory).
    value_type* allocate(size_t n)
    {
      value_type* result = nullptr;
  
      // sizeof(value_type) (rather than sizeof(T)) keeps the element size
      // consistent with the pointer type actually being allocated
      cudaError_t error = cudaMallocManaged(&result, n*sizeof(value_type), cudaMemAttachGlobal);
  
      if(error != cudaSuccess)
      {
        throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::allocate(): cudaMallocManaged");
      }
  
      return result;
    }

The .deallocate() function can just call cudaFree():

    // Releases storage previously obtained from allocate() by calling
    // cudaFree(). The unnamed size parameter is required by the C++
    // Allocator interface but is not needed by cudaFree.
    //
    // NOTE(review): throwing from deallocate() is risky — allocators are
    // commonly invoked from destructors and other noexcept contexts, where
    // a propagating exception terminates the program. Confirm this
    // trade-off is intentional.
    void deallocate(value_type* ptr, size_t)
    {
      cudaError_t error = cudaFree(ptr);
  
      if(error != cudaSuccess)
      {
        throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::deallocate(): cudaFree");
      }
    }

Here's a program which demonstrates some of the different things you can do with it:

#include "managed_allocator.hpp"
#include <thrust/fill.h>
#include <thrust/logical.h>
#include <thrust/execution_policy.h>
#include <vector>
#include <algorithm>
#include <numeric>
#include <cassert>
#include <iostream>

// managed_vector<T> is a std::vector whose storage is obtained through
// managed_allocator<T>, i.e. CUDA managed (unified) memory — so its
// elements can be accessed from host code and from CUDA kernels alike
template<class T>
using managed_vector = std::vector<T, managed_allocator<T>>;

// Adds 1 to each of the first n elements of data.
//
// Expects a 1D launch providing at least n total threads; threads whose
// flat global index is >= n do nothing (guard for the partial last block).
__global__ void increment_kernel(int *data, size_t n)
{
  size_t idx = blockDim.x * blockIdx.x + threadIdx.x;

  if(idx >= n) return;

  data[idx] += 1;
}

// Demonstrates using a managed_vector from host code, from a raw CUDA
// kernel, and with Thrust algorithms on both host and device.
// Returns 0 on success; CUDA failures raise thrust::system_error.
int main()
{
  size_t n = 1 << 20;

  managed_vector<int> vec(n);

  // we can use the vector from the host
  std::iota(vec.begin(), vec.end(), 0);

  // host-only reference copy used to validate every result below
  std::vector<int> ref(n);
  std::iota(ref.begin(), ref.end(), 0);
  assert(std::equal(ref.begin(), ref.end(), vec.begin()));

  // we can also use it in a CUDA kernel
  size_t block_size = 256;
  size_t num_blocks = (n + (block_size - 1)) / block_size; // ceiling division

  increment_kernel<<<num_blocks, block_size>>>(vec.data(), vec.size());

  // a kernel launch does not report errors directly; launch-configuration
  // failures only surface via cudaGetLastError()
  cudaError_t error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    throw thrust::system_error(error, thrust::cuda_category(), "main(): increment_kernel launch");
  }

  // synchronize before touching managed memory from the host again; this
  // also surfaces any asynchronous execution errors from the kernel
  error = cudaDeviceSynchronize();
  if(error != cudaSuccess)
  {
    throw thrust::system_error(error, thrust::cuda_category(), "main(): cudaDeviceSynchronize after kernel");
  }

  std::for_each(ref.begin(), ref.end(), [](int& x)
  {
    x += 1;
  });

  assert(std::equal(ref.begin(), ref.end(), vec.begin()));

  // we can also use it with Thrust algorithms

  // by default, the Thrust algorithm will execute on the host with the managed_vector
  thrust::fill(vec.begin(), vec.end(), 7);
  assert(std::all_of(vec.begin(), vec.end(), [](int x)
  {
    return x == 7;
  }));

  // to execute on the device, use the thrust::device execution policy
  thrust::fill(thrust::device, vec.begin(), vec.end(), 13);

  // we need to synchronize before attempting to use the vector on the host
  error = cudaDeviceSynchronize();
  if(error != cudaSuccess)
  {
    throw thrust::system_error(error, thrust::cuda_category(), "main(): cudaDeviceSynchronize after thrust::fill");
  }

  // to execute on the host, use the thrust::host execution policy
  assert(thrust::all_of(thrust::host, vec.begin(), vec.end(), [](int x)
  {
    return x == 13;
  }));

  std::cout << "OK" << std::endl;

  return 0;
}

About

A C++ allocator based on cudaMallocManaged

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published