Skip to content

A GPU accelerated find() method

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Feb 15, 2021 · 18 revisions

VirtualMultiArray has a find(T object, S objectMember, int maxIndices) method that searches for objects with same member value (same byte representation) and returns a std::vector containing them (but with maxIndices elements at most, if found).

  • Search value (S objectMember) needs to be in same parent T object for proper calculations. Using different parent objects for both parameters leads to errors.
  • If an array member (like struct test{ int data[100]; }) needs to be searched for exact byte representation, then it needs to be inside another struct (like struct search_layer { int data[100]; }); struct test { search_layer }; and that struct can be used for searching.
  • If no elements are found, returned vector size is 0.
  • Multiple index order is not guaranteed to be same between repeated searches. OpenCL kernel finds elements and save results using atomic functions. This lets find() compute quicker than a reduction algorithm when there are only few indices to find.

It's not very accelerated currently(14/02/2021) but it can do this:

preparing virtual array...
initializing data...
benchmarking find()...
test object: 5
found object at index: 5
data search: 40MB         358ms
data: 5
----------------------
----------------------
----------------------
test object: 100005
found object at index: 100005
data search: 40MB         6ms
data: 100005
----------------------
----------------------
----------------------
test object: 200005
found object at index: 200005
data search: 40MB         6ms
data: 200005

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"

// testing
#include <iostream>
#include "omp.h"

struct ArrayPassByValue
{
	char buf[3];
};

class Obj
{
public:
	Obj(){b=-1;}
	Obj(int i){b=i;}
	char a;
	int b;
	ArrayPassByValue arr;
	char tmp;
};


int main(int argC, char ** argV)
{


	std::cout<<"preparing virtual array..."<<std::endl;
	size_t n = 10000000;
	size_t p = 10000;
	GraphicsCardSupplyDepot gpu;
	VirtualMultiArray<Obj> arr(n,gpu.requestGpus(),p,3,{4,2,2});

	std::cout<<"initializing data..."<<std::endl;
	#pragma omp parallel for
	for(int i=0;i<10;i++)
	{
		arr.mappedReadWriteAccess(i*(n/10),n/10,[&](Obj * ptr){
			for(size_t j=i*(n/10);j<i*(n/10)+(n/10);j++)
			{
				Obj obj(j);
				for(int k=0;k<3;k++)
					obj.arr.buf[k]=' ';
				obj.arr.buf[j%3]='X';
				ptr[j]=obj;
			}
		},false,false,true);
	}
	std::cout<<"benchmarking find()..."<<std::endl;

	Obj searchObj;

	for(int i=0;i<80;i++)
	{
		std::chrono::milliseconds t1 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
		searchObj.b=i*100000+5;
		searchObj.arr.buf[0]='X';
		searchObj.arr.buf[1]=' ';
		searchObj.arr.buf[2]=' ';

		// integer member as search value
		// find at most 100 indices that has same member value
		std::vector<size_t> found = arr.find(searchObj,searchObj.b,100);

		// buffer member as search value
		// find at most 100 indices that has same member value
		//std::vector<size_t> found = arr.find(searchObj,searchObj.arr,100);

		std::chrono::milliseconds t2 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
		std::cout<<"test object: "<<i*100000+5<<std::endl;
		std::cout<<"found object at index: "<<(found.size()>0?found[0]:-1)<<std::endl;
		std::cout<<"data search: "<<n*sizeof(int)/1000000.0<<"MB         "<<(t2.count()-t1.count())<<"ms"<<std::endl;
		std::cout<<"data: "<<arr.get(found[0]).b<<std::endl;

		// integer search error check
		if(arr.get(found[0]).b!=i*100000+5)
		{
			std::cout<<"ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR"<<std::endl;
			return 0;
		}
		std::cout<<"----------------------"<<std::endl;
		std::cout<<"----------------------"<<std::endl;
		std::cout<<"----------------------"<<std::endl;

	}
	return 0;
}
Clone this wiki locally