3939// O2 track model
4040#include " ReconstructionDataFormats/Track.h"
4141#include " DetectorsBase/Propagator.h"
42- #include " utils/strtag.h"
4342using namespace o2 ::track;
4443
4544namespace o2 ::its
@@ -1107,19 +1106,11 @@ void processNeighboursHandler(const int startLayer,
11071106 const int nBlocks,
11081107 const int nThreads)
11091108{
1110- constexpr uint64_t Tag = qStr2Tag (" ITS_PNH1" );
1111-
1112- // allocators used
11131109 auto allocInt = gpu::TypedAllocator<int >(alloc);
11141110 auto allocCellSeed = gpu::TypedAllocator<CellSeed<nLayers>>(alloc);
1115- // use sync_policy, this part cannot be run async but tell thrust to use the allocator
1116- auto sync_policy = THRUST_NAMESPACE::par (gpu::TypedAllocator<char >(alloc));
1117-
1118- // put initial computation on Tag1
1119- alloc->pushTagOnStack (Tag);
1120-
1121- // start processing of cells
11221111 thrust::device_vector<int , gpu::TypedAllocator<int >> foundSeedsTable (nCells[startLayer] + 1 , 0 , allocInt);
1112+ auto nosync_policy = THRUST_NAMESPACE::par_nosync (gpu::TypedAllocator<char >(alloc)).on (gpu::Stream::DefaultStream);
1113+
11231114 gpu::processNeighboursKernel<true , nLayers><<<nBlocks, nThreads>>> (
11241115 startLayer,
11251116 startLevel,
@@ -1138,10 +1129,10 @@ void processNeighboursHandler(const int startLayer,
11381129 maxChi2ClusterAttachment,
11391130 propagator,
11401131 matCorrType);
1141- thrust::exclusive_scan (sync_policy , foundSeedsTable.begin (), foundSeedsTable.end (), foundSeedsTable.begin ());
1142- auto foundSeeds{foundSeedsTable. back ()};
1143- thrust::device_vector<int , gpu::TypedAllocator<int >> updatedCellId (foundSeeds , 0 , allocInt);
1144- thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> updatedCellSeed (foundSeeds , allocCellSeed);
1132+ thrust::exclusive_scan (nosync_policy , foundSeedsTable.begin (), foundSeedsTable.end (), foundSeedsTable.begin ());
1133+
1134+ thrust::device_vector<int , gpu::TypedAllocator<int >> updatedCellId (foundSeedsTable. back () , 0 , allocInt);
1135+ thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> updatedCellSeed (foundSeedsTable. back () , allocCellSeed);
11451136 gpu::processNeighboursKernel<false , nLayers><<<nBlocks, nThreads>>> (
11461137 startLayer,
11471138 startLevel,
@@ -1160,41 +1151,20 @@ void processNeighboursHandler(const int startLayer,
11601151 maxChi2ClusterAttachment,
11611152 propagator,
11621153 matCorrType);
1154+ GPUChkErrS (cudaStreamSynchronize (gpu::Stream::DefaultStream));
11631155
1164- // now do inward steps until stop is reached
11651156 int level = startLevel;
1166-
1167- // Host buffers to break dependency
1168- // FIXME: these should be on our memory resource!
1169- std::vector<int > hostCellId;
1170- std::vector<CellSeed<nLayers>> hostCellSeed;
1171-
1172- // inward loop
1157+ thrust::device_vector<int , gpu::TypedAllocator<int >> lastCellId (allocInt);
1158+ thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> lastCellSeed (allocCellSeed);
11731159 for (int iLayer{startLayer - 1 }; iLayer > 0 && level > 2 ; --iLayer) {
1174- // copy current results to host
1175- hostCellId.resize (updatedCellId.size ());
1176- hostCellSeed.resize (updatedCellSeed.size ());
1177- thrust::copy (updatedCellId.begin (), updatedCellId.end (), hostCellId.begin ());
1178- thrust::copy (updatedCellSeed.begin (), updatedCellSeed.end (), hostCellSeed.begin ());
1179-
1180- auto lastCellSeedSize{hostCellSeed.size ()};
1181- // but before we clear the memory, and immediately start a new block
1182- alloc->popTagOffStack (Tag);
1183- alloc->pushTagOnStack (Tag);
1184-
1185- // based on the previous step's result create new LUT and zero it
1186- thrust::device_vector<int , gpu::TypedAllocator<int >>(allocInt).swap (foundSeedsTable);
1187- foundSeedsTable.resize (lastCellSeedSize + 1 );
1188- thrust::fill (sync_policy, foundSeedsTable.begin (), foundSeedsTable.end (), 0 );
1189-
1190- // recreate lastCell vectors from host
1191- thrust::device_vector<int , gpu::TypedAllocator<int >> lastCellId (hostCellId.begin (), hostCellId.end (), allocInt);
1192- thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> lastCellSeed (hostCellSeed.begin (), hostCellSeed.end (), allocCellSeed);
1193- // also create new vectors on new block
1160+ lastCellSeed.swap (updatedCellSeed);
1161+ lastCellId.swap (updatedCellId);
11941162 thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>>(allocCellSeed).swap (updatedCellSeed);
11951163 thrust::device_vector<int , gpu::TypedAllocator<int >>(allocInt).swap (updatedCellId);
1164+ auto lastCellSeedSize{lastCellSeed.size ()};
1165+ foundSeedsTable.resize (lastCellSeedSize + 1 );
1166+ thrust::fill (nosync_policy, foundSeedsTable.begin (), foundSeedsTable.end (), 0 );
11961167
1197- // start step
11981168 gpu::processNeighboursKernel<true , nLayers><<<nBlocks, nThreads>>> (
11991169 iLayer,
12001170 --level,
@@ -1213,13 +1183,14 @@ void processNeighboursHandler(const int startLayer,
12131183 maxChi2ClusterAttachment,
12141184 propagator,
12151185 matCorrType);
1216- // how many new seeds where found
1217- thrust::exclusive_scan (sync_policy, foundSeedsTable.begin (), foundSeedsTable.end (), foundSeedsTable.begin ());
1218- foundSeeds = foundSeedsTable.back ();
1219- // do a resize, we don't need to set the memory now since we know that all of these are written to
1220- // Note though this does not clear the memory...
1186+ thrust::exclusive_scan (nosync_policy, foundSeedsTable.begin (), foundSeedsTable.end (), foundSeedsTable.begin ());
1187+
1188+ auto foundSeeds{foundSeedsTable.back ()};
12211189 updatedCellId.resize (foundSeeds);
1190+ thrust::fill (nosync_policy, updatedCellId.begin (), updatedCellId.end (), 0 );
12221191 updatedCellSeed.resize (foundSeeds);
1192+ thrust::fill (nosync_policy, updatedCellSeed.begin (), updatedCellSeed.end (), CellSeed<nLayers>());
1193+
12231194 gpu::processNeighboursKernel<false , nLayers><<<nBlocks, nThreads>>> (
12241195 iLayer,
12251196 level,
@@ -1239,15 +1210,12 @@ void processNeighboursHandler(const int startLayer,
12391210 propagator,
12401211 matCorrType);
12411212 }
1242-
1243- // final copy of result
1244- const auto selector = gpu::seed_selector<nLayers>(1 .e3 , maxChi2NDF * ((startLevel + 2 ) * 2 - 5 ));
1245- const auto count = thrust::count_if (sync_policy, updatedCellSeed.begin (), updatedCellSeed.end (), selector);
1246- thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> outSeeds (count, allocCellSeed);
1247- thrust::copy_if (sync_policy, updatedCellSeed.begin (), updatedCellSeed.end (), outSeeds.begin (), selector);
1248- seedsHost.reserve (seedsHost.size () + count);
1249- thrust::copy (outSeeds.begin (), outSeeds.end (), std::back_inserter (seedsHost));
1250- alloc->popTagOffStack (Tag);
1213+ GPUChkErrS (cudaStreamSynchronize (gpu::Stream::DefaultStream));
1214+ thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> outSeeds (updatedCellSeed.size (), allocCellSeed);
1215+ auto end = thrust::copy_if (nosync_policy, updatedCellSeed.begin (), updatedCellSeed.end (), outSeeds.begin (), gpu::seed_selector<nLayers>(1 .e3 , maxChi2NDF * ((startLevel + 2 ) * 2 - 5 )));
1216+ auto s{end - outSeeds.begin ()};
1217+ seedsHost.reserve (seedsHost.size () + s);
1218+ thrust::copy (outSeeds.begin (), outSeeds.begin () + s, std::back_inserter (seedsHost));
12511219}
12521220
12531221template <int nLayers>
0 commit comments