Skip to content

Commit d00ca87

Browse files
f3schshahor02
authored andcommitted
Revert "ITS: GPU: more memory clearing in processNeighbours"
This reverts commit aa3ef37. Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent c8834de commit d00ca87

File tree

1 file changed

+26
-58
lines changed

1 file changed

+26
-58
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 26 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
// O2 track model
4040
#include "ReconstructionDataFormats/Track.h"
4141
#include "DetectorsBase/Propagator.h"
42-
#include "utils/strtag.h"
4342
using namespace o2::track;
4443

4544
namespace o2::its
@@ -1107,19 +1106,11 @@ void processNeighboursHandler(const int startLayer,
11071106
const int nBlocks,
11081107
const int nThreads)
11091108
{
1110-
constexpr uint64_t Tag = qStr2Tag("ITS_PNH1");
1111-
1112-
// allocators used
11131109
auto allocInt = gpu::TypedAllocator<int>(alloc);
11141110
auto allocCellSeed = gpu::TypedAllocator<CellSeed<nLayers>>(alloc);
1115-
// use sync_policy, this part cannot be run async but tell thrust to use the allocator
1116-
auto sync_policy = THRUST_NAMESPACE::par(gpu::TypedAllocator<char>(alloc));
1117-
1118-
// put initial computation on Tag1
1119-
alloc->pushTagOnStack(Tag);
1120-
1121-
// start processing of cells
11221111
thrust::device_vector<int, gpu::TypedAllocator<int>> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt);
1112+
auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator<char>(alloc)).on(gpu::Stream::DefaultStream);
1113+
11231114
gpu::processNeighboursKernel<true, nLayers><<<nBlocks, nThreads>>>(
11241115
startLayer,
11251116
startLevel,
@@ -1138,10 +1129,10 @@ void processNeighboursHandler(const int startLayer,
11381129
maxChi2ClusterAttachment,
11391130
propagator,
11401131
matCorrType);
1141-
thrust::exclusive_scan(sync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin());
1142-
auto foundSeeds{foundSeedsTable.back()};
1143-
thrust::device_vector<int, gpu::TypedAllocator<int>> updatedCellId(foundSeeds, 0, allocInt);
1144-
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> updatedCellSeed(foundSeeds, allocCellSeed);
1132+
thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin());
1133+
1134+
thrust::device_vector<int, gpu::TypedAllocator<int>> updatedCellId(foundSeedsTable.back(), 0, allocInt);
1135+
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> updatedCellSeed(foundSeedsTable.back(), allocCellSeed);
11451136
gpu::processNeighboursKernel<false, nLayers><<<nBlocks, nThreads>>>(
11461137
startLayer,
11471138
startLevel,
@@ -1160,41 +1151,20 @@ void processNeighboursHandler(const int startLayer,
11601151
maxChi2ClusterAttachment,
11611152
propagator,
11621153
matCorrType);
1154+
GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream));
11631155

1164-
// now do inward steps until stop is reached
11651156
int level = startLevel;
1166-
1167-
// Host buffers to break dependency
1168-
// FIXME: these should be on our memory resource!
1169-
std::vector<int> hostCellId;
1170-
std::vector<CellSeed<nLayers>> hostCellSeed;
1171-
1172-
// inward loop
1157+
thrust::device_vector<int, gpu::TypedAllocator<int>> lastCellId(allocInt);
1158+
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> lastCellSeed(allocCellSeed);
11731159
for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
1174-
// copy current results to host
1175-
hostCellId.resize(updatedCellId.size());
1176-
hostCellSeed.resize(updatedCellSeed.size());
1177-
thrust::copy(updatedCellId.begin(), updatedCellId.end(), hostCellId.begin());
1178-
thrust::copy(updatedCellSeed.begin(), updatedCellSeed.end(), hostCellSeed.begin());
1179-
1180-
auto lastCellSeedSize{hostCellSeed.size()};
1181-
// but before we clear the memory, and immediately start a new block
1182-
alloc->popTagOffStack(Tag);
1183-
alloc->pushTagOnStack(Tag);
1184-
1185-
// based on the previous step's result create new LUT and zero it
1186-
thrust::device_vector<int, gpu::TypedAllocator<int>>(allocInt).swap(foundSeedsTable);
1187-
foundSeedsTable.resize(lastCellSeedSize + 1);
1188-
thrust::fill(sync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), 0);
1189-
1190-
// recreate lastCell vectors from host
1191-
thrust::device_vector<int, gpu::TypedAllocator<int>> lastCellId(hostCellId.begin(), hostCellId.end(), allocInt);
1192-
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> lastCellSeed(hostCellSeed.begin(), hostCellSeed.end(), allocCellSeed);
1193-
// also create new vectors on new block
1160+
lastCellSeed.swap(updatedCellSeed);
1161+
lastCellId.swap(updatedCellId);
11941162
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>>(allocCellSeed).swap(updatedCellSeed);
11951163
thrust::device_vector<int, gpu::TypedAllocator<int>>(allocInt).swap(updatedCellId);
1164+
auto lastCellSeedSize{lastCellSeed.size()};
1165+
foundSeedsTable.resize(lastCellSeedSize + 1);
1166+
thrust::fill(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), 0);
11961167

1197-
// start step
11981168
gpu::processNeighboursKernel<true, nLayers><<<nBlocks, nThreads>>>(
11991169
iLayer,
12001170
--level,
@@ -1213,13 +1183,14 @@ void processNeighboursHandler(const int startLayer,
12131183
maxChi2ClusterAttachment,
12141184
propagator,
12151185
matCorrType);
1216-
// how many new seeds where found
1217-
thrust::exclusive_scan(sync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin());
1218-
foundSeeds = foundSeedsTable.back();
1219-
// do a resize, we don't need to set the memory now since we know that all of these are written to
1220-
// Note though this does not clear the memory...
1186+
thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin());
1187+
1188+
auto foundSeeds{foundSeedsTable.back()};
12211189
updatedCellId.resize(foundSeeds);
1190+
thrust::fill(nosync_policy, updatedCellId.begin(), updatedCellId.end(), 0);
12221191
updatedCellSeed.resize(foundSeeds);
1192+
thrust::fill(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed<nLayers>());
1193+
12231194
gpu::processNeighboursKernel<false, nLayers><<<nBlocks, nThreads>>>(
12241195
iLayer,
12251196
level,
@@ -1239,15 +1210,12 @@ void processNeighboursHandler(const int startLayer,
12391210
propagator,
12401211
matCorrType);
12411212
}
1242-
1243-
// final copy of result
1244-
const auto selector = gpu::seed_selector<nLayers>(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5));
1245-
const auto count = thrust::count_if(sync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), selector);
1246-
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> outSeeds(count, allocCellSeed);
1247-
thrust::copy_if(sync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), selector);
1248-
seedsHost.reserve(seedsHost.size() + count);
1249-
thrust::copy(outSeeds.begin(), outSeeds.end(), std::back_inserter(seedsHost));
1250-
alloc->popTagOffStack(Tag);
1213+
GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream));
1214+
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> outSeeds(updatedCellSeed.size(), allocCellSeed);
1215+
auto end = thrust::copy_if(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector<nLayers>(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5)));
1216+
auto s{end - outSeeds.begin()};
1217+
seedsHost.reserve(seedsHost.size() + s);
1218+
thrust::copy(outSeeds.begin(), outSeeds.begin() + s, std::back_inserter(seedsHost));
12511219
}
12521220

12531221
template <int nLayers>

0 commit comments

Comments
 (0)