
Commit 796568b

added row-major CPU matrices
replacing the 2D CPU memory of CompMatr and SuperOp; each struct now retains a 2D alias to the new 1D memory for user convenience. Existing code which accesses .cpuElems of both these structs remains valid, although some usages may be accelerated by switching to accessing .cpuElemsFlat (like some unit-testing code). Prompted by discussion #540

Co-authored-by: Erich Essmann <11630432+eessmann@users.noreply.github.com>
1 parent 2525681 commit 796568b

9 files changed: +130 -48 lines changed


quest/include/channels.h

Lines changed: 8 additions & 0 deletions

@@ -33,7 +33,15 @@ typedef struct {
     int numQubits;
     qindex numRows;

+    // 2D CPU memory, which users can manually overwrite like cpuElems[i][j],
+    // but which actually merely aliases the 1D cpuElemsFlat below
     qcomp** cpuElems;
+
+    // row-major flattened elements of cpuElems, always allocated
+    qcomp* cpuElemsFlat;
+
+    // row-major flattened elems in GPU memory, allocated
+    // only and always in GPU-enabled QuEST environments
     qcomp* gpuElemsFlat;

     // whether the user has ever synchronised memory to the GPU, which is performed automatically
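
For concreteness, the sizes these fields imply follow directly from numQubits; e.g. for a 2-qubit SuperOp (the same arithmetic allocSuperOp() performs further below):

    numRows  = powerOf2(2 * numQubits);   // 2^4 = 16 rows and columns
    numElems = numRows * numRows;         // 256 qcomps in cpuElemsFlat
    // cpuElems then holds 16 row pointers, with cpuElems[r] == &cpuElemsFlat[r*16]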

quest/include/matrices.h

Lines changed: 7 additions & 3 deletions

@@ -77,11 +77,15 @@ typedef struct {
     // made after an initial sync have been re-synched. This is a heap pointer, as above.
     int* wasGpuSynced;

-    // 2D CPU memory; not const, so users can overwrite addresses (e.g. with nullptr)
+    // 2D CPU memory, which users can manually overwrite like cpuElems[i][j],
+    // but which actually merely aliases the 1D cpuElemsFlat below
     qcomp** cpuElems;

-    // row-flattened elems in GPU memory, allocated only
-    // and always in GPU-enabled QuEST environments
+    // row-major flattened elements of cpuElems, always allocated
+    qcomp* cpuElemsFlat;
+
+    // row-major flattened elems in GPU memory, allocated
+    // only and always in GPU-enabled QuEST environments
     qcomp* gpuElemsFlat;

 } CompMatr;
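
Because the row-pointer wrapper points into the flat allocation, user code can keep indexing CompMatr (and SuperOp) two-dimensionally while the backend iterates contiguous memory. A minimal sketch of the equivalence, assuming a CompMatr m from createCompMatr() and a hypothetical element function someValue():

    // both views address the same heap memory, so these writes are interchangeable
    for (qindex r=0; r<m.numRows; r++)
        for (qindex c=0; c<m.numRows; c++) {
            m.cpuElems[r][c] = someValue(r, c);                 // existing 2D user code still works
            m.cpuElemsFlat[r*m.numRows + c] = someValue(r, c);  // new flat access, cache-friendly
        }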

quest/src/api/channels.cpp

Lines changed: 14 additions & 14 deletions

@@ -32,8 +32,9 @@ using std::vector;

 void freeSuperOp(SuperOp op) {

-    // free CPU memory, even if it is NULL
-    cpu_deallocMatrix(op.cpuElems, op.numRows);
+    // free CPU memory, even if it is nullptr
+    cpu_deallocArray(op.cpuElemsFlat);
+    cpu_deallocMatrixWrapper(op.cpuElems);

     // free teeniy-tiny heap flag
     cpu_deallocHeapFlag(op.wasGpuSynced);

@@ -68,12 +69,9 @@ void freeObj(KrausMap map) {

 bool didAnyLocalAllocsFail(SuperOp op) {

-    // god help us if this single-integer malloc failed
-    if (!mem_isAllocated(op.wasGpuSynced))
-        return true;
-
-    if (!mem_isAllocated(op.cpuElems, op.numRows))
-        return true;
+    if (!mem_isAllocated(op.wasGpuSynced)) return true;
+    if (!mem_isAllocated(op.cpuElemsFlat)) return true;
+    if (!mem_isOuterAllocated(op.cpuElems)) return true;

     if (getQuESTEnv().isGpuAccelerated && !mem_isAllocated(op.gpuElemsFlat))
         return true;

@@ -84,19 +82,15 @@ bool didAnyLocalAllocsFail(SuperOp op) {

 bool didAnyLocalAllocsFail(KrausMap map) {

-    // god help us if this single-integer malloc failed
     if (!mem_isAllocated(map.isCPTP))
         return true;

-    // list of CPU matrices and all matrices/rows therein shoul dbe non-NULL
     if (!mem_isAllocated(map.matrices, map.numMatrices, map.numRows))
         return true;

-    // check if anything in the superoperator failed to allocate
     if (didAnyLocalAllocsFail(map.superop))
         return true;

-    // otherwise, all pointers were non-NULL and ergo all allocs were successful
     return false;
 }

@@ -131,12 +125,18 @@ SuperOp allocSuperOp(int numQubits) {
     qindex numRows = powerOf2(2 * numQubits);
     qindex numElems = numRows * numRows;

+    qcomp* cpuMem = cpu_allocArray(numElems); // nullptr if failed
+    qcomp* gpuMem = nullptr;
+    if (getQuESTEnv().isGpuAccelerated)
+        gpuMem = gpu_allocArray(numElems); // nullptr if failed
+
     SuperOp out = {
         .numQubits = numQubits,
         .numRows = numRows,

-        .cpuElems = cpu_allocMatrix(numRows), // nullptr if failed
-        .gpuElemsFlat = (getQuESTEnv().isGpuAccelerated)? gpu_allocArray(numElems) : nullptr, // nullptr if failed or not needed
+        .cpuElems = cpu_allocAndInitMatrixWrapper(cpuMem, numRows), // nullptr if failed
+        .cpuElemsFlat = cpuMem,
+        .gpuElemsFlat = gpuMem,

         .wasGpuSynced = cpu_allocHeapFlag() // nullptr if failed
     };
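
The flat buffer and the wrapper are separately owned heap allocations: cpu_allocArray() owns the numRows*numRows elements, while the wrapper owns only the numRows row pointers that alias into it. A minimal sketch of this ownership pattern in isolation (hypothetical local names, mirroring allocSuperOp() and freeSuperOp() above):

    qindex dim   = powerOf2(2 * numQubits);                   // superoperator dimension
    qcomp* flat  = cpu_allocArray(dim * dim);                  // contiguous row-major elements (nullptr if failed)
    qcomp** rows = cpu_allocAndInitMatrixWrapper(flat, dim);   // rows[r] == &flat[r*dim]

    // ... use rows[r][c] and flat[r*dim + c] interchangeably ...

    // both frees are needed; the wrapper owns no element memory of its own
    cpu_deallocMatrixWrapper(rows);
    cpu_deallocArray(flat);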

quest/src/api/matrices.cpp

Lines changed: 18 additions & 18 deletions

@@ -82,9 +82,10 @@ template <class T>
 void freeHeapMatrix(T matr) {

     // free the 1D or 2D matrix - safe even if nullptr
-    if constexpr (util_isDenseMatrixType<T>())
-        cpu_deallocMatrix(matr.cpuElems, matr.numRows);
-    else
+    if constexpr (util_isDenseMatrixType<T>()) {
+        cpu_deallocMatrixWrapper(matr.cpuElems);
+        cpu_deallocArray(matr.cpuElemsFlat);
+    } else
         cpu_deallocArray(matr.cpuElems);

     // we avoid invoking a GPU function in non-GPU mode

@@ -110,20 +111,16 @@ bool didAnyLocalAllocsFail(T matr) {

     // outer CPU memory should always be allocated
     if constexpr (util_isDenseMatrixType<T>()) {
-        if (!mem_isAllocated(matr.cpuElems, matr.numRows))
-            return true;
-    } else {
-        if (!mem_isAllocated(matr.cpuElems))
-            return true;
-    }
+        if (!mem_isAllocated(matr.cpuElemsFlat)) return true;
+        if (!mem_isOuterAllocated(matr.cpuElems)) return true;
+    } else
+        if (!mem_isAllocated(matr.cpuElems)) return true;

     // if memory is 2D, we must also check each inner array was allocated
     if constexpr (util_isDenseMatrixType<T>()) {
-        if (!mem_isAllocated(matr.cpuElems, matr.numRows))
-            return true;
+        if (!mem_isAllocated(matr.cpuElems, matr.numRows)) return true;
     } else {
-        if (!mem_isAllocated(matr.cpuElems))
-            return true;
+        if (!mem_isAllocated(matr.cpuElems)) return true;
     }

     // if GPU memory is not allocated in a GPU environment...

@@ -197,6 +194,11 @@ extern "C" CompMatr createCompMatr(int numQubits) {
     qindex numRows = powerOf2(numQubits);
     qindex numElems = numRows * numRows;

+    qcomp* cpuMem = cpu_allocArray(numElems); // nullptr if failed
+    qcomp* gpuMem = nullptr;
+    if (getQuESTEnv().isGpuAccelerated)
+        gpuMem = gpu_allocArray(numElems); // nullptr if failed
+
     // initialise all CompMatr fields inline because most are const
     CompMatr out = {
         .numQubits = numQubits,

@@ -207,11 +209,9 @@ extern "C" CompMatr createCompMatr(int numQubits) {
         .isHermitian = cpu_allocHeapFlag(), // nullptr if failed
         .wasGpuSynced = cpu_allocHeapFlag(), // nullptr if failed

-        // 2D CPU memory
-        .cpuElems = cpu_allocMatrix(numRows), // nullptr if failed, or may contain nullptr
-
-        // 1D GPU memory
-        .gpuElemsFlat = (getQuESTEnv().isGpuAccelerated)? gpu_allocArray(numElems) : nullptr // nullptr if failed or not needed
+        .cpuElems = cpu_allocAndInitMatrixWrapper(cpuMem, numRows), // nullptr if failed
+        .cpuElemsFlat = cpuMem,
+        .gpuElemsFlat = gpuMem
     };

     validateMatrixAllocs(out, __func__);
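
The new mem_isOuterAllocated() check only needs to confirm that the row-pointer wrapper itself was successfully malloc'd, since its entries are offsets into cpuElemsFlat rather than independent allocations. Its definition is not part of this diff; a plausible minimal form (an assumption, not the committed implementation) would be:

    bool mem_isOuterAllocated(qcomp** ptr) {

        // the inner row pointers alias cpuElemsFlat, so only
        // the outer pointer itself can have failed to allocate
        return ptr != nullptr;
    }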

quest/src/core/errors.cpp

Lines changed: 5 additions & 0 deletions

@@ -588,6 +588,11 @@ void error_gpuMemSyncQueriedButEnvNotGpuAccelerated() {
     raiseInternalError("A function checked whether persistent GPU memory (such as in a CompMatr) had been synchronised, but the QuEST environment is not GPU accelerated.");
 }

+void error_gpuDeadCopyMatrixFunctionCalled() {
+
+    raiseInternalError("The internal GPU function copyMatrixIfGpuCompiled() was called, though is intended as dead-code - matrices needing copying to GPU should be stored as flat row-wise lists.");
+}
+
 void assert_quregIsGpuAccelerated(Qureg qureg) {

     if (!qureg.isGpuAccelerated)

quest/src/core/errors.hpp

Lines changed: 2 additions & 0 deletions

@@ -228,6 +228,8 @@ void error_gpuMemSyncQueriedButEnvNotGpuAccelerated();

 void error_gpuUnexpectedlyInaccessible();

+void error_gpuDeadCopyMatrixFunctionCalled();
+
 void assert_gpuIsAccessible();

 void assert_quregIsGpuAccelerated(Qureg qureg);

quest/src/cpu/cpu_config.cpp

Lines changed: 33 additions & 9 deletions

@@ -101,17 +101,41 @@ void cpu_deallocArray(qcomp* arr) {
 }


+qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {
+
+    // allocate only the outer memory (i.e. one row's worth)
+    qcomp** out = (qcomp**) malloc(dim * sizeof *out);
+
+    // caller will handle malloc failure
+    if (out == nullptr)
+        return out;
+
+    // populate out with offsets of arr
+    for (qindex i=0; i<dim; i++)
+        out[i] = &arr[i*dim];
+
+    return out;
+}
+
+
+void cpu_deallocMatrixWrapper(qcomp** wrapper) {
+
+    // only the outer pointer is freed; the
+    // inner pointers are offsets to another
+    // malloc which is separately freed
+    free(wrapper);
+}
+
+
 qcomp** cpu_allocMatrix(qindex dim) {

-    // TODO:
-    // the design of storing the CPU matrix elements as a 2D structure will impede
-    // performance for many qubits; the allocated heap memories for each row
-    // have no gaurantee to reside near other, so that their access/iteration in
-    // hot loops may incur unnecessary caching penalties. Consider storing the
-    // elements as a flat array, like we do for the GPU memory. This makes manual
-    // modification by the user trivially harder (changing [r][c] to [r*n+c]),
-    // but should improve caching, and significantly simplify allocation and its
-    // validation; no more enumerating nested pointers! Benchmark this scenario.
+    // NOTE:
+    // this function creates a matrix where rows are not necessarily
+    // contiguous in memory, which can incur gratuitous caching penalties
+    // when accessed in hot loops. As such, we do not use this function
+    // to allocate memory for CompMatr (instead, cpu_allocAndInitMatrixWrapper()),
+    // but instead use it for the individual Kraus matrices of a KrausMap,
+    // which are each quadratically smaller than the important superoperator.

     // allocate outer array
     qcomp** rows = (qcomp**) malloc(dim * sizeof *rows); // nullptr if failed
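
The caching rationale in the replacement NOTE is easiest to see in a hot loop over every element. With the new wrapper, 2D indexing dereferences into one contiguous buffer and so streams memory exactly like the flat loop, whereas a matrix from cpu_allocMatrix() hops between dim unrelated heap blocks, one per row. A rough sketch (hypothetical variables, not code from this commit):

    // rows was built by cpu_allocAndInitMatrixWrapper(flat, dim), so rows[r][c]
    // and flat[r*dim + c] read the same contiguous buffer of dim*dim qcomps
    qcomp total = 0;
    for (qindex r=0; r<dim; r++)
        for (qindex c=0; c<dim; c++)
            total += rows[r][c];

    for (qindex i=0; i<dim*dim; i++)
        total += flat[i];   // identical memory traffic, simpler indexing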

quest/src/cpu/cpu_config.hpp

Lines changed: 3 additions & 0 deletions

@@ -41,6 +41,9 @@ int cpu_getOpenmpThreadInd();
 qcomp* cpu_allocArray(qindex length);
 void cpu_deallocArray(qcomp* arr);

+qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim);
+void cpu_deallocMatrixWrapper(qcomp** wrapper);
+
 qcomp** cpu_allocMatrix(qindex dim);
 void cpu_deallocMatrix(qcomp** matrix, qindex dim);

quest/src/gpu/gpu_config.cpp

Lines changed: 40 additions & 4 deletions

@@ -333,6 +333,14 @@ void copyArrayIfGpuCompiled(qcomp* cpuArr, qcomp* gpuArr, qindex numElems, enum
 void copyMatrixIfGpuCompiled(qcomp** cpuMatr, qcomp* gpuArr, qindex matrDim, enum CopyDirection direction) {
 #if COMPILE_CUDA

+    // NOTE:
+    // this function copies a 2D CPU matrix into a 1D row-major GPU array,
+    // although this is not actually needed by the QuEST backend which
+    // maintains 1D row-major CPU memories merely aliased by 2D structures
+    // for the user's benefit. As such, this is dead code, but preserved in
+    // case it is ever needed (like if custom user 2D data was needed in GPU).
+    error_gpuDeadCopyMatrixFunctionCalled();
+
     // for completeness, we permit copying from the 1D GPU memory to the 2D CPU memory,
     // although we never actually have the need to do this!
     auto flag = (direction == TO_HOST)?

@@ -400,11 +408,25 @@ void gpu_copyGpuToCpu(Qureg qureg) {

 void gpu_copyCpuToGpu(CompMatr matr) {
     assertHeapObjectGpuMemIsAllocated(matr);
-    copyMatrixIfGpuCompiled(matr.cpuElems, util_getGpuMemPtr(matr), matr.numRows, TO_DEVICE);
+
+    // note matr.cpuElems is merely a 2D alias for matr.cpuElemsFlat, which
+    // matches the format of matr.gpuElemsFlat. Ergo, we do not invoke
+    // copyMatrixIfGpuCompiled(), and instead more efficiently overwrite
+    // the contiguous memory, which retains any user changes to .cpuElems
+
+    qindex numElems = matr.numRows * matr.numRows;
+    copyArrayIfGpuCompiled(matr.cpuElemsFlat, util_getGpuMemPtr(matr), numElems, TO_DEVICE);
 }
 void gpu_copyGpuToCpu(CompMatr matr) {
     assertHeapObjectGpuMemIsAllocated(matr);
-    copyMatrixIfGpuCompiled(matr.cpuElems, util_getGpuMemPtr(matr), matr.numRows, TO_HOST);
+
+    // note matr.cpuElems is merely a 2D alias for matr.cpuElemsFlat, which
+    // matches the format of matr.gpuElemsFlat. Ergo, we do not invoke
+    // copyMatrixIfGpuCompiled(), and instead more efficiently overwrite
+    // the contiguous matr.cpuElemsFlat, which users can access via .cpuElems
+
+    qindex numElems = matr.numRows * matr.numRows;
+    copyArrayIfGpuCompiled(matr.cpuElemsFlat, util_getGpuMemPtr(matr), numElems, TO_HOST);
 }

@@ -420,11 +442,25 @@ void gpu_copyGpuToCpu(DiagMatr matr) {

 void gpu_copyCpuToGpu(SuperOp op) {
     assertHeapObjectGpuMemIsAllocated(op);
-    copyMatrixIfGpuCompiled(op.cpuElems, util_getGpuMemPtr(op), op.numRows, TO_DEVICE);
+
+    // note op.cpuElems is merely a 2D alias for op.cpuElemsFlat, which
+    // matches the format of op.gpuElemsFlat. Ergo, we do not invoke
+    // copyMatrixIfGpuCompiled(), and instead more efficiently overwrite
+    // the contiguous memory, which retains any user changes to .cpuElems
+
+    qindex numElems = op.numRows * op.numRows;
+    copyArrayIfGpuCompiled(op.cpuElemsFlat, util_getGpuMemPtr(op), numElems, TO_DEVICE);
 }
 void gpu_copyGpuToCpu(SuperOp op) {
     assertHeapObjectGpuMemIsAllocated(op);
-    copyMatrixIfGpuCompiled(op.cpuElems, util_getGpuMemPtr(op), op.numRows, TO_HOST);
+
+    // note op.cpuElems is merely a 2D alias for op.cpuElemsFlat, which
+    // matches the format of op.gpuElemsFlat. Ergo, we do not invoke
+    // copyMatrixIfGpuCompiled(), and instead more efficiently overwrite
+    // the contiguous op.cpuElemsFlat, which users can access via .cpuElems
+
+    qindex numElems = op.numRows * op.numRows;
+    copyArrayIfGpuCompiled(op.cpuElemsFlat, util_getGpuMemPtr(op), numElems, TO_HOST);
 }
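
Because the CPU and GPU elements now share one flat row-major layout, every sync reduces to a single contiguous transfer instead of numRows per-row copies. The body of copyArrayIfGpuCompiled() is not shown in this diff; under CUDA it presumably reduces to something like the following sketch (an assumption for illustration, not the committed helper):

    // hypothetical core of a host-to-device sync of a CompMatr matr
    qindex numElems = matr.numRows * matr.numRows;
    cudaMemcpy(
        matr.gpuElemsFlat,           // device destination (flat, row-major)
        matr.cpuElemsFlat,           // host source (flat, row-major)
        numElems * sizeof(qcomp),    // one contiguous block
        cudaMemcpyHostToDevice);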
