
Commit 796568b

added row-major CPU matrices
replacing the 2D CPU memory of CompMatr and SuperOp; each struct now retains a 2D alias to the new 1D memory for user convenience. Existing code which accesses .cpuElems of both these structs remains valid, although some usages may be accelerated by switching to accessing .cpuElemsFlat (like some unit-testing code). Prompted by discussion #540

Co-authored-by: Erich Essmann <11630432+eessmann@users.noreply.github.com>
1 parent 2525681 commit 796568b

9 files changed: +130 -48 lines changed


quest/include/channels.h

Lines changed: 8 additions & 0 deletions

@@ -33,7 +33,15 @@ typedef struct {
     int numQubits;
     qindex numRows;

+    // 2D CPU memory, which users can manually overwrite like cpuElems[i][j],
+    // but which actually merely aliases the 1D cpuElemsFlat below
     qcomp** cpuElems;
+
+    // row-major flattened elements of cpuElems, always allocated
+    qcomp* cpuElemsFlat;
+
+    // row-major flattened elems in GPU memory, allocated
+    // only and always in GPU-enabled QuEST environments
     qcomp* gpuElemsFlat;

     // whether the user has ever synchronised memory to the GPU, which is performed automatically
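
For concreteness, the sizes these fields imply follow directly from numQubits; e.g. for a 2-qubit SuperOp (the same arithmetic allocSuperOp() performs further below):

    numRows  = powerOf2(2 * numQubits);   // 2^4 = 16 rows and columns
    numElems = numRows * numRows;         // 256 qcomps in cpuElemsFlat
    // cpuElems then holds 16 row pointers, with cpuElems[r] == &cpuElemsFlat[r*16]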

quest/include/matrices.h

Lines changed: 7 additions & 3 deletions

@@ -77,11 +77,15 @@ typedef struct {
     // made after an initial sync have been re-synched. This is a heap pointer, as above.
     int* wasGpuSynced;

-    // 2D CPU memory; not const, so users can overwrite addresses (e.g. with nullptr)
+    // 2D CPU memory, which users can manually overwrite like cpuElems[i][j],
+    // but which actually merely aliases the 1D cpuElemsFlat below
     qcomp** cpuElems;

-    // row-flattened elems in GPU memory, allocated only
-    // and always in GPU-enabled QuEST environments
+    // row-major flattened elements of cpuElems, always allocated
+    qcomp* cpuElemsFlat;
+
+    // row-major flattened elems in GPU memory, allocated
+    // only and always in GPU-enabled QuEST environments
     qcomp* gpuElemsFlat;

 } CompMatr;
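
Because the row-pointer wrapper points into the flat allocation, user code can keep indexing CompMatr (and SuperOp) two-dimensionally while the backend iterates contiguous memory. A minimal sketch of the equivalence, assuming a CompMatr m from createCompMatr() and a hypothetical element function someValue():

    // both views address the same heap memory, so these writes are interchangeable
    for (qindex r=0; r<m.numRows; r++)
        for (qindex c=0; c<m.numRows; c++) {
            m.cpuElems[r][c] = someValue(r, c);                 // existing 2D user code still works
            m.cpuElemsFlat[r*m.numRows + c] = someValue(r, c);  // new flat access, cache-friendly
        }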

quest/src/api/channels.cpp

Lines changed: 14 additions & 14 deletions

@@ -32,8 +32,9 @@ using std::vector;

 void freeSuperOp(SuperOp op) {

-    // free CPU memory, even if it is NULL
-    cpu_deallocMatrix(op.cpuElems, op.numRows);
+    // free CPU memory, even if it is nullptr
+    cpu_deallocArray(op.cpuElemsFlat);
+    cpu_deallocMatrixWrapper(op.cpuElems);

     // free teeniy-tiny heap flag
     cpu_deallocHeapFlag(op.wasGpuSynced);

@@ -68,12 +69,9 @@ void freeObj(KrausMap map) {

 bool didAnyLocalAllocsFail(SuperOp op) {

-    // god help us if this single-integer malloc failed
-    if (!mem_isAllocated(op.wasGpuSynced))
-        return true;
-
-    if (!mem_isAllocated(op.cpuElems, op.numRows))
-        return true;
+    if (!mem_isAllocated(op.wasGpuSynced)) return true;
+    if (!mem_isAllocated(op.cpuElemsFlat)) return true;
+    if (!mem_isOuterAllocated(op.cpuElems)) return true;

     if (getQuESTEnv().isGpuAccelerated && !mem_isAllocated(op.gpuElemsFlat))
         return true;

@@ -84,19 +82,15 @@ bool didAnyLocalAllocsFail(SuperOp op) {

 bool didAnyLocalAllocsFail(KrausMap map) {

-    // god help us if this single-integer malloc failed
     if (!mem_isAllocated(map.isCPTP))
         return true;

-    // list of CPU matrices and all matrices/rows therein shoul dbe non-NULL
     if (!mem_isAllocated(map.matrices, map.numMatrices, map.numRows))
         return true;

-    // check if anything in the superoperator failed to allocate
     if (didAnyLocalAllocsFail(map.superop))
         return true;

-    // otherwise, all pointers were non-NULL and ergo all allocs were successful
     return false;
 }

@@ -131,12 +125,18 @@ SuperOp allocSuperOp(int numQubits) {
     qindex numRows = powerOf2(2 * numQubits);
     qindex numElems = numRows * numRows;

+    qcomp* cpuMem = cpu_allocArray(numElems); // nullptr if failed
+    qcomp* gpuMem = nullptr;
+    if (getQuESTEnv().isGpuAccelerated)
+        gpuMem = gpu_allocArray(numElems); // nullptr if failed
+
     SuperOp out = {
         .numQubits = numQubits,
         .numRows = numRows,

-        .cpuElems = cpu_allocMatrix(numRows), // nullptr if failed
-        .gpuElemsFlat = (getQuESTEnv().isGpuAccelerated)? gpu_allocArray(numElems) : nullptr, // nullptr if failed or not needed
+        .cpuElems = cpu_allocAndInitMatrixWrapper(cpuMem, numRows), // nullptr if failed
+        .cpuElemsFlat = cpuMem,
+        .gpuElemsFlat = gpuMem,

         .wasGpuSynced = cpu_allocHeapFlag() // nullptr if failed
     };
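
The flat buffer and the wrapper are separately owned heap allocations: cpu_allocArray() owns the numRows*numRows elements, while the wrapper owns only the numRows row pointers that alias into it. A minimal sketch of this ownership pattern in isolation (hypothetical local names, mirroring allocSuperOp() and freeSuperOp() above):

    qindex dim   = powerOf2(2 * numQubits);                   // superoperator dimension
    qcomp* flat  = cpu_allocArray(dim * dim);                  // contiguous row-major elements (nullptr if failed)
    qcomp** rows = cpu_allocAndInitMatrixWrapper(flat, dim);   // rows[r] == &flat[r*dim]

    // ... use rows[r][c] and flat[r*dim + c] interchangeably ...

    // both frees are needed; the wrapper owns no element memory of its own
    cpu_deallocMatrixWrapper(rows);
    cpu_deallocArray(flat);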

quest/src/api/matrices.cpp

Lines changed: 18 additions & 18 deletions

@@ -82,9 +82,10 @@ template <class T>
 void freeHeapMatrix(T matr) {

     // free the 1D or 2D matrix - safe even if nullptr
-    if constexpr (util_isDenseMatrixType<T>())
-        cpu_deallocMatrix(matr.cpuElems, matr.numRows);
-    else
+    if constexpr (util_isDenseMatrixType<T>()) {
+        cpu_deallocMatrixWrapper(matr.cpuElems);
+        cpu_deallocArray(matr.cpuElemsFlat);
+    } else
         cpu_deallocArray(matr.cpuElems);

     // we avoid invoking a GPU function in non-GPU mode

@@ -110,20 +111,16 @@ bool didAnyLocalAllocsFail(T matr) {

     // outer CPU memory should always be allocated
     if constexpr (util_isDenseMatrixType<T>()) {
-        if (!mem_isAllocated(matr.cpuElems, matr.numRows))
-            return true;
-    } else {
-        if (!mem_isAllocated(matr.cpuElems))
-            return true;
-    }
+        if (!mem_isAllocated(matr.cpuElemsFlat)) return true;
+        if (!mem_isOuterAllocated(matr.cpuElems)) return true;
+    } else
+        if (!mem_isAllocated(matr.cpuElems)) return true;

     // if memory is 2D, we must also check each inner array was allocated
     if constexpr (util_isDenseMatrixType<T>()) {
-        if (!mem_isAllocated(matr.cpuElems, matr.numRows))
-            return true;
+        if (!mem_isAllocated(matr.cpuElems, matr.numRows)) return true;
     } else {
-        if (!mem_isAllocated(matr.cpuElems))
-            return true;
+        if (!mem_isAllocated(matr.cpuElems)) return true;
     }

     // if GPU memory is not allocated in a GPU environment...

@@ -197,6 +194,11 @@ extern "C" CompMatr createCompMatr(int numQubits) {
     qindex numRows = powerOf2(numQubits);
     qindex numElems = numRows * numRows;

+    qcomp* cpuMem = cpu_allocArray(numElems); // nullptr if failed
+    qcomp* gpuMem = nullptr;
+    if (getQuESTEnv().isGpuAccelerated)
+        gpuMem = gpu_allocArray(numElems); // nullptr if failed
+
     // initialise all CompMatr fields inline because most are const
     CompMatr out = {
         .numQubits = numQubits,

@@ -207,11 +209,9 @@ extern "C" CompMatr createCompMatr(int numQubits) {
         .isHermitian = cpu_allocHeapFlag(), // nullptr if failed
         .wasGpuSynced = cpu_allocHeapFlag(), // nullptr if failed

-        // 2D CPU memory
-        .cpuElems = cpu_allocMatrix(numRows), // nullptr if failed, or may contain nullptr
-
-        // 1D GPU memory
-        .gpuElemsFlat = (getQuESTEnv().isGpuAccelerated)? gpu_allocArray(numElems) : nullptr // nullptr if failed or not needed
+        .cpuElems = cpu_allocAndInitMatrixWrapper(cpuMem, numRows), // nullptr if failed
+        .cpuElemsFlat = cpuMem,
+        .gpuElemsFlat = gpuMem
     };

     validateMatrixAllocs(out, __func__);
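
The new mem_isOuterAllocated() check only needs to confirm that the row-pointer wrapper itself was successfully malloc'd, since its entries are offsets into cpuElemsFlat rather than independent allocations. Its definition is not part of this diff; a plausible minimal form (an assumption, not the committed implementation) would be:

    bool mem_isOuterAllocated(qcomp** ptr) {

        // the inner row pointers alias cpuElemsFlat, so only
        // the outer pointer itself can have failed to allocate
        return ptr != nullptr;
    }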

quest/src/core/errors.cpp

Lines changed: 5 additions & 0 deletions

@@ -588,6 +588,11 @@ void error_gpuMemSyncQueriedButEnvNotGpuAccelerated() {
     raiseInternalError("A function checked whether persistent GPU memory (such as in a CompMatr) had been synchronised, but the QuEST environment is not GPU accelerated.");
 }

+void error_gpuDeadCopyMatrixFunctionCalled() {
+
+    raiseInternalError("The internal GPU function copyMatrixIfGpuCompiled() was called, though is intended as dead-code - matrices needing copying to GPU should be stored as flat row-wise lists.");
+}
+
 void assert_quregIsGpuAccelerated(Qureg qureg) {

     if (!qureg.isGpuAccelerated)

quest/src/core/errors.hpp

Lines changed: 2 additions & 0 deletions

@@ -228,6 +228,8 @@ void error_gpuMemSyncQueriedButEnvNotGpuAccelerated();

 void error_gpuUnexpectedlyInaccessible();

+void error_gpuDeadCopyMatrixFunctionCalled();
+
 void assert_gpuIsAccessible();

 void assert_quregIsGpuAccelerated(Qureg qureg);

quest/src/cpu/cpu_config.cpp

Lines changed: 33 additions & 9 deletions

@@ -101,17 +101,41 @@ void cpu_deallocArray(qcomp* arr) {
 }


+qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {
+
+    // allocate only the outer memory (i.e. one row's worth)
+    qcomp** out = (qcomp**) malloc(dim * sizeof *out);
+
+    // caller will handle malloc failure
+    if (out == nullptr)
+        return out;
+
+    // populate out with offsets of arr
+    for (qindex i=0; i<dim; i++)
+        out[i] = &arr[i*dim];
+
+    return out;
+}
+
+
+void cpu_deallocMatrixWrapper(qcomp** wrapper) {
+
+    // only the outer pointer is freed; the
+    // inner pointers are offsets to another
+    // malloc which is separately freed
+    free(wrapper);
+}
+
+
 qcomp** cpu_allocMatrix(qindex dim) {

-    // TODO:
-    // the design of storing the CPU matrix elements as a 2D structure will impede
-    // performance for many qubits; the allocated heap memories for each row
-    // have no gaurantee to reside near other, so that their access/iteration in
-    // hot loops may incur unnecessary caching penalties. Consider storing the
-    // elements as a flat array, like we do for the GPU memory. This makes manual
-    // modification by the user trivially harder (changing [r][c] to [r*n+c]),
-    // but should improve caching, and significantly simplify allocation and its
-    // validation; no more enumerating nested pointers! Benchmark this scenario.
+    // NOTE:
+    // this function creates a matrix where rows are not necessarily
+    // contiguous in memory, which can incur gratuitous caching penalties
+    // when accessed in hot loops. As such, we do not use this function
+    // to allocate memory for CompMatr (instead, cpu_allocAndInitMatrixWrapper()),
+    // but instead use it for the individual Kraus matrices of a KrausMap,
+    // which are each quadratically smaller than the important superoperator.

     // allocate outer array
     qcomp** rows = (qcomp**) malloc(dim * sizeof *rows); // nullptr if failed
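
The caching rationale in the replacement NOTE is easiest to see in a hot loop over every element. With the new wrapper, 2D indexing dereferences into one contiguous buffer and so streams memory exactly like the flat loop, whereas a matrix from cpu_allocMatrix() hops between dim unrelated heap blocks, one per row. A rough sketch (hypothetical variables, not code from this commit):

    // rows was built by cpu_allocAndInitMatrixWrapper(flat, dim), so rows[r][c]
    // and flat[r*dim + c] read the same contiguous buffer of dim*dim qcomps
    qcomp total = 0;
    for (qindex r=0; r<dim; r++)
        for (qindex c=0; c<dim; c++)
            total += rows[r][c];

    for (qindex i=0; i<dim*dim; i++)
        total += flat[i];   // identical memory traffic, simpler indexing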

quest/src/cpu/cpu_config.hpp

Lines changed: 3 additions & 0 deletions

@@ -41,6 +41,9 @@ int cpu_getOpenmpThreadInd();
 qcomp* cpu_allocArray(qindex length);
 void cpu_deallocArray(qcomp* arr);

+qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim);
+void cpu_deallocMatrixWrapper(qcomp** wrapper);
+
 qcomp** cpu_allocMatrix(qindex dim);
 void cpu_deallocMatrix(qcomp** matrix, qindex dim);

quest/src/gpu/gpu_config.cpp

Lines changed: 40 additions & 4 deletions

@@ -333,6 +333,14 @@ void copyArrayIfGpuCompiled(qcomp* cpuArr, qcomp* gpuArr, qindex numElems, enum
 void copyMatrixIfGpuCompiled(qcomp** cpuMatr, qcomp* gpuArr, qindex matrDim, enum CopyDirection direction) {
 #if COMPILE_CUDA

+    // NOTE:
+    // this function copies a 2D CPU matrix into a 1D row-major GPU array,
+    // although this is not actually needed by the QuEST backend which
+    // maintains 1D row-major CPU memories merely aliased by 2D structures
+    // for the user's benefit. As such, this is dead code, but preserved in
+    // case it is ever needed (like if custom user 2D data was needed in GPU).
+    error_gpuDeadCopyMatrixFunctionCalled();
+
     // for completeness, we permit copying from the 1D GPU memory to the 2D CPU memory,
     // although we never actually have the need to do this!
     auto flag = (direction == TO_HOST)?

@@ -400,11 +408,25 @@ void gpu_copyGpuToCpu(Qureg qureg) {

 void gpu_copyCpuToGpu(CompMatr matr) {
     assertHeapObjectGpuMemIsAllocated(matr);
-    copyMatrixIfGpuCompiled(matr.cpuElems, util_getGpuMemPtr(matr), matr.numRows, TO_DEVICE);
+
+    // note matr.cpuElems is merely a 2D alias for matr.cpuElemsFlat, which
+    // matches the format of matr.gpuElemsFlat. Ergo, we do not invoke
+    // copyMatrixIfGpuCompiled(), and instead more efficiently overwrite
+    // the contiguous memory, which retains any user changes to .cpuElems
+
+    qindex numElems = matr.numRows * matr.numRows;
+    copyArrayIfGpuCompiled(matr.cpuElemsFlat, util_getGpuMemPtr(matr), numElems, TO_DEVICE);
 }
 void gpu_copyGpuToCpu(CompMatr matr) {
     assertHeapObjectGpuMemIsAllocated(matr);
-    copyMatrixIfGpuCompiled(matr.cpuElems, util_getGpuMemPtr(matr), matr.numRows, TO_HOST);
+
+    // note matr.cpuElems is merely a 2D alias for matr.cpuElemsFlat, which
+    // matches the format of matr.gpuElemsFlat. Ergo, we do not invoke
+    // copyMatrixIfGpuCompiled(), and instead more efficiently overwrite
+    // the contiguous matr.cpuElemsFlat, which users can access via .cpuElems
+
+    qindex numElems = matr.numRows * matr.numRows;
+    copyArrayIfGpuCompiled(matr.cpuElemsFlat, util_getGpuMemPtr(matr), numElems, TO_HOST);
 }

@@ -420,11 +442,25 @@ void gpu_copyGpuToCpu(DiagMatr matr) {

 void gpu_copyCpuToGpu(SuperOp op) {
     assertHeapObjectGpuMemIsAllocated(op);
-    copyMatrixIfGpuCompiled(op.cpuElems, util_getGpuMemPtr(op), op.numRows, TO_DEVICE);
+
+    // note op.cpuElems is merely a 2D alias for op.cpuElemsFlat, which
+    // matches the format of op.gpuElemsFlat. Ergo, we do not invoke
+    // copyMatrixIfGpuCompiled(), and instead more efficiently overwrite
+    // the contiguous memory, which retains any user changes to .cpuElems
+
+    qindex numElems = op.numRows * op.numRows;
+    copyArrayIfGpuCompiled(op.cpuElemsFlat, util_getGpuMemPtr(op), numElems, TO_DEVICE);
 }
 void gpu_copyGpuToCpu(SuperOp op) {
     assertHeapObjectGpuMemIsAllocated(op);
-    copyMatrixIfGpuCompiled(op.cpuElems, util_getGpuMemPtr(op), op.numRows, TO_HOST);
+
+    // note op.cpuElems is merely a 2D alias for op.cpuElemsFlat, which
+    // matches the format of op.gpuElemsFlat. Ergo, we do not invoke
+    // copyMatrixIfGpuCompiled(), and instead more efficiently overwrite
+    // the contiguous op.cpuElemsFlat, which users can access via .cpuElems
+
+    qindex numElems = op.numRows * op.numRows;
+    copyArrayIfGpuCompiled(op.cpuElemsFlat, util_getGpuMemPtr(op), numElems, TO_HOST);
 }
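
Because the CPU and GPU elements now share one flat row-major layout, every sync reduces to a single contiguous transfer instead of numRows per-row copies. The body of copyArrayIfGpuCompiled() is not shown in this diff; under CUDA it presumably reduces to something like the following sketch (an assumption for illustration, not the committed helper):

    // hypothetical core of a host-to-device sync of a CompMatr matr
    qindex numElems = matr.numRows * matr.numRows;
    cudaMemcpy(
        matr.gpuElemsFlat,           // device destination (flat, row-major)
        matr.cpuElemsFlat,           // host source (flat, row-major)
        numElems * sizeof(qcomp),    // one contiguous block
        cudaMemcpyHostToDevice);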
