Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ jobs:
testFloatRelu
testFloatMaxPool
testFloatMatmul
testFloatReshapeWithSkipConnection
testFloatSoftmax
testFloatTranspose
testFloatMul
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
This release contains major architectural changes, new platform support, enhanced simulation workflows, floating-point kernel support, training infrastructure for CCT models, memory allocation strategies, and documentation improvements.

### List of Pull Requests
- Reshape Memory Freeing and Generic Float GEMM Fixes [#91](https://github.com/pulp-platform/Deeploy/pull/91)
- Prepare for Release and Separate Dependencies [#90](https://github.com/pulp-platform/Deeploy/pull/90)
- Move PULP SDK to main branch/fork [#88](https://github.com/pulp-platform/Deeploy/pull/88)
- Finite Lifetime for IO Tensors [#51](https://github.com/pulp-platform/Deeploy/pull/51)
Expand Down Expand Up @@ -63,6 +64,9 @@ This release containing major architectural changes, new platform support, enhan


### Added
- New alias list parameter for buffer objects
- New test, also included in the CI pipeline, for the reshape and skip connection situation
- 'shape' parameter handling similar to the 'indices' parameter in the generic reshape template
- Test the correctness of the memory map generated by the tiler
- Add attribute to `VariableBuffer` to distinguish I/Os
- Add proper static memory allocation with finite lifetime for I/Os
Expand Down Expand Up @@ -229,6 +233,8 @@ This release containing major architectural changes, new platform support, enhan
- Packages listed in `dev-requirements.txt` are installed in the final stage of the Deeploy container.

### Fixed
- Buffer deallocation to only happen when all its aliases are not live anymore (the data stored there is not needed anymore, not even by other nodes)
- GEMM Generic float template to iterate through terms only when they actually contain multiple matrices
- Fix the PULP Deployer where outputs were unnecessarily loaded in L3
- Fix the lifetime computation of aliased buffers
- Removed unsupported `-MMD` compiler flag in LLVM-based toolchains.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,16 +152,32 @@ def apply(self,
# We have to allocate the output buffers, unless they are global

for buffer in list(reversed(outputNames)) + transientBuffers:
# Extract buffer info from context
nb = ctxt.lookup(buffer)

# Check that it was not already allocated
assert ctxt.localObjects[nb.name]._live == False, f"Tried to allocate already live buffer {nb.name}"

# Mark it as live
ctxt.localObjects[nb.name]._live = True

# Add the allocation code to the execution block
executionBlock.addLeft(nb.allocTemplate, nb._bufferRepresentation())

for buffer in inputNames + transientBuffers:
# Extract buffer info from context
nb = ctxt.lookup(buffer)

# Check that it was not already deallocated
assert ctxt.localObjects[nb.name]._live == True, f"Tried to deallocate already dead buffer {nb.name}"

# Mark it as dead (not useful anymore)
ctxt.localObjects[nb.name]._live = False
executionBlock.addRight(nb.deallocTemplate, nb._bufferRepresentation())

# Check for live ancestors (buffers that this is an alias of, that are still live),
# and add the deallocation code to the execution block if none found
if not nb.has_live_ancestors(ctxt = ctxt):
executionBlock.addRight(nb.deallocTemplate, nb._bufferRepresentation())

return ctxt, executionBlock

Expand Down
61 changes: 60 additions & 1 deletion Deeploy/DeeployTypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ class VariableBuffer():
allocTemplate: NodeTemplate #: NodeTemplate: Holds the buffer's allocation code
deallocTemplate: NodeTemplate #: NodeTemplate: Holds the buffer's deallocation code

def __init__(self, name: str = '', shape = [1]):
def __init__(self, name: str = '', shape = [1], alias_of: Optional[List[str]] = []):
self.name: str = name #: str: Canonical name that this buffer is registered as in the NetworkContext
self.shape: Sequence[
int] = shape #: Sequence[int]: Represents the dimensions of the underlying tensor as a sequence of dimension sizes
Expand All @@ -274,6 +274,8 @@ def __init__(self, name: str = '', shape = [1]):
self.is_input: bool = False
self.is_output: bool = False

self.alias_of: List[str] = alias_of if alias_of is not None else []

def _bufferRepresentation(self) -> Dict:
return {"type": self._instance, "name": self.name, "size": int(np.prod(self.shape))}

Expand Down Expand Up @@ -339,6 +341,61 @@ def __getstate__(self):
def fromNode(cls, node: gs.Node):
return (cls(name = node.name, shape = node.shape if not isinstance(node, gs.Constant) else node.values.shape))

def add_aliases(self, aliases_to_add: List[str]):
    """Extend the ``alias_of`` attribute with the given alias names.

    Parameters
    ----------
    aliases_to_add : List[str]
        Names of aliases to record on this buffer. Names already
        present are skipped, so the list stays duplicate-free and
        keeps its insertion order.

    Returns
    -------
    None
    """

    # Guard for instances that lack the attribute; leave them untouched.
    if not hasattr(self, "alias_of"):
        return None

    known = self.alias_of
    for name in aliases_to_add:
        if name not in known:
            known.append(name)

    return None

def get_aliases_of(self):
    """Getter for the ``alias_of`` attribute.

    Returns
    -------
    List[str]
        The ``alias_of`` list itself when the attribute exists (callers
        receive the live list object), otherwise a fresh empty list.
    """

    # getattr returns the attribute's list object when present and a
    # new empty list when the attribute is missing.
    return getattr(self, "alias_of", list())

def has_live_ancestors(self, ctxt: NetworkContext) -> bool:
    """Report whether any buffer aliased by this one is still live.

    Parameters
    ----------
    ctxt : NetworkContext
        Current NetworkContext, used to resolve alias names to buffers.

    Returns
    -------
    bool
        True if at least one aliased buffer is still marked live,
        False otherwise (including when no alias tracking exists).
    """

    # Instances without the attribute have nothing to keep alive.
    if not hasattr(self, "alias_of"):
        return False

    # any() short-circuits exactly like the original early-return loop.
    return any(ctxt.lookup(name)._live for name in self.alias_of)


class TransientBuffer(VariableBuffer):
"""Class to represent memory space required by kernels that is not covered by input and output tensors, e.g. im2col buffers in convolutions
Expand All @@ -365,6 +422,8 @@ def __init__(self, name: str = '', size = 0):
self.is_input: bool = False
self.is_output: bool = False

self.alias_of: List[str] = []

def __eq__(self, other):

ret = all([self.name == other.name, self.size == other.size])
Expand Down
52 changes: 48 additions & 4 deletions Deeploy/Targets/Generic/Parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1001,14 +1001,28 @@ def parseNodeCtxt(self,
node: gs.Node,
channels_first: bool = True) -> Tuple[NetworkContext, bool]:

# Define names of node inputs and outputs, according to the ONNX standard
inputs = ['data_in', 'shape']
outputs = ['data_out']

# Map inputs and outputs to their corresponding names in the operator representation
for idx, inputNode in enumerate(node.inputs):
self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
for idx, outputNode in enumerate(node.outputs):
self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

# Update alias_of parameter for the output node
output_node = ctxt.lookup(node.outputs[outputs.index("data_out")].name)
input_node = ctxt.lookup(node.inputs[inputs.index("data_in")].name)

# Prepare new aliases
new_output_node_aliases = input_node.get_aliases_of()
new_output_node_aliases.append(input_node.name)

# Add new aliases
output_node.add_aliases(aliases_to_add = new_output_node_aliases)

# Compute data size
self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)

return ctxt, True
Expand Down Expand Up @@ -1623,9 +1637,13 @@ def parseNodeCtxt(self,
node.inputs.append(zeroTensor)
self.operatorRepresentation['C'] = f'{node.name}_C_Tensor'

# Store the input and output shapes in the operator representation
self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)
self.operatorRepresentation['A_shape'] = ctxt.lookup(node.inputs[0].name).shape
self.operatorRepresentation['B_shape'] = ctxt.lookup(node.inputs[1].name).shape
self.operatorRepresentation['data_out_shape'] = ctxt.lookup(node.outputs[0].name).shape

# Store the matrix dimensions in the operator representation
self.operatorRepresentation['M'] = ctxt.lookup(
node.inputs[0].name).shape[(-2 + self.operatorRepresentation['transA'])]
self.operatorRepresentation['N'] = ctxt.lookup(
Expand All @@ -1637,11 +1655,24 @@ def parseNodeCtxt(self,
ret = ret and (self.operatorRepresentation['N'] == ctxt.lookup(
node.inputs[1].name).shape[-2 + self.operatorRepresentation['transB']])

self.operatorRepresentation['batch'] = np.prod(ctxt.lookup(node.inputs[0].name).shape[:-2])
# Check if the batch dimensions are compatible
self.operatorRepresentation['batch_A'] = np.prod(ctxt.lookup(node.inputs[0].name).shape[:-2])
self.operatorRepresentation['batch_B'] = np.prod(ctxt.lookup(node.inputs[1].name).shape[:-2])

self.operatorRepresentation['batch'] = max(self.operatorRepresentation['batch_A'],
self.operatorRepresentation['batch_B'])

# SCHEREMO: Assert that batch is the same on both matrices
W_batched = (self.operatorRepresentation['batch'] == np.prod(ctxt.lookup(node.inputs[1].name).shape[:-2]))
self.operatorRepresentation['W_batched'] = W_batched
assert (self.operatorRepresentation["batch_A"] == self.operatorRepresentation["batch_B"]) or (
self.operatorRepresentation["batch_A"] == 1
) or (
self.operatorRepresentation["batch_B"] == 1
), "Incompatible dimensions for input matrices. Broadcasting not yet supported for dimensions larger than 1 on one of the inputs, or equal dimensions between the 2."

# Create flags for same dimension between each input matrix and the final batch dimension
self.operatorRepresentation['A_batched'] = (self.operatorRepresentation['batch'] == np.prod(
ctxt.lookup(node.inputs[0].name).shape[:-2]))
self.operatorRepresentation['W_batched'] = self.operatorRepresentation['B_batched'] = (
self.operatorRepresentation['batch'] == np.prod(ctxt.lookup(node.inputs[1].name).shape[:-2]))

return ctxt, ret

Expand Down Expand Up @@ -1745,12 +1776,25 @@ def parseNodeCtxt(self,
self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name

if len(node.inputs) == 3:
# Compute bias name and shape if present in the inputs
self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name
self.operatorRepresentation['C_shape'] = newCtxt.lookup(node.inputs[2].name).shape

# Create flag for same dimension between bias matrix and the final batch dimension
self.operatorRepresentation['C_batched'] = (self.operatorRepresentation['batch'] == np.prod(
newCtxt.lookup(node.inputs[2].name).shape[:-2]))
elif not self.noBiasHoisting:
# Create mock bias matrix if not present in the inputs
values = np.zeros((1))
zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values)
newCtxt.hoistConstant(zeroTensor)

# Store it in the operator representation
self.operatorRepresentation['C'] = f'{node.name}_C_Tensor'
self.operatorRepresentation['C_shape'] = (0,)

# Create flag for same dimension between bias matrix and the final batch dimension
self.operatorRepresentation['C_batched'] = False

self.operatorRepresentation['size'] = np.prod(newCtxt.lookup(node.inputs[0].name).shape)

Expand Down
9 changes: 9 additions & 0 deletions Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,18 @@
${transB}
);

% if A_batched:
ref_${data_out}_${A} += ${M} * ${N};
% endif

% if B_batched:
ref_${data_out}_${B} += ${N} * ${O};
% endif

% if C_batched:
ref_${data_out}_${C} += ${M} * ${O};
% endif

ref_${data_out}_${data_out} += ${M} * ${O};
}
END_SINGLE_CORE
Expand Down
5 changes: 5 additions & 0 deletions Deeploy/Targets/Generic/Templates/ReshapeTemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ def alignToContext(self, ctxt: NetworkContext,
ctxt.globalObjects[operatorRepresentation['indices']]._deploy = False
ctxt.globalObjects[operatorRepresentation['indices']]._live = False

# Same for "shape"
if "shape" in operatorRepresentation.keys():
ctxt.globalObjects[operatorRepresentation["shape"]]._deploy = False
ctxt.globalObjects[operatorRepresentation["shape"]]._live = False

inBuffer = ctxt.lookup(operatorRepresentation['data_in'])
outBuffer = ctxt.lookup(operatorRepresentation['data_out'])
outBuffer._alias = inBuffer.name
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading