pulp-platform · Xeratec · Sep 16, 2025 · Sep 8, 2025
@@ -55,7 +55,9 @@ jobs:
           {"name":"testFloatSoftmax","L1":[4000]},
           {"name":"testFloatTranspose","L1":[2000]},
           {"name":"testFloatMul","L1":[2000]},
-          {"name":"largeFloatAdd","L1":[220000]}
+          {"name":"largeFloatAdd","L1":[220000]},
+          {"name":"testRQGEMMwBatch","L1":[20000]},
+          {"name":"testMatMulBatch","L1":[20000]}
         ]
       num-cores: 8
 

@@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 ## Unreleased (Planned Release Target: v0.2.1)
 
 ### List of Pull Requests
+- Fix PULP GEMM `batch` serialization [#109](https://github.com/pulp-platform/Deeploy/pull/109)
 - Split CI Workflows by Platform and Task, Improve Formatting and Linting Reliability [#108](https://github.com/pulp-platform/Deeploy/pull/108)
 - Refactor tiling code generation [#105](https://github.com/pulp-platform/Deeploy/pull/105)
 - Change order of typeMatching entries [#68](https://github.com/pulp-platform/Deeploy/pull/68)
@@ -61,6 +62,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Prevent node duplication for graphs generated via GraphSurgeon
 - Resolved issue with missing `id` in the `Build Cache for Docker` step, used in the `Inject build-cache` step.
 - Fix license CI check and prevent potential issues with `jq` installation
+- Fix PULP GEMM `batch` variable serialization
 
 ### Removed
 - Delete outdated and unused `.gitlab-ci.yml` file

@@ -24,6 +24,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
 from typing import Dict, List, Tuple
 
 from Deeploy.AbstractDataTypes import PointerClass
@@ -135,25 +136,22 @@ def serializeTilingSolution(
 
         # Every output is constructed by a pair of inputs. Reconstruct this pair.
         for cube in outputCubes:
+            MOffset, OOffset = cube.offset[-2:]
+            MSize, OSize = cube.dims[-2:]
 
-            BSize = 1
-            BOffset = 0
-            BatchSize = 1
-            BatchOffset = 0
-
-            if len(cube.offset) == 2:
-                (MOffset, OOffset) = cube.offset
-                (MSize, OSize) = cube.dims
-            elif len(cube.offset) == 3:
-                (BatchOffset, MOffset, OOffset) = cube.offset
-                (BatchSize, MSize, OSize) = cube.dims
+            if len(cube.offset) > 2:
+                BatchSize = math.prod(cube.dims[:-2])
+
+                if len(cube.offset) > 3:
+                    assert all(off == 0 for off in cube.offset[:-3]), (
+                        f"Unsupported tiling across leading batch dims: offsets={cube.offset}. "
+                        "Only the last batch dim (besides M/O) may be tiled.")
             else:
-                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
-                (BatchSize, BSize, MSize, OSize) = cube.dims
+                BatchSize = 1
 
             replacements["M"].append(MSize)
             replacements["O"].append(OSize)
-            replacements["batch"].append(BSize)
+            replacements["batch"].append(BatchSize)
 
             if transA == 0:
                 AMatrixOffsets = (MOffset, NOffset)
@@ -162,49 +160,30 @@ def serializeTilingSolution(
                 AMatrixOffsets = (NOffset, MOffset)
                 AMatrixShape = (NSize, MSize)
 
+            if len(buffA.shape) > 2:
+                batchDimCount = len(buffA.shape) - 2
+                AMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + AMatrixOffsets
+                AMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + AMatrixShape
+
+            ACube = HyperRectangle(AMatrixOffsets, AMatrixShape)
+            inputACubes.append(ACube)
+
             if transB == 0:
                 BMatrixOffsets = (NOffset, OOffset)
                 BMatrixShape = (NSize, OSize)
             else:
                 BMatrixOffsets = (OOffset, NOffset)
                 BMatrixShape = (OSize, NSize)
 
-            if len(buffA.shape) == 2:
-                ACube = HyperRectangle(AMatrixOffsets, AMatrixShape)
-            elif len(buffA.shape) == 3:
-                ACube = HyperRectangle((BatchOffset,) + AMatrixOffsets, (BatchSize,) + AMatrixShape)
-            else:
-                ACube = HyperRectangle(
-                    (
-                        BatchOffset,
-                        BOffset,
-                    ) + AMatrixOffsets,
-                    (
-                        BatchSize,
-                        BSize,
-                    ) + AMatrixShape,
-                )
-
-            if len(buffB.shape) == 2:
-                BCube = HyperRectangle(BMatrixOffsets, BMatrixShape)
-            elif len(buffB.shape) == 3:
-                BCube = HyperRectangle((BatchOffset,) + BMatrixOffsets, (BatchSize,) + BMatrixShape)
-            else:
-                BCube = HyperRectangle(
-                    (
-                        BatchOffset,
-                        BOffset,
-                    ) + BMatrixOffsets,
-                    (
-                        BatchSize,
-                        BSize,
-                    ) + BMatrixShape,
-                )
-
-            RequantCube = HyperRectangle((OOffset,), (OSize,))
+            if len(buffB.shape) > 2:
+                batchDimCount = len(buffB.shape) - 2
+                BMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + BMatrixOffsets
+                BMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + BMatrixShape
 
-            inputACubes.append(ACube)
+            BCube = HyperRectangle(BMatrixOffsets, BMatrixShape)
             inputBCubes.append(BCube)
+
+            RequantCube = HyperRectangle((OOffset,), (OSize,))
             inputMulCubes.append(RequantCube)
             inputAddCubes.append(RequantCube)
 
@@ -231,40 +210,6 @@ def serializeTilingSolution(
         return VariableReplacementScheme(replacements, replacementTypes), schedule
 
 
-class MatrixVecTileConstraint(GEMMTileConstraint):
-
-    @staticmethod
-    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
-
-        tm = GEMMTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt)
-
-        return tm
-
-    @staticmethod
-    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
-
-        tm = GEMMTileConstraint.addPolicyConstraint(tilerModel, parseDict, ctxt)
-
-        return tm
-
-
-class TallGEMMTileConstraint(GEMMTileConstraint):
-
-    @staticmethod
-    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
-
-        tm = GEMMTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt)
-
-        return tm
-
-    @staticmethod
-    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
-
-        tm = GEMMTileConstraint.addPolicyConstraint(tilerModel, parseDict, ctxt)
-
-        return tm
-
-
 class FloatGEMMTileConstraint(TileConstraint):
 
     @staticmethod
@@ -367,25 +312,22 @@ def serializeTilingSolution(
 
         # Every output is constructed by a pair of inputs. Reconstruct this pair.
         for cube in outputCubes:
+            MOffset, OOffset = cube.offset[-2:]
+            MSize, OSize = cube.dims[-2:]
 
-            BSize = 1
-            BOffset = 0
-            BatchSize = 1
-            BatchOffset = 0
-
-            if len(cube.offset) == 2:
-                (MOffset, OOffset) = cube.offset
-                (MSize, OSize) = cube.dims
-            elif len(cube.offset) == 3:
-                (BatchOffset, MOffset, OOffset) = cube.offset
-                (BatchSize, MSize, OSize) = cube.dims
+            if len(cube.offset) > 2:
+                BatchSize = math.prod(cube.dims[:-2])
+
+                if len(cube.offset) > 3:
+                    assert all(off == 0 for off in cube.offset[:-3]), (
+                        f"Unsupported tiling across leading batch dims: offsets={cube.offset}. "
+                        "Only the last batch dim (besides M/O) may be tiled.")
             else:
-                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
-                (BatchSize, BSize, MSize, OSize) = cube.dims
+                BatchSize = 1
 
             replacements["M"].append(MSize)
             replacements["O"].append(OSize)
-            replacements["batch"].append(BSize)
+            replacements["batch"].append(BatchSize)
 
             if transA == 0:
                 AMatrixOffsets = (MOffset, NOffset)
@@ -394,57 +336,38 @@ def serializeTilingSolution(
                 AMatrixOffsets = (NOffset, MOffset)
                 AMatrixShape = (NSize, MSize)
 
+            if len(buffA.shape) > 2:
+                batchDimCount = len(buffA.shape) - 2
+                AMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + AMatrixOffsets
+                AMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + AMatrixShape
+
+            ACube = HyperRectangle(AMatrixOffsets, AMatrixShape)
+            inputACubes.append(ACube)
+
             if transB == 0:
                 BMatrixOffsets = (NOffset, OOffset)
                 BMatrixShape = (NSize, OSize)
             else:
                 BMatrixOffsets = (OOffset, NOffset)
                 BMatrixShape = (OSize, NSize)
 
-            if len(buffA.shape) == 2:
-                ACube = HyperRectangle(AMatrixOffsets, AMatrixShape)
-            elif len(buffA.shape) == 3:
-                ACube = HyperRectangle((BatchOffset,) + AMatrixOffsets, (BatchSize,) + AMatrixShape)
-            else:
-                ACube = HyperRectangle(
-                    (
-                        BatchOffset,
-                        BOffset,
-                    ) + AMatrixOffsets,
-                    (
-                        BatchSize,
-                        BSize,
-                    ) + AMatrixShape,
-                )
-
-            if len(buffB.shape) == 2:
-                BCube = HyperRectangle(BMatrixOffsets, BMatrixShape)
-            elif len(buffB.shape) == 3:
-                BCube = HyperRectangle((BatchOffset,) + BMatrixOffsets, (BatchSize,) + BMatrixShape)
-            else:
-                BCube = HyperRectangle(
-                    (
-                        BatchOffset,
-                        BOffset,
-                    ) + BMatrixOffsets,
-                    (
-                        BatchSize,
-                        BSize,
-                    ) + BMatrixShape,
-                )
+            if len(buffB.shape) > 2:
+                batchDimCount = len(buffB.shape) - 2
+                BMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + BMatrixOffsets
+                BMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + BMatrixShape
+
+            BCube = HyperRectangle(BMatrixOffsets, BMatrixShape)
+            inputBCubes.append(BCube)
 
             CMatrixOffsets = (MOffset, OOffset)
             CMatrixShape = (MSize, OSize)
 
-            if len(buffC.shape) == 2:
-                CCube = HyperRectangle(CMatrixOffsets, CMatrixShape)
-            elif len(buffC.shape) == 3:
-                CCube = HyperRectangle((BatchOffset,) + CMatrixOffsets, (BatchSize,) + CMatrixShape)
-            else:
-                CCube = HyperRectangle((BatchOffset, BOffset) + CMatrixOffsets, (BatchSize, BSize) + CMatrixShape)
+            if len(buffC.shape) > 2:
+                batchDimCount = len(buffC.shape) - 2
+                CMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + CMatrixOffsets
+                CMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + CMatrixShape
 
-            inputACubes.append(ACube)
-            inputBCubes.append(BCube)
+            CCube = HyperRectangle(CMatrixOffsets, CMatrixShape)
             inputAddCubes.append(CCube)
 
         inputLoadSchedule = []

@@ -24,6 +24,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
 from typing import Dict, List, Tuple
 
 from Deeploy.AbstractDataTypes import PointerClass
@@ -125,65 +126,43 @@ def serializeTilingSolution(
 
         # Every output is constructed by a pair of inputs. Reconstruct this pair.
         for cube in outputCubes:
+            MOffset, OOffset = cube.offset[-2:]
+            MSize, OSize = cube.dims[-2:]
 
-            BSize = 1
-            BOffset = 0
-            BatchSize = 1
-            BatchOffset = 0
-
-            if len(cube.offset) == 2:
-                (MOffset, OOffset) = cube.offset
-                (MSize, OSize) = cube.dims
-            elif len(cube.offset) == 3:
-                (BatchOffset, MOffset, OOffset) = cube.offset
-                (BatchSize, MSize, OSize) = cube.dims
+            if len(cube.offset) > 2:
+                BatchSize = math.prod(cube.dims[:-2])
+
+                if len(cube.offset) > 3:
+                    assert all(off == 0 for off in cube.offset[:-3]), (
+                        f"Unsupported tiling across leading batch dims: offsets={cube.offset}. "
+                        "Only the last batch dim (besides M/O) may be tiled.")
             else:
-                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
-                (BatchSize, BSize, MSize, OSize) = cube.dims
+                BatchSize = 1
 
             replacements["M"].append(MSize)
             replacements["O"].append(OSize)
-            replacements["batch"].append(BSize)
+            replacements["batch"].append(BatchSize)
 
             AMatrixOffsets = (MOffset, NOffset)
             AMatrixShape = (MSize, NSize)
 
+            if len(buffA.shape) > 2:
+                batchDimCount = len(buffA.shape) - 2
+                AMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + AMatrixOffsets
+                AMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + AMatrixShape
+
+            ACube = HyperRectangle(AMatrixOffsets, AMatrixShape)
+            inputACubes.append(ACube)
+
             BMatrixOffsets = (NOffset, OOffset)
             BMatrixShape = (NSize, OSize)
 
-            if len(buffA.shape) == 2:
-                ACube = HyperRectangle(AMatrixOffsets, AMatrixShape)
-            elif len(buffA.shape) == 3:
-                ACube = HyperRectangle((BatchOffset,) + AMatrixOffsets, (BatchSize,) + AMatrixShape)
-            else:
-                ACube = HyperRectangle(
-                    (
-                        BatchOffset,
-                        BOffset,
-                    ) + AMatrixOffsets,
-                    (
-                        BatchSize,
-                        BSize,
-                    ) + AMatrixShape,
-                )
-
-            if len(buffB.shape) == 2:
-                BCube = HyperRectangle(BMatrixOffsets, BMatrixShape)
-            elif len(buffB.shape) == 3:
-                BCube = HyperRectangle((BatchOffset,) + BMatrixOffsets, (BatchSize,) + BMatrixShape)
-            else:
-                BCube = HyperRectangle(
-                    (
-                        BatchOffset,
-                        BOffset,
-                    ) + BMatrixOffsets,
-                    (
-                        BatchSize,
-                        BSize,
-                    ) + BMatrixShape,
-                )
+            if len(buffB.shape) > 2:
+                batchDimCount = len(buffB.shape) - 2
+                BMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + BMatrixOffsets
+                BMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + BMatrixShape
 
-            inputACubes.append(ACube)
+            BCube = HyperRectangle(BMatrixOffsets, BMatrixShape)
             inputBCubes.append(BCube)
 
         inputLoadSchedule = []