pulp-platform · Victor-Jung · Feb 11, 2025 · Jan 29, 2025 · Jan 29, 2025 · Jan 29, 2025
@@ -115,6 +115,7 @@ jobs:
         testRQGEMM
         TestRQAdd
         testRQGEMMTransB
+        testFloatSoftmax
       num-cores: 9
       simulators: |
         banshee
@@ -141,9 +142,24 @@ jobs:
             "name": "testRQGEMM",
             "L1": [2000, 5000]
           },
+          {
+            "name": "testFloatSoftmax",
+            "L1": [2000, 5000, 10000]
+          },
+
           {
             "name": "TestRQAdd",
             "L1": [5000, 10000]
+          },
+
+          {
+            "name": "testFloatGEMM",
+            "L1": [2000, 5000, 10000]
+          },
+
+          {
+            "name": "testFloatGEMMtransB",
+            "L1": [2000, 5000, 10000]
           }
         ]
       simulators: |

@@ -112,4 +112,17 @@ Change main.c to use OUTPUTTYPE instead of float
 
 ### Fixed
 - MaxPool Padding Extract Pass for float and integer
-- Testinput, testoutput, weight type casted from double to float warning
+- Testinput, testoutput, weight type casted from double to float warning
+
+## Add Float GEMM and Softmax for Snitch platform
+
+### Added
+- New templates for GEMM and Softmax.
+- Added GEMM and Softmax to TargetLibraries, including case for GEMM with a transposed B matrix.
+- Added new CI tests for GEMM and Softmax.
+
+### Changed
+- Adapted snitch Bindings and Platform files.
+
+### Fixed
+- Relaxed the error threshold between expected and actual values in deeploytest.
@@ -6,4 +6,5 @@ All contributors have agreed to an open-source release of their work in the Deep
 * Luka Macan
 * Alberto Dequino
 * Francesco Conti
-* Run Wang
+* Run Wang
+* Taha El Bayad
@@ -549,7 +549,6 @@ def __init__(self):
     def parseNode(self, node: gs.Node) -> bool:
 
         ret = all([len(node.inputs) == 1, len(node.outputs) == 1])
-
         return ret
 
     def parseNodeCtxt(self,

@@ -29,14 +29,15 @@
 from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration
 from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
     MemoryManagementGeneration
-from Deeploy.CommonExtensions.DataTypes import int8_t, int32_t, uint8_t
+from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t
 from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
 from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
 from Deeploy.Targets.Generic.Templates import iNoNormTemplate
 from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker
 from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \
     SnitchProfileExecutionBlockPass, SnitchSynchCoresPass
-from Deeploy.Targets.Snitch.Templates import AddTemplate, RQAddTemplate, iSoftmaxTemplate
+from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate
+from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template
 from Deeploy.Targets.Snitch.Templates.GemmTemplate import SnitchGemm_Template
 from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement
@@ -69,7 +70,11 @@
 SnitchiSoftmaxBindings = [
     NodeBinding(SoftmaxChecker([PointerClass(_type)], [PointerClass(uint8_t)]), iSoftmaxTemplate.referenceTemplate,
                 TiledTransformer) for _type in [int8_t, uint8_t]
+] + [
+    NodeBinding(SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatSoftmax_Template,
+                TiledTransformer)
 ]
+
 SnitchiNoNormBindings = [
     NodeBinding(
         iNoNormChecker([PointerClass(_type), PointerClass(int8_t),
@@ -88,6 +93,11 @@
     NodeBinding(
         GEMMChecker([PointerClass(int8_t), PointerClass(int8_t),
                      PointerClass(int32_t)], [PointerClass(int32_t)]), SnitchGemm_Template, TiledTransformer)
+] + [
+    NodeBinding(
+        GEMMChecker([PointerClass(float32_t), PointerClass(float32_t),
+                     PointerClass(float32_t)], [PointerClass(float32_t)]), FloatGemmTemplate.referenceTemplate,
+        TiledTransformer)
 ]
 SnitchRqGemmBindings = [
     NodeBinding(

@@ -35,7 +35,7 @@
 from Deeploy.Targets.Generic.Layers import AddLayer, GatherLayer, GEMMLayer, LayerNormLayer, MatMulLayer, PadLayer, \
     ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer, iNoNormLayer
 from Deeploy.Targets.Generic.Parsers import AddParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \
-    RQAddParser, RQIntegerDivParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser
+    RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser
 from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import AddRequantMergePass, GEMMRequantMergePass, \
     IntegerDivRequantMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, \
@@ -58,6 +58,7 @@
 GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings)
 RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings)
 iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings)
+SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings)
 iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings)
 iLayerNormMapper = NodeMapper(iLayerNormParser(), BasicLayerNormBindings)
 RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings)
@@ -72,6 +73,7 @@
     'Gemm': GEMMLayer([GemmMapper]),
     'RQGemm': RQGEMMLayer([RqGemmMapper]),
     'iSoftmax': SoftmaxLayer([iSoftmaxMapper]),
+    'Softmax': SoftmaxLayer([SoftmaxMapper]),
     'iNoNorm': iNoNormLayer([iNoNormMapper]),
     'iLayerNorm': LayerNormLayer([iLayerNormMapper]),
     'RequantizedAdd': AddLayer([RQAddMapper]),

@@ -0,0 +1,11 @@
+from Deeploy.DeeployTypes import NodeTemplate
+
+# Template for the Snitch FP32 GEMM kernel call.
+#
+# Emits a call to gemm_fp32_transB_opt when `transB` is truthy and to
+# gemm_fp32_opt otherwise. Both kernels take their arguments in the order
+# (M, N, K, A, ldA, B, ldB, C, ldC, Y, BETA, setup_SSR), so the operator's
+# `O` is passed as the kernel's N and the operator's `N` as the kernel's K.
+# The only difference between the two calls is ldB (`N` with transB, `O`
+# without). BETA and setup_SSR are always passed as 1.
+#
+# NOTE(review): `${M} / compute_num` is an unsigned integer division in the
+# generated C. If M is not a multiple of the compute-core count the remainder
+# rows are presumably never processed -- confirm that the tiling constraints
+# guarantee divisibility.
+referenceTemplate = NodeTemplate("""
+uint32_t compute_num = snrt_cluster_compute_core_num();
+
+% if transB:
+gemm_fp32_transB_opt(${M} / compute_num, ${O}, ${N}, ${A}, ${N} * compute_num, ${B}, ${N}, ${C}, ${O} * compute_num, ${data_out}, 1, 1 );
+% else:                                 
+gemm_fp32_opt(${M} / compute_num, ${O}, ${N}, ${A}, ${N} * compute_num, ${B}, ${O}, ${C}, ${O} * compute_num, ${data_out}, 1, 1 );
+%endif
+""")
@@ -0,0 +1,57 @@
+# ----------------------------------------------------------------------
+#
+# File: FloatSoftmaxTemplate.py
+#
+# Last edited: 30.05.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author:
+# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class FloatSoftmaxTemplate(NodeTemplate):
+
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+
+        data_in = ctxt.lookup(operatorRepresentation["data_in"])
+        operatorRepresentation["seq_len"] = data_in.shape[2]
+        operatorRepresentation["input_samples"] = data_in.shape[-1]
+
+        operatorRepresentation["kernelName"] = "Softmax_fp32"
+
+        return ctxt, operatorRepresentation, []
+
+
+# Template source for the generated softmax call.
+#
+# `seq_len`, `input_samples` and `kernelName` are filled in by
+# FloatSoftmaxTemplate.alignToContext. `size`, `lastDimLength`, `data_in`
+# and `data_out` must already be present in the operator representation
+# (presumably set by the Softmax parser -- TODO confirm).
+FloatSoftmaxTemplateStr = r"""
+    uint32_t batch_size = ${size} / ${lastDimLength};
+    uint32_t compute_num = snrt_cluster_compute_core_num();
+    int32_t ldI = compute_num * ${input_samples};
+    int32_t batch_offset = ${seq_len} * ${input_samples};
+
+    ${kernelName}(${data_in}, ${data_out}, ldI, batch_offset, batch_size, ${seq_len}, ${input_samples});
+"""
+
+# Module-level template instance; this is the name the Snitch bindings import.
+FloatSoftmax_Template = FloatSoftmaxTemplate(FloatSoftmaxTemplateStr)
@@ -141,4 +141,4 @@ void main(void) {
 
 printf("Runtime: %u cycles\r\n", getCycles());
 printf("Errors: %u out of %u \r\n", tot_err, tot_tested);
-}
+}
@@ -132,23 +132,46 @@ int main(void) {
 #ifndef NOTEST
     int32_t tot_err = 0;
     uint32_t tot = 0;
-    int32_t diff;
-    int32_t expected, actual;
-    for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
-
-      tot += DeeployNetwork_outputs_bytes[buf];
-      for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) {
-        expected = ((char *)testOutputVector[buf])[i];
-        actual = ((char *)DeeployNetwork_outputs[buf])[i];
-        diff = expected - actual;
-
-        if (diff) {
-          tot_err += 1;
+    if (ISOUTPUTFLOAT) {
+      float32_t diff;
+      float32_t expected, actual;
+      for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
+
+        tot += DeeployNetwork_outputs_bytes[buf] / sizeof(float32_t);
+        for (uint32_t i = 0;
+             i < DeeployNetwork_outputs_bytes[buf] / sizeof(float32_t); i++) {
+          expected = ((float32_t *)testOutputVector[buf])[i];
+          actual = ((float32_t *)DeeployNetwork_outputs[buf])[i];
+          diff = expected - actual;
+
+          if (diff < -1.2e-5 || diff > 1.2e-5) {
+            tot_err += 1;
+#ifndef CI
+            printf("Expected: %f  ", expected);
+            printf("Actual: %f  ", actual);
+            printf("Diff: %f at Index %12u in Output %u\r\n", diff, i, buf);
+#endif
+          }
+        }
+      }
+    } else {
+      int32_t diff;
+      int32_t expected, actual;
+      for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
+
+        tot += DeeployNetwork_outputs_bytes[buf];
+        for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) {
+          expected = ((char *)testOutputVector[buf])[i];
+          actual = ((char *)DeeployNetwork_outputs[buf])[i];
+          diff = expected - actual;
+          if (diff) {
+            tot_err += 1;
 #ifndef CI
-          printf("Expected: %4d  ", expected);
-          printf("Actual: %4d  ", actual);
-          printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
+            printf("Expected: %4d  ", expected);
+            printf("Actual: %4d  ", actual);
+            printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
 #endif
+          }
         }
       }
     }

@@ -0,0 +1,51 @@
+#ifndef __DEEPLOY_MATH_GEMM_KERNEL_HEADER_
+#define __DEEPLOY_MATH_GEMM_KERNEL_HEADER_
+
+#include "DeeploySnitchMath.h"
+
+/*
+ * TILING ONLY due to ssr loop
+ *
+ *
+ *
+ * FP32 GEMM with the following format:
+ * A is an M x K matrix, B is a K x N matrix, and C is a M x N matrix
+ *
+ * A' = transpose(A) if transA else A
+ * B' = transpose(B) if transB else B
+ *
+ * Y =  A' * B' + C
+ *
+ */
+
+/*
+ *
+ * transposed A    = no
+ * transposed B    = yes
+ * multi-core      = yes
+ * unrolling       = yes
+ * simd            = yes
+ * parallelization = row-wise
+ *
+ * Per the format description above with transB set:
+ * Y = A * transpose(B) + C.
+ *
+ * M, N, K       - matrix dimensions as named in the format description
+ * A, B, C       - input operands
+ * ldA, ldB, ldC - presumably the row strides of A, B and C in elements
+ *                 (TODO confirm against the implementation)
+ * Y             - output matrix
+ * BETA          - presumably enables/scales the C term (TODO confirm)
+ * setup_SSR     - presumably non-zero to configure the SSR streams before
+ *                 the compute loop (TODO confirm)
+ */
+
+void gemm_fp32_transB_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A,
+                          uint32_t ldA, float32_t *B, uint32_t ldB,
+                          float32_t *C, uint32_t ldC, float32_t *Y,
+                          uint32_t BETA, uint32_t setup_SSR);
+
+/*
+ *
+ * transposed A    = no
+ * transposed B    = no
+ * multi-core      = yes
+ * unrolling       = yes
+ * simd            = yes
+ * parallelization = row-wise
+ *
+ * Per the format description above with neither operand transposed:
+ * Y = A * B + C.
+ *
+ * M, N, K       - matrix dimensions as named in the format description
+ * A, B, C       - input operands
+ * ldA, ldB, ldC - presumably the row strides of A, B and C in elements
+ *                 (TODO confirm against the implementation)
+ * Y             - output matrix
+ * BETA          - presumably enables/scales the C term (TODO confirm)
+ * setup_SSR     - presumably non-zero to configure the SSR streams before
+ *                 the compute loop (TODO confirm)
+ */
+
+void gemm_fp32_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A,
+                   uint32_t ldA, float32_t *B, uint32_t ldB, float32_t *C,
+                   uint32_t ldC, float32_t *Y, uint32_t BETA,
+                   uint32_t setup_SSR);
+
+#endif //__DEEPLOY_MATH_GEMM_KERNEL_HEADER_
@@ -0,0 +1,5 @@
+#ifndef __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_
+#define __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_
+
+#include "DeeploySnitchMath.h"
+
+/*
+ * FP32 softmax kernel for the Snitch cluster.
+ *
+ * input         - input buffer
+ * output        - output buffer
+ * ldI           - the Snitch softmax template passes
+ *                 compute_num * input_samples here
+ * batch_offset  - the Snitch softmax template passes
+ *                 seq_len * input_samples here
+ * batch_size    - the Snitch softmax template passes
+ *                 size / lastDimLength here
+ * seq_len       - presumably the number of rows per batch (TODO confirm)
+ * input_samples - presumably the length of the axis the softmax is taken
+ *                 over (TODO confirm)
+ *
+ * NOTE(review): the Snitch softmax template emits a call to `Softmax_fp32`
+ * (capital S), while this header declares `softmax_fp32` -- confirm which
+ * spelling the kernel definition uses and align the other side.
+ */
+void softmax_fp32(float *input, float *output, int32_t ldI,
+                  int32_t batch_offset, int32_t batch_size, int32_t seq_len,
+                  int32_t input_samples);
+
+#endif //__DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_
-Original file line number
+Diff line change
@@ Expand Up / @@ -141,4 +141,4 @@ void main(void) { @@
     printf("Runtime: %u cycles\r\n", getCycles());
     printf("Errors: %u out of %u \r\n", tot_err, tot_tested);
-    }
+    }