Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ jobs:
testRQGEMM
TestRQAdd
testRQGEMMTransB
testFloatSoftmax
num-cores: 9
simulators: |
banshee
Expand All @@ -141,9 +142,24 @@ jobs:
"name": "testRQGEMM",
"L1": [2000, 5000]
},
{
"name": "testFloatSoftmax",
"L1": [2000, 5000, 10000]
},

{
"name": "TestRQAdd",
"L1": [5000, 10000]
},

{
"name": "testFloatGEMM",
"L1": [2000, 5000, 10000]
},

{
"name": "testFloatGEMMtransB",
"L1": [2000, 5000, 10000]
}
]
simulators: |
Expand Down
15 changes: 14 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,17 @@ Change main.c to use OUTPUTTYPE instead of float

### Fixed
- MaxPool Padding Extract Pass for float and integer
- Testinput, testoutput, weight type casted from double to float warning
- Testinput, testoutput, weight type casted from double to float warning

## Add Float GEMM and Softmax for Snitch platform

### Added
- New templates for GEMM and Softmax.
- Added GEMM and Softmax to TargetLibraries, including case for GEMM with a transposed B matrix.
- Added new CI tests for GEMM and Softmax.

### Changed
- Adapted snitch Bindings and Platform files.

### Fixed
- Relaxed the error threshold between expected and actual values in deeploytest.
3 changes: 2 additions & 1 deletion CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ All contributors have agreed to an open-source release of their work in the Deep
* Luka Macan
* Alberto Dequino
* Francesco Conti
* Run Wang
* Run Wang
* Taha El Bayad
1 change: 0 additions & 1 deletion Deeploy/Targets/Generic/Parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,6 @@ def __init__(self):
def parseNode(self, node: gs.Node) -> bool:
    """Accept a node only if it has exactly one input and exactly one output.

    Args:
        node: Graph node to inspect; only ``node.inputs`` and
            ``node.outputs`` are read.

    Returns:
        ``True`` when both the input list and the output list have length 1,
        ``False`` otherwise.
    """
    hasSingleInput = len(node.inputs) == 1
    hasSingleOutput = len(node.outputs) == 1

    return hasSingleInput and hasSingleOutput

def parseNodeCtxt(self,
Expand Down
14 changes: 12 additions & 2 deletions Deeploy/Targets/Snitch/Bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,15 @@
from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration
from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
MemoryManagementGeneration
from Deeploy.CommonExtensions.DataTypes import int8_t, int32_t, uint8_t
from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.Generic.Templates import iNoNormTemplate
from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker
from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \
SnitchProfileExecutionBlockPass, SnitchSynchCoresPass
from Deeploy.Targets.Snitch.Templates import AddTemplate, RQAddTemplate, iSoftmaxTemplate
from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate
from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template
from Deeploy.Targets.Snitch.Templates.GemmTemplate import SnitchGemm_Template
from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template
from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement
Expand Down Expand Up @@ -69,7 +70,11 @@
# Softmax bindings for the Snitch target. Despite the "iSoftmax" in the name,
# this list holds both the integer bindings (int8/uint8 input -> uint8 output,
# iSoftmaxTemplate) and the float32 binding (float32 -> float32,
# FloatSoftmax_Template). All of them use the TiledTransformer.
SnitchiSoftmaxBindings = [
    NodeBinding(
        SoftmaxChecker([PointerClass(inputType)], [PointerClass(uint8_t)]),
        iSoftmaxTemplate.referenceTemplate,
        TiledTransformer,
    ) for inputType in (int8_t, uint8_t)
]
# The float binding goes last so the integer bindings keep their positions.
SnitchiSoftmaxBindings.append(
    NodeBinding(
        SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
        FloatSoftmax_Template,
        TiledTransformer,
    ))

SnitchiNoNormBindings = [
NodeBinding(
iNoNormChecker([PointerClass(_type), PointerClass(int8_t),
Expand All @@ -88,6 +93,11 @@
NodeBinding(
GEMMChecker([PointerClass(int8_t), PointerClass(int8_t),
PointerClass(int32_t)], [PointerClass(int32_t)]), SnitchGemm_Template, TiledTransformer)
] + [
NodeBinding(
GEMMChecker([PointerClass(float32_t), PointerClass(float32_t),
PointerClass(float32_t)], [PointerClass(float32_t)]), FloatGemmTemplate.referenceTemplate,
TiledTransformer)
]
SnitchRqGemmBindings = [
NodeBinding(
Expand Down
4 changes: 3 additions & 1 deletion Deeploy/Targets/Snitch/Platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from Deeploy.Targets.Generic.Layers import AddLayer, GatherLayer, GEMMLayer, LayerNormLayer, MatMulLayer, PadLayer, \
ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer, iNoNormLayer
from Deeploy.Targets.Generic.Parsers import AddParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \
RQAddParser, RQIntegerDivParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser
RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser
from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import AddRequantMergePass, GEMMRequantMergePass, \
IntegerDivRequantMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, \
Expand All @@ -58,6 +58,7 @@
# Each mapper pairs a parser (decides whether a graph node matches) with the
# list of bindings that may implement the node.
GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings)
RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings)
iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings)
# The plain (non-integer) Softmax parser deliberately shares the same binding
# list as iSoftmax. NOTE(review): presumably that list also carries the
# float32 Softmax binding, so the type checker picks the right kernel -- confirm
# against the tiling-ready bindings definition.
SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings)
iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings)
iLayerNormMapper = NodeMapper(iLayerNormParser(), BasicLayerNormBindings)
RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings)
Expand All @@ -72,6 +73,7 @@
'Gemm': GEMMLayer([GemmMapper]),
'RQGemm': RQGEMMLayer([RqGemmMapper]),
'iSoftmax': SoftmaxLayer([iSoftmaxMapper]),
'Softmax': SoftmaxLayer([SoftmaxMapper]),
'iNoNorm': iNoNormLayer([iNoNormMapper]),
'iLayerNorm': LayerNormLayer([iLayerNormMapper]),
'RequantizedAdd': AddLayer([RQAddMapper]),
Expand Down
11 changes: 11 additions & 0 deletions Deeploy/Targets/Snitch/Templates/FloatGemmTemplate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from Deeploy.DeeployTypes import NodeTemplate

# Template emitting the FP32 GEMM kernel call for the Snitch target.
#
# The generated C code reads snrt_cluster_compute_core_num() into
# `compute_num` and then calls one of the two kernels declared in
# TargetLibraries/Snitch/inc/kernel/Gemm_fp32.h, selected by `transB`:
#   - transB set:   gemm_fp32_transB_opt
#   - otherwise:    gemm_fp32_opt
#
# Argument mapping onto the kernel prototype
# (M, N, K, A, ldA, B, ldB, C, ldC, Y, BETA, setup_SSR):
#   M    <- ${M} / compute_num   (rows handled per call)
#   N    <- ${O}
#   K    <- ${N}
#   ldA  <- ${N} * compute_num
#   ldB  <- ${N} when transB is set, ${O} otherwise
#   ldC  <- ${O} * compute_num
#   Y    <- ${data_out}
#   BETA and setup_SSR are both hard-coded to 1.
#
# NOTE(review): `${M} / compute_num` is an integer division in the generated C
# code; if M is not a multiple of the core count the remainder rows are
# presumably not covered -- confirm that tiling guarantees divisibility.
referenceTemplate = NodeTemplate("""
uint32_t compute_num = snrt_cluster_compute_core_num();

% if transB:
gemm_fp32_transB_opt(${M} / compute_num, ${O}, ${N}, ${A}, ${N} * compute_num, ${B}, ${N}, ${C}, ${O} * compute_num, ${data_out}, 1, 1 );
% else:
gemm_fp32_opt(${M} / compute_num, ${O}, ${N}, ${A}, ${N} * compute_num, ${B}, ${O}, ${C}, ${O} * compute_num, ${data_out}, 1, 1 );
%endif
""")
57 changes: 57 additions & 0 deletions Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# ----------------------------------------------------------------------
#
# File: FloatSoftmaxTemplate.py
#
# Last edited: 30.05.2024
#
# Copyright (C) 2024, ETH Zurich and University of Bologna.
#
# Author:
# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class FloatSoftmaxTemplate(NodeTemplate):
    """Node template for the float32 Softmax kernel call on Snitch.

    ``alignToContext`` fills in the template variables that are derived from
    the input buffer (``seq_len``, ``input_samples``) and the name of the
    kernel to call (``kernelName``).
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Add the shape-derived template variables to the operator representation.

        Args:
            ctxt: Network context used to look up the input buffer.
            operatorRepresentation: Operator representation; its ``"data_in"``
                entry names the input buffer. It is updated in place.

        Returns:
            The unchanged context, the updated operator representation, and an
            empty list.
        """
        inputShape = ctxt.lookup(operatorRepresentation["data_in"]).shape

        # Indexing with 2 requires an input of rank >= 3.
        # NOTE(review): for a rank-3 input, shape[2] and shape[-1] are the same
        # dimension -- confirm this is the intended layout.
        operatorRepresentation["seq_len"] = inputShape[2]
        operatorRepresentation["input_samples"] = inputShape[-1]

        # NOTE(review): the prototype in kernel/Softmax.h is spelled
        # `softmax_fp32` (lowercase s) -- confirm which spelling the kernel
        # implementation actually defines.
        operatorRepresentation["kernelName"] = "Softmax_fp32"

        return ctxt, operatorRepresentation, []


# Template text for the float32 Softmax kernel call.
#
# Template variables:
#   - seq_len, input_samples, kernelName: set by
#     FloatSoftmaxTemplate.alignToContext.
#   - size, lastDimLength, data_in, data_out: not set in this file; presumably
#     provided by the parser / operator representation -- TODO confirm.
#
# Generated C locals:
#   - batch_size   = size / lastDimLength
#   - compute_num  = snrt_cluster_compute_core_num()
#   - ldI          = compute_num * input_samples
#   - batch_offset = seq_len * input_samples
FloatSoftmaxTemplateStr = r"""
uint32_t batch_size = ${size} / ${lastDimLength};
uint32_t compute_num = snrt_cluster_compute_core_num();
int32_t ldI = compute_num * ${input_samples};
int32_t batch_offset = ${seq_len} * ${input_samples};

${kernelName}(${data_in}, ${data_out}, ldI, batch_offset, batch_size, ${seq_len}, ${input_samples});
"""

# Ready-to-use template instance referenced by the Snitch Softmax bindings.
FloatSoftmax_Template = FloatSoftmaxTemplate(FloatSoftmaxTemplateStr)
2 changes: 1 addition & 1 deletion DeeployTest/Platforms/Siracusa/src/deeploytest.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,4 @@ void main(void) {

printf("Runtime: %u cycles\r\n", getCycles());
printf("Errors: %u out of %u \r\n", tot_err, tot_tested);
}
}
53 changes: 38 additions & 15 deletions DeeployTest/Platforms/Snitch/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -132,23 +132,46 @@ int main(void) {
#ifndef NOTEST
int32_t tot_err = 0;
uint32_t tot = 0;
int32_t diff;
int32_t expected, actual;
for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {

tot += DeeployNetwork_outputs_bytes[buf];
for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) {
expected = ((char *)testOutputVector[buf])[i];
actual = ((char *)DeeployNetwork_outputs[buf])[i];
diff = expected - actual;

if (diff) {
tot_err += 1;
if (ISOUTPUTFLOAT) {
float32_t diff;
float32_t expected, actual;
for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {

tot += DeeployNetwork_outputs_bytes[buf] / sizeof(float32_t);
for (uint32_t i = 0;
i < DeeployNetwork_outputs_bytes[buf] / sizeof(float32_t); i++) {
expected = ((float32_t *)testOutputVector[buf])[i];
actual = ((float32_t *)DeeployNetwork_outputs[buf])[i];
diff = expected - actual;

if (diff < -1.2e-5 || diff > 1.2e-5) {
tot_err += 1;
#ifndef CI
printf("Expected: %f ", expected);
printf("Actual: %f ", actual);
printf("Diff: %f at Index %12u in Output %u\r\n", diff, i, buf);
#endif
}
}
}
} else {
int32_t diff;
int32_t expected, actual;
for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {

tot += DeeployNetwork_outputs_bytes[buf];
for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) {
expected = ((char *)testOutputVector[buf])[i];
actual = ((char *)DeeployNetwork_outputs[buf])[i];
diff = expected - actual;
if (diff) {
tot_err += 1;
#ifndef CI
printf("Expected: %4d ", expected);
printf("Actual: %4d ", actual);
printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
printf("Expected: %4d ", expected);
printf("Actual: %4d ", actual);
printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
#endif
}
}
}
}
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
51 changes: 51 additions & 0 deletions TargetLibraries/Snitch/inc/kernel/Gemm_fp32.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
 * The include guard is specific to this fp32 header. A generic "GEMM" guard
 * would collide with any other GEMM kernel header that defines the same macro,
 * and the prototypes below would then be silently skipped.
 */
#ifndef __DEEPLOY_MATH_GEMM_FP32_KERNEL_HEADER_
#define __DEEPLOY_MATH_GEMM_FP32_KERNEL_HEADER_

#include "DeeploySnitchMath.h"

/*
 * TILING ONLY due to ssr loop
 *
 * FP32 GEMM with the following format:
 * A is an M x K matrix, B is a K x N matrix, and C is a M x N matrix
 *
 * A' = transpose(A) if transA else A
 * B' = transpose(B) if transB else B
 *
 * Y = A' * B' + C
 *
 * Parameters shared by both kernels:
 *   M, N, K        matrix dimensions as described above
 *   A, B, C        input matrices
 *   Y              output matrix
 *   ldA, ldB, ldC  presumably the leading dimensions (row strides, in
 *                  elements) of A, B and C -- TODO confirm against the
 *                  kernel implementation
 *   BETA           presumably selects whether C is accumulated into Y --
 *                  TODO confirm
 *   setup_SSR      presumably selects whether the kernel configures the SSR
 *                  streamers itself -- TODO confirm
 */

/*
 * Variant for a transposed B matrix:
 *
 * transposed A    = no
 * transposed B    = yes
 * multi-core      = yes
 * unrolling       = yes
 * simd            = yes
 * parallelization = row-wise
 */
void gemm_fp32_transB_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A,
                          uint32_t ldA, float32_t *B, uint32_t ldB,
                          float32_t *C, uint32_t ldC, float32_t *Y,
                          uint32_t BETA, uint32_t setup_SSR);

/*
 * Variant for a non-transposed B matrix:
 *
 * transposed A    = no
 * transposed B    = no
 * multi-core      = yes
 * unrolling       = yes
 * simd            = yes
 * parallelization = row-wise
 */
void gemm_fp32_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A,
                   uint32_t ldA, float32_t *B, uint32_t ldB, float32_t *C,
                   uint32_t ldC, float32_t *Y, uint32_t BETA,
                   uint32_t setup_SSR);

#endif //__DEEPLOY_MATH_GEMM_FP32_KERNEL_HEADER_
5 changes: 5 additions & 0 deletions TargetLibraries/Snitch/inc/kernel/Softmax.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/*
 * Include guard added: without it, including this header more than once in a
 * translation unit re-processes the whole file.
 */
#ifndef __DEEPLOY_MATH_SOFTMAX_FP32_KERNEL_HEADER_
#define __DEEPLOY_MATH_SOFTMAX_FP32_KERNEL_HEADER_

#include "DeeploySnitchMath.h"

/*
 * FP32 softmax kernel.
 *
 *   input          input buffer
 *   output         output buffer
 *   ldI            the code generator passes compute_core_num * input_samples
 *   batch_offset   the code generator passes seq_len * input_samples
 *   batch_size     the code generator passes size / lastDimLength
 *   seq_len        third dimension of the input tensor (shape[2])
 *   input_samples  last dimension of the input tensor (shape[-1])
 *
 * NOTE(review): the Snitch FloatSoftmaxTemplate emits a call to
 * `Softmax_fp32` (capital S), whereas this prototype is spelled
 * `softmax_fp32` -- confirm which spelling the implementation defines.
 */
void softmax_fp32(float *input, float *output, int32_t ldI,
                  int32_t batch_offset, int32_t batch_size, int32_t seq_len,
                  int32_t input_samples);

#endif //__DEEPLOY_MATH_SOFTMAX_FP32_KERNEL_HEADER_
Loading
Loading