Skip to content

Commit

Permalink
[microNPU][ETHOSU] Add Vela's logic to select configuration block (ap…
Browse files Browse the repository at this point in the history
…ache#15186)

For the case when cascader is enabled, the logic of choosing the optimal configuration block from TVM will be used in other cases, the Vela's logic will be used except the cases when dev_force_block_config option is specified.
  • Loading branch information
Aleksei-grovety authored Jul 4, 2023
1 parent a60b815 commit 5dc25af
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 9 deletions.
86 changes: 84 additions & 2 deletions python/tvm/relay/backend/contrib/ethosu/vela_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@

import numpy as np # type: ignore
from ethosu.vela import api as vapi # type: ignore
from ethosu.vela.architecture_allocator import find_block_config
from ethosu.vela.architecture_features import Accelerator, create_default_arch
from ethosu.vela.operation import NpuBlockType
from ethosu.vela.register_command_stream_generator import resampling_mode_map
from ethosu.vela.register_command_stream_util import to_kernel
from ethosu.vela.shape4d import Shape4D

import tvm
from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs
Expand Down Expand Up @@ -56,6 +61,9 @@ def get_optimal_block_config(
Therefore, we need to pick an optimal block configuration considering bandwidth
to bring IFM blocks and the number of OFM block computes need to happen
to cover the OFM as indicated by the npu op.
For the case when cascader is enabled, the logic of choosing the optimal configuration block
from TVM will be used in other cases, the Vela's logic will be used except
the cases when dev_force_block_config option is specified.
Parameters
----------
Expand All @@ -73,8 +81,82 @@ def get_optimal_block_config(
if options and options.dev_force_block_config:
block_config = [int(v) for v in options.dev_force_block_config.split("x")]
return vapi.NpuShape3D(height=block_config[0], width=block_config[1], depth=block_config[2])
all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
return _get_optimal_block_config(all_valid_block_configs)
elif options and options.enable_cascader:
all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
return _get_optimal_block_config(all_valid_block_configs)
else:
return _find_block_config_with_vela(npu_op, accel_config)


def _find_block_config_with_vela(
npu_op: vapi.NpuOperation, accelerator: vapi.NpuAccelerator
) -> vapi.NpuShape3D:
"""An internal function to get block config using Vela's logic.
Parameters
----------
npu_op : ethosu.vela.api.NpuOperation
The NPU operation
accelerator : ethosu.vela.api.NpuAccelerator
The NPU accelerator
Returns
-------
ethosu.vela.api.NpuShape3D :
The optimal block config for the operator
"""
if isinstance(npu_op, vapi.NpuConv2DOperation):
block_type = NpuBlockType.ConvolutionMxN
elif isinstance(npu_op, vapi.NpuConvDepthWiseOperation):
block_type = NpuBlockType.ConvolutionDepthWise
elif isinstance(npu_op, vapi.NpuPoolingOperation):
block_type = (
NpuBlockType.ReduceSum
if npu_op.sub_op_type == vapi.NpuPoolingOp.REDUCE_SUM
else NpuBlockType.Pooling
)
elif isinstance(npu_op, vapi.NpuElementWiseOperation):
block_type = NpuBlockType.ElementWise
else:
assert 0, "Unsupported operation"

ifm_shape = Shape4D(1, npu_op.ifm.shape.height, npu_op.ifm.shape.width, npu_op.ifm.shape.depth)
ifm2_shape = None
if npu_op.ifm2:
ifm2_shape = Shape4D(
1, npu_op.ifm2.shape.height, npu_op.ifm2.shape.width, npu_op.ifm2.shape.depth
)
ofm_shape = Shape4D(1, npu_op.ofm.shape.height, npu_op.ofm.shape.width, npu_op.ofm.shape.depth)

ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
ifm_bits = npu_op.ifm.data_type.size_in_bits()
lut_banks = 0
if npu_op.activation:
lut_banks = 2 if npu_op.activation.op_type == vapi.NpuActivationOp.TABLE_LOOKUP else 0

has_scaling = True
for tensor in [npu_op.ifm, npu_op.ifm2, npu_op.ofm]:
if tensor and tensor.quantization is None:
has_scaling = False
break

arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))

cfg = find_block_config(
arch,
block_type,
ofm_shape,
ifm_shape,
ifm2_shape,
npu_op.ifm2_scalar is not None,
ifm_bits,
to_kernel(npu_op.kernel),
lut_banks,
has_scaling,
ifm_resampling_mode,
)
assert cfg is not None, f"There is no configuration suitable for {accelerator}"
return vapi.NpuShape3D(cfg.ofm_block.height, cfg.ofm_block.width, cfg.ofm_block.depth)


def _get_optimal_block_config(all_valid_block_configs: List[vapi.NpuShape3D]) -> vapi.NpuShape3D:
Expand Down
4 changes: 2 additions & 2 deletions tests/python/contrib/test_ethosu/test_networks.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
("ethos-u65-256", MOBILENET_V1_URL, 2338848),
("ethos-u65-256", MOBILENET_V1_URL, 2338864),
("ethos-u65-256", MOBILENET_V2_URL, 2264320),
("ethos-u55-256", MOBILENET_V1_URL, 1793376),
("ethos-u55-256", MOBILENET_V1_URL, 1793392),
("ethos-u55-256", MOBILENET_V2_URL, 2217152),
("ethos-u55-128", MOBILENET_V2_URL, 2217152),
("ethos-u55-64", MOBILENET_V2_URL, 2217152),
Expand Down
14 changes: 9 additions & 5 deletions tests/python/contrib/test_ethosu/test_replace_conv2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,11 +633,15 @@ def _get_func(

reference_mod = trial[0]
params = trial[1:]
func = _get_func(*params[:-1])
mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
script = mod.script()
mod = tvm.script.from_source(script)
tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
config = {
"enable_cascader": True,
}
with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethos-u.options": config}):
func = _get_func(*params[:-1])
mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
script = mod.script()
mod = tvm.script.from_source(script)
tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)


# fmt: off
Expand Down
50 changes: 50 additions & 0 deletions tests/python/contrib/test_ethosu/test_vela_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,28 @@ def main(
__tvm_meta__ = None


# fmt: off
@tvm.script.ir_module
class Module3:
@T.prim_func
def main(ethos_u_0_i0: T.Buffer((1, 299, 299, 2), "int8"), ethosu_write: T.Buffer((1, 299, 299, 3), "int8")):
T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
p2_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
ax0_ax1_fused_ax2_fused_ax3_fused = T.int32()
p2_global_1 = T.Buffer((128,), "uint8", data=p2_global)
with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1056):
p1_encoded = T.Buffer((128,), "uint8")
T.call_extern("handle", "ethosu_copy", p1_encoded[0], 128, p2_global_1[0])
nn = T.int32()
T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", T.int64(179570))
ethos_u_0_i0_1 = T.Buffer((178802,), "int8", data=ethos_u_0_i0.data)
ethosu_write_1 = T.Buffer((268203,), "int8", data=ethosu_write.data)
T.call_extern("handle", "ethosu_conv2d", "int8", 299, 299, 2, 299, 0, 299, ethos_u_0_i0_1[0], 0, 0, 0, T.float32(0.0039215683937072754), -128, "NHWC", 598, 2, 1, "int8", 299, 299, 3, 299, 0, 299, ethosu_write_1[0], 0, 0, 0, T.float32(0.025585981085896492), -128, "NHWC", 897, 3, 1, 2, 3, 1, 1, 1, 2, p2_global_1[0], 96, T.int8(-1), T.int8(-1), 0, p2_global_1[96], 32, T.int8(-1), T.int8(-1), 2, 0, 2, 1, "NONE", 0, 0, "TFL", "NONE", 32, 12, 8)

__tvm_meta__ = None
# fmt: on


def test_get_optimal_block_config():
block_configs_cases = [
{
Expand Down Expand Up @@ -559,5 +581,33 @@ def verify(test_vec, mock_enc_w):
verify(_test_vec, _mock_enc_w)


def test_find_block_config_with_vela():
block_configs_cases = [
{
"accel_type": vapi.NpuAccelerator.Ethos_U55_256,
"ref": vapi.NpuShape3D(30, 12, 8),
},
{
"accel_type": vapi.NpuAccelerator.Ethos_U55_128,
"ref": vapi.NpuShape3D(17, 10, 8),
},
{
"accel_type": vapi.NpuAccelerator.Ethos_U55_64,
"ref": vapi.NpuShape3D(25, 5, 8),
},
{
"accel_type": vapi.NpuAccelerator.Ethos_U55_32,
"ref": vapi.NpuShape3D(25, 5, 4),
},
]

mod = Module3
ethosu_conv2d_call = mod["main"].body.body.seq[1].body.value
npu_op, _ = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)

for case in block_configs_cases:
assert vela_api._find_block_config_with_vela(npu_op, case["accel_type"]) == case["ref"]


if __name__ == "__main__":
tvm.testing.main()

0 comments on commit 5dc25af

Please sign in to comment.