Skip to content

Commit

Permalink
[microNPU][ETHOSU] Fix CopyComputeReordering pass arguments (apache#1…
Browse files Browse the repository at this point in the history
…5063)

No arguments were passed to CopyComputeReordering pass and the same parameters were used for all targets. This fix takes arguments for CopyComputeReordering pass from Vela.
In networks tests, the amount of memory used for U65 has increased because now after CopyComputeReordering pass, the number of DMA commands executed in parallel has increased.
  • Loading branch information
Aleksei-grovety authored Jun 19, 2023
1 parent fa223b0 commit e280e01
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 59 deletions.
7 changes: 4 additions & 3 deletions python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
"""The integration of the Arm(R) Ethos(TM)-U NPU TIR compiler."""
import tvm
from tvm import relay
from tvm.relay.expr_functor import ExprMutator
from tvm.driver.build_module import schedule_to_module
from tvm.relay.backend.contrib.ethosu import vela_api as vapi
from tvm.relay.expr_functor import ExprMutator

from .. import util
from . import passes as ethosu_passes
from .scheduler import schedule
from .. import util


def lower_ethosu(sch, args, const_dict, name="main"):
Expand Down Expand Up @@ -92,7 +93,7 @@ def lower_ethosu(sch, args, const_dict, name="main"):
mod = ethosu_passes.HoistAllocates()(mod)
mod = tvm.tir.transform.RemoveNoOp()(mod)
mod, const_dict = ethosu_passes.MergeConstants(const_dict)(mod)
mod = ethosu_passes.CopyComputeReordering()(mod)
mod = ethosu_passes.CopyComputeReordering(vapi.get_max_copy_movements())(mod)

disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False)
if not disable_storage_rewrite:
Expand Down
15 changes: 13 additions & 2 deletions python/tvm/relay/backend/contrib/ethosu/vela_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@
"""
import logging
import math
from typing import Tuple, Optional, List
from typing import List, Optional, Tuple

import numpy as np # type: ignore
from ethosu.vela import api as vapi # type: ignore
from ethosu.vela.architecture_features import Accelerator, create_default_arch

import tvm
from tvm.relay.backend.contrib.ethosu import util # type: ignore
from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs
from tvm.relay.backend.contrib.ethosu import util # type: ignore

# pylint: disable=invalid-name
logger = logging.getLogger("Ethos-U")
Expand Down Expand Up @@ -400,3 +402,12 @@ def get_accelerator_config() -> vapi.NpuAccelerator:
accel_config_str = compiler_attrs.accelerator_config
assert accel_config_str in npu_accel_str_map.keys(), f"{accel_config_str} is not supported"
return npu_accel_str_map[accel_config_str]


def get_max_copy_movements() -> int:
"""Get maximum copy movements for CopyComputeReordering pass.
max_outstanding_dma from architecture features indicates how many
DMA operations can be in-progress.
"""
arch = create_default_arch(Accelerator.from_npu_accelerator(get_accelerator_config()))
return arch.max_outstanding_dma
102 changes: 51 additions & 51 deletions tests/python/contrib/test_ethosu/test_encode_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,30 +66,29 @@ def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_writ
@tvm.script.ir_module
class WeightStreamOnlyU65:
@T.prim_func
def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_write: T.Buffer((1, 16, 16, 8), "int8")) -> None:
# function attr dict
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
# buffer definition
placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data)
ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data)
buffer_encoded_1 = T.Buffer([192], dtype="uint8")
buffer_encoded_2_1 = T.Buffer([192], dtype="uint8")
buffer_encoded_4_1 = T.Buffer([208], dtype="uint8")
buffer_encoded_6_1 = T.Buffer([192], dtype="uint8")
# body
p1_data = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin":True})
p1 = T.Buffer([208], "uint8", data=p1_data)
p2_data = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin":True})
p2 = T.Buffer([192], "uint8", data=p2_data)
p3 = T.Buffer([192], dtype="uint8", data=p1.data)
T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 192, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 192, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 80, p3[80], 80, 12, p3[160], 16, p3[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 208, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, p2[80], 80, 12, p2[160], 16, p2[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 192, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 96, p1[96], 80, 12, p1[176], 16, p1[192], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, p2[80], 80, 12, p2[160], 16, p2[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
def main(ifm: T.Buffer((1, 16, 16, 32), "int8"), ethosu_write: T.Buffer((1, 16, 16, 8), "int8")):
T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
p2_global_6 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
p2_global_4 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
p2_global_5 = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
buffer_encoded = T.Buffer((192,), "uint8")
p2_global_3 = T.Buffer((192,), "uint8", data=p2_global_6)
T.call_extern("handle", "ethosu_copy", buffer_encoded[0], 192, p2_global_3[0])
buffer_encoded_1 = T.Buffer((192,), "uint8")
p2_global_4_1 = T.Buffer((192,), "uint8", data=p2_global_4)
T.call_extern("handle", "ethosu_copy", buffer_encoded_1[0], 192, p2_global_4_1[0])
buffer_encoded_2 = T.Buffer((208,), "uint8")
p2_global_5_1 = T.Buffer((208,), "uint8", data=p2_global_5)
T.call_extern("handle", "ethosu_copy", buffer_encoded_2[0], 208, p2_global_5_1[0])
ifm_1 = T.Buffer((8192,), "int8", data=ifm.data)
ethosu_write_1 = T.Buffer((2048,), "int8", data=ethosu_write.data)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_3[0], 80, p2_global_3[80], 80, 12, p2_global_3[160], 16, p2_global_3[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
buffer_encoded_3 = T.Buffer((192,), "uint8")
p2_global_6_1 = T.Buffer((192,), "uint8", data=p2_global_6)
T.call_extern("handle", "ethosu_copy", buffer_encoded_3[0], 192, p2_global_6_1[0])
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_4_1[0], 80, p2_global_4_1[80], 80, 12, p2_global_4_1[160], 16, p2_global_4_1[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_5_1[0], 96, p2_global_5_1[96], 80, 12, p2_global_5_1[176], 16, p2_global_5_1[192], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_6_1[0], 80, p2_global_6_1[80], 80, 12, p2_global_6_1[160], 16, p2_global_6_1[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
__tvm_meta__ = None
# fmt: on

Expand Down Expand Up @@ -387,33 +386,34 @@ def main(input_ifm: T.Buffer((1,16,16,32), "int8"), input_ethosu_write: T.Buffer
@tvm.script.ir_module
class MixedReadU65:
@T.prim_func
def main(input_ifm: T.Buffer((1,16,16,32), "int8"), input_ethosu_write: T.Buffer((1,16,16,8), "int8")) -> None:
# function attr dict
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
# buffer definition
ifm = T.Buffer([8192], dtype="int8", data=input_ifm.data)
ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data)
buffer1 = T.Buffer([128], dtype="uint8")
buffer2 = T.Buffer([128], dtype="uint8")
buffer3 = T.Buffer([128], dtype="uint8")
buffer4 = T.Buffer([608], dtype="uint8")
buffer5 = T.Buffer([160], dtype="uint8")
buffer6 = T.Buffer([128], dtype="uint8")
p1_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
p1 = T.Buffer([128], "uint8", data=p1_data)
p2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
p2 = T.Buffer([4096], "int8", data=p2_data)
p3_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
p3 = T.Buffer([128], "uint8", data=p3_data)
T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer4[0], 304, buffer4[304], 304, 12, buffer5[0], 80, buffer5[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 128, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 128, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
def main(ifm: T.Buffer((1, 16, 16, 32), "int8"), ethosu_write: T.Buffer((1, 16, 16, 8), "int8")):
T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
p5_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
p5_global_1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin": T.bool(True)})
p5_global_2 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
buffer_encoded = T.Buffer((128,), "uint8")
p5_global_3 = T.Buffer((128,), "uint8", data=p5_global)
T.call_extern("handle", "ethosu_copy", buffer_encoded[0], 128, p5_global_3[0])
buffer_encoded_1 = T.Buffer((128,), "uint8")
p5_global_4 = T.Buffer((128,), "uint8", data=p5_global_1)
T.call_extern("handle", "ethosu_copy", buffer_encoded_1[0], 128, p5_global_4[0])
ifm_1 = T.Buffer((8192,), "int8", data=ifm.data)
ethosu_write_2 = T.Buffer((4096,), "int8", data=ethosu_write_1)
p1_encoded = T.Buffer((608,), "uint8")
p2_encoded = T.Buffer((160,), "uint8")
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p1_encoded[0], 304, p1_encoded[304], 304, 12, p2_encoded[0], 80, p2_encoded[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
buffer_encoded_2 = T.Buffer((128,), "uint8")
p5_global_5 = T.Buffer((128,), "uint8", data=p5_global_2)
T.call_extern("handle", "ethosu_copy", buffer_encoded_2[0], 128, p5_global_5[0])
ethosu_write_3 = T.Buffer((2048,), "int8", data=ethosu_write.data)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_3[0], 48, p5_global_3[48], 48, 12, p5_global_3[96], 16, p5_global_3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
buffer_encoded_3 = T.Buffer((128,), "uint8")
p5_global_6 = T.Buffer((128,), "uint8", data=p5_global)
T.call_extern("handle", "ethosu_copy", buffer_encoded_3[0], 128, p5_global_6[0])
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_4[0], 48, p5_global_4[48], 48, 12, p5_global_4[96], 16, p5_global_4[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_5[0], 48, p5_global_5[48], 48, 12, p5_global_5[96], 16, p5_global_5[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_6[0], 48, p5_global_6[48], 48, 12, p5_global_6[96], 16, p5_global_6[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
__tvm_meta__ = None
# fmt: on

Expand Down
6 changes: 3 additions & 3 deletions tests/python/contrib/test_ethosu/test_networks.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
("ethos-u65-256", MOBILENET_V1_URL, 1793376),
("ethos-u65-256", MOBILENET_V2_URL, 2217152),
("ethos-u65-256", MOBILENET_V1_URL, 2338848),
("ethos-u65-256", MOBILENET_V2_URL, 2264320),
("ethos-u55-256", MOBILENET_V1_URL, 1793376),
("ethos-u55-256", MOBILENET_V2_URL, 2217152),
("ethos-u55-128", MOBILENET_V2_URL, 2217152),
Expand All @@ -71,7 +71,7 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size):
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
("ethos-u65-256", MOBILENET_V1_URL, 1206880),
("ethos-u65-256", MOBILENET_V1_URL, 1311200),
("ethos-u55-256", MOBILENET_V2_URL, 1509408),
],
)
Expand Down

0 comments on commit e280e01

Please sign in to comment.