Skip to content

Commit ae45b04

Browse files
authored
[Relay][Strategy] Use x86 dense schedules for arm_cpu (#15470)
Currently the fallback used when compiling a dense operation with targets such as `llvm -device=arm_cpu` is `dense.generic`. This results in very poor performance. Although #13775 meant that x86 schedules are used in cases where no strategy is provided by arm_cpu, the dense strategy is registered due to the existence of specialized schedules for arm_cpu e.g. a schedule for embedded devices. This commit ensures x86 schedules are used in place of a generic schedule, which yields much better performance. The commit also follows the same approach for the `dense.generic` schedule as the x86 strategy. This will only be used when the autoscheduler is enabled. A test has been added to check that the intended schedules are picked when compiling with `arm_cpu`. Change-Id: I8697f630d4acfab71a9626cf9e0dc3086987f163
1 parent 40bac57 commit ae45b04

File tree

2 files changed

+79
-21
lines changed

2 files changed

+79
-21
lines changed

python/tvm/relay/op/strategy/arm_cpu.py

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -559,33 +559,53 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
559559
wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
560560
name="dense_dsp.arm_cpu",
561561
)
562-
else:
563-
# For dynamic matrix-vector multiply we use a hand written kernel.
564-
if (
565-
isinstance(inputs[0].shape[0], (int, tir.IntImm))
566-
and inputs[0].shape[0] == 1
567-
and (
568-
topi.utils.is_dynamic_shape(inputs[0].shape)
569-
or topi.utils.is_dynamic_shape(inputs[1].shape)
570-
)
571-
):
572-
strategy.add_implementation(
573-
wrap_compute_dense(topi.x86.dense_dynamic),
574-
wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
575-
name="dense_dynamic.x86",
576-
plevel=20,
577-
)
578-
return strategy
579-
logger.warning("dense is not optimized for arm cpu.")
562+
return strategy
563+
564+
# For dynamic matrix-vector multiply we use a hand written kernel.
565+
if (
566+
isinstance(inputs[0].shape[0], (int, tir.IntImm))
567+
and inputs[0].shape[0] == 1
568+
and (
569+
topi.utils.is_dynamic_shape(inputs[0].shape)
570+
or topi.utils.is_dynamic_shape(inputs[1].shape)
571+
)
572+
):
573+
strategy.add_implementation(
574+
wrap_compute_dense(topi.x86.dense_dynamic),
575+
wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
576+
name="dense_dynamic.x86",
577+
plevel=20,
578+
)
579+
return strategy
580+
581+
need_auto_scheduler_layout = is_auto_scheduler_enabled()
582+
need_meta_schedule_layout = is_meta_schedule_enabled()
583+
if need_auto_scheduler_layout or need_meta_schedule_layout:
580584
strategy.add_implementation(
581585
wrap_compute_dense(
582586
topi.nn.dense,
583-
need_auto_scheduler_layout=is_auto_scheduler_enabled(),
584-
need_meta_schedule_layout=is_meta_schedule_enabled(),
587+
need_auto_scheduler_layout=need_auto_scheduler_layout,
588+
need_meta_schedule_layout=need_meta_schedule_layout,
585589
),
586-
wrap_topi_schedule(topi.generic.schedule_dense),
590+
naive_schedule,
587591
name="dense.generic",
592+
plevel=11,
588593
)
594+
595+
# Fallback to x86 schedules as there is currently no arm_cpu schedule for dense
596+
strategy.add_implementation(
597+
wrap_compute_dense(topi.x86.dense_nopack),
598+
wrap_topi_schedule(topi.x86.schedule_dense_nopack),
599+
name="dense_nopack.x86",
600+
plevel=5,
601+
)
602+
strategy.add_implementation(
603+
wrap_compute_dense(topi.x86.dense_pack),
604+
wrap_topi_schedule(topi.x86.schedule_dense_pack),
605+
name="dense_pack.x86",
606+
plevel=10,
607+
)
608+
589609
return strategy
590610

591611

tests/python/relay/strategy/test_select_implementation.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
# under the License.
1717

1818
""" Tests strategy selection for Relay ops """
19+
1920
import pytest
21+
import numpy as np
22+
2023
import tvm
2124
from tvm import relay
2225
from tvm import te
@@ -149,5 +152,40 @@ def test_int8_depthwise_conv2d(target, expected_impl):
149152
assert impl.name == expected_impl
150153

151154

155+
@pytest.mark.parametrize(
156+
"target,expected_valid_impl,expected_impl",
157+
[("llvm -device=arm_cpu", ["dense_pack.x86", "dense_nopack.x86"], "dense_pack.x86")],
158+
)
159+
def test_dense(target, expected_valid_impl, expected_impl):
160+
target = tvm.target.Target(target)
161+
162+
data_shape = (30, 40)
163+
weight_shape = (30, 40)
164+
dtype = "float32"
165+
166+
out = relay.nn.dense(
167+
relay.var("data", shape=data_shape, dtype=dtype),
168+
relay.var("weight", shape=weight_shape, dtype=dtype),
169+
out_dtype=dtype,
170+
)
171+
out = run_infer_type(out)
172+
173+
with target:
174+
args = [
175+
out.op,
176+
out.attrs,
177+
[te.placeholder(data_shape, dtype), te.placeholder(weight_shape, dtype)],
178+
out.checked_type,
179+
target,
180+
]
181+
valid_impl = relay.backend.te_compiler.get_valid_implementations(*args)
182+
selected_impl, _ = relay.backend.te_compiler.select_implementation(*args, use_autotvm=False)
183+
184+
assert len(valid_impl) == len(expected_valid_impl)
185+
for impl in valid_impl:
186+
assert impl.name in expected_valid_impl
187+
assert selected_impl.name == expected_impl
188+
189+
152190
if __name__ == "__main__":
153191
tvm.testing.main()

0 commit comments

Comments
 (0)