Skip to content

Commit b70c662

Browse files
committed
[Relay][Strategy] Use x86 dense schedules for arm_cpu
Currently the fallback used when compiling a dense operation with targets such as `llvm -device=arm_cpu` is `dense.generic`. This results in very poor performance. Although apache#13775 meant that x86 schedules are used in cases where no strategy is provided by arm_cpu, the dense strategy is registered due to the existence of specialized schedules for arm_cpu e.g. a schedule for embedded devices. This commit ensures x86 schedules are used in place of a generic schedule, which yields much better performance. The commit also follows the same approach for the `dense.generic` schedule as the x86 strategy. This will only be used when the auto-scheduler is enabled. A test has been added to check that the intended schedules are picked when compiling with `arm_cpu`. Change-Id: I8697f630d4acfab71a9626cf9e0dc3086987f163
1 parent d1f7ef4 commit b70c662

File tree

2 files changed

+79
-21
lines changed

2 files changed

+79
-21
lines changed

python/tvm/relay/op/strategy/arm_cpu.py

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -557,33 +557,53 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
557557
wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
558558
name="dense_dsp.arm_cpu",
559559
)
560-
else:
561-
# For dynamic matrix-vector multiply we use a hand written kernel.
562-
if (
563-
isinstance(inputs[0].shape[0], (int, tir.IntImm))
564-
and inputs[0].shape[0] == 1
565-
and (
566-
topi.utils.is_dynamic_shape(inputs[0].shape)
567-
or topi.utils.is_dynamic_shape(inputs[1].shape)
568-
)
569-
):
570-
strategy.add_implementation(
571-
wrap_compute_dense(topi.x86.dense_dynamic),
572-
wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
573-
name="dense_dynamic.x86",
574-
plevel=20,
575-
)
576-
return strategy
577-
logger.warning("dense is not optimized for arm cpu.")
560+
return strategy
561+
562+
# For dynamic matrix-vector multiply we use a hand written kernel.
563+
if (
564+
isinstance(inputs[0].shape[0], (int, tir.IntImm))
565+
and inputs[0].shape[0] == 1
566+
and (
567+
topi.utils.is_dynamic_shape(inputs[0].shape)
568+
or topi.utils.is_dynamic_shape(inputs[1].shape)
569+
)
570+
):
571+
strategy.add_implementation(
572+
wrap_compute_dense(topi.x86.dense_dynamic),
573+
wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
574+
name="dense_dynamic.x86",
575+
plevel=20,
576+
)
577+
return strategy
578+
579+
need_auto_scheduler_layout = is_auto_scheduler_enabled()
580+
need_meta_schedule_layout = is_meta_schedule_enabled()
581+
if need_auto_scheduler_layout or need_meta_schedule_layout:
578582
strategy.add_implementation(
579583
wrap_compute_dense(
580584
topi.nn.dense,
581-
need_auto_scheduler_layout=is_auto_scheduler_enabled(),
582-
need_meta_schedule_layout=is_meta_schedule_enabled(),
585+
need_auto_scheduler_layout=need_auto_scheduler_layout,
586+
need_meta_schedule_layout=need_meta_schedule_layout,
583587
),
584-
wrap_topi_schedule(topi.generic.schedule_dense),
588+
naive_schedule,
585589
name="dense.generic",
590+
plevel=11,
586591
)
592+
593+
# Fallback to x86 schedules as there is currently no arm_cpu schedule for dense
594+
strategy.add_implementation(
595+
wrap_compute_dense(topi.x86.dense_nopack),
596+
wrap_topi_schedule(topi.x86.schedule_dense_nopack),
597+
name="dense_nopack.x86",
598+
plevel=5,
599+
)
600+
strategy.add_implementation(
601+
wrap_compute_dense(topi.x86.dense_pack),
602+
wrap_topi_schedule(topi.x86.schedule_dense_pack),
603+
name="dense_pack.x86",
604+
plevel=10,
605+
)
606+
587607
return strategy
588608

589609

tests/python/relay/strategy/test_select_implementation.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
# under the License.
1717

1818
""" Tests strategy selection for Relay ops """
19+
1920
import pytest
21+
import numpy as np
22+
2023
import tvm
2124
from tvm import relay
2225
from tvm import te
@@ -52,5 +55,40 @@ def test_concatenate(target, expected_implementation):
5255
assert impl.name == expected_implementation
5356

5457

58+
@pytest.mark.parametrize(
59+
"target,expected_valid_impl,expected_impl",
60+
[("llvm -device=arm_cpu", ["dense_pack.x86", "dense_nopack.x86"], "dense_pack.x86")],
61+
)
62+
def test_dense(target, expected_valid_impl, expected_impl):
63+
target = tvm.target.Target(target)
64+
65+
data_shape = (30, 40)
66+
weight_shape = (30, 40)
67+
dtype = "float32"
68+
69+
out = relay.nn.dense(
70+
relay.var("data", shape=data_shape, dtype=dtype),
71+
relay.var("weight", shape=weight_shape, dtype=dtype),
72+
out_dtype=dtype,
73+
)
74+
out = run_infer_type(out)
75+
76+
with target:
77+
args = [
78+
out.op,
79+
out.attrs,
80+
[te.placeholder(data_shape, dtype), te.placeholder(weight_shape, dtype)],
81+
out.checked_type,
82+
target,
83+
]
84+
valid_impl = relay.backend.te_compiler.get_valid_implementations(*args)
85+
selected_impl, _ = relay.backend.te_compiler.select_implementation(*args, use_autotvm=False)
86+
87+
assert len(valid_impl) == len(expected_valid_impl)
88+
for impl in valid_impl:
89+
assert impl.name in expected_valid_impl
90+
assert selected_impl.name == expected_impl
91+
92+
5593
if __name__ == "__main__":
5694
tvm.testing.main()

0 commit comments

Comments
 (0)