[Target] Automatically detect system triple when not specified by the user (apache#16513)

Currently, when a default compile target such as llvm is specified,
it implies llvm -keys=cpu, which tends to mean that x86-related
components are used during compilation, e.g. the schedules registered
in TOPI. This can be confusing for a user compiling on other
architectures, especially since other tools such as llc infer the
default target from the host.

When the target kind is llvm, this commit uses the
"target.llvm_get_system_triple" functionality to automatically detect
mtriple when one has not been provided in the target string. The
target will be updated to one that uses the mtriple of the host:
llvm -> llvm -mtriple=<system-triple>. When compiling on Arm(R)-based
targets, this has the added benefit of automatically introducing
-keys=arm_cpu to the target, improving schedule selection.
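
As a rough illustration of the intended behaviour (a sketch, assuming an
LLVM-enabled build running on an Arm host; the exact triple and key set
will vary by machine):

import tvm

# With no mtriple in the target string, the parser queries LLVM for the
# host triple and rewrites the target accordingly.
target = tvm.target.Target("llvm")
print(target.attrs["mtriple"])  # e.g. "aarch64-unknown-linux-gnu"
print(target.keys)              # now includes "arm_cpu" as well as "cpu"

# An explicit -mtriple (or -mcpu) suppresses the detection.
explicit = tvm.target.Target("llvm -mtriple=x86_64-unknown-linux-gnu")
print(explicit.attrs["mtriple"])  # "x86_64-unknown-linux-gnu"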

Many tests currently use targets such as plain llvm, which has
resulted in a lack of coverage of other targets such as arm_cpu. As
part of this commit, failing test cases with simple / obvious issues
have been fixed. Others that likely need more thought have been
skipped. This reduces the number of modifications and simplifies
review of this change.


This commit is a follow-up to the changes made in apache#14981.

Change-Id: Icee7f5c00d58fc77367c823273fccae128260471
Co-authored-by: Jack Frankland <jack.frankland@arm.com>


2 people authored and thaisacs committed Apr 3, 2024
1 parent f2f624e commit 34122bf
Showing 19 changed files with 212 additions and 50 deletions.
18 changes: 16 additions & 2 deletions python/tvm/relay/op/strategy/arm_cpu.py
@@ -150,7 +150,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
    pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw))
    is_winograd_applicable = (
        "float" in data.dtype
        and "custom" not in data.dtype
        and "float" in kernel.dtype
        and "custom" not in kernel.dtype
        and kh == 3
        and kw == 3
        and stride_h == 1
@@ -315,8 +317,20 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
name="depthwise_conv2d_nchw.x86",
)
elif layout == "NHWC":
assert kernel_layout == "HWOI"
if target.features.is_aarch64 and target.features.has_asimd:
if kernel_layout != "HWOI":
logger.warning(
"""
depthwise_conv2d with layout NHWC and HWOI
kernel layout is not optimized for arm_cpu target.
"""
)
strategy.add_implementation(
wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True),
wrap_topi_schedule(conv2d_generic.schedule_depthwise_conv2d_nhwc),
name="depthwise_conv2d_nhwc.generic",
)

elif target.features.is_aarch64 and target.features.has_asimd:
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.compute_depthwise_conv2d_nhwc),
wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),
4 changes: 2 additions & 2 deletions python/tvm/topi/arm_cpu/injective.py
@@ -16,7 +16,6 @@
# under the License.
# pylint: disable=invalid-name, unused-variable
"""Schedule for pooling operators"""
import numpy as np
import tvm
from tvm import te
from ..utils import is_empty_shape
@@ -69,7 +68,8 @@ def schedule_injective(outs):
    if list(s[x].op.axis):
        # do not vectorize for broadcast
        dtype = "uint16" if x.dtype == "bfloat16" else x.dtype
        (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // np.dtype(dtype).itemsize)
        itemsize = max(1, tvm.DataType(dtype).bits // 8)
        (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // itemsize)
        s[x].vectorize(ii)
    tvm.te.schedule.AutoInlineInjective(s)

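The hunk above swaps np.dtype for tvm.DataType when computing the element
size, since numpy cannot parse every dtype TVM supports. A small sketch of
the new computation (dtype list chosen for illustration):

import tvm

# np.dtype("bfloat16") would raise TypeError; tvm.DataType handles it.
for dtype in ["float32", "int8", "bfloat16"]:
    itemsize = max(1, tvm.DataType(dtype).bits // 8)
    print(dtype, "->", 16 // itemsize)  # split factor that fills a 16-byte vector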
18 changes: 18 additions & 0 deletions src/target/parsers/cpu.cc
@@ -28,7 +28,25 @@ namespace target {
namespace parsers {
namespace cpu {

Optional<String> DetectSystemTriple() {
  auto pf = tvm::runtime::Registry::Get("target.llvm_get_system_triple");
  if (pf != nullptr) {
    return (*pf)();
  }
  return {};
}

TargetJSON ParseTarget(TargetJSON target) {
  String kind = Downcast<String>(target.Get("kind"));
  Optional<String> mtriple = Downcast<Optional<String>>(target.Get("mtriple"));
  Optional<String> mcpu = Downcast<Optional<String>>(target.Get("mcpu"));

  // Try to fill in the blanks by detecting target information from the system
  if (kind == "llvm" && !mtriple.defined() && !mcpu.defined()) {
    String system_triple = DetectSystemTriple().value_or("");
    target.Set("mtriple", system_triple);
  }

  if (mprofile::IsArch(target)) {
    return mprofile::ParseTarget(target);
  }
17 changes: 16 additions & 1 deletion tests/cpp/target_test.cc
@@ -494,10 +494,25 @@ TEST(TargetCreation, DeduplicateKeys) {
  ICHECK_EQ(target->keys.size(), 2U);
  ICHECK_EQ(target->keys[0], "cpu");
  ICHECK_EQ(target->keys[1], "arm_cpu");
  ICHECK_EQ(target->attrs.size(), 1U);
  ICHECK_EQ(target->attrs.size(), 2U);
  ICHECK_EQ(target->GetAttr<String>("device"), "arm_cpu");
}

TEST(TargetCreation, DetectSystemTriple) {
  Map<String, ObjectRef> config = {
      {"kind", String("llvm")},
  };

  Target target = Target(config);
  ICHECK_EQ(target->kind, TargetKind::Get("llvm").value());

  auto pf = tvm::runtime::Registry::Get("target.llvm_get_system_triple");
  if (pf == nullptr) {
    GTEST_SKIP() << "LLVM is not available, skipping test";
  }

  // The parser should have populated mtriple with the detected host triple.
  Optional<String> mtriple = target->GetAttr<String>("mtriple");
  String expected_triple = (*pf)();
  ICHECK_EQ(mtriple.value(), expected_triple);
}

TEST(TargetKindRegistry, ListTargetKinds) {
  Array<String> names = TargetKindRegEntry::ListTargetKinds();
  ICHECK_EQ(names.empty(), false);
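A rough Python analogue of the new C++ test (a sketch, not part of this
commit; tvm.get_global_func with allow_missing=True returns None when TVM
is built without LLVM):

import tvm

detect = tvm.get_global_func("target.llvm_get_system_triple", allow_missing=True)
if detect is not None:
    # The default llvm target should pick up the detected host triple.
    assert tvm.target.Target("llvm").attrs["mtriple"] == detect()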
19 changes: 14 additions & 5 deletions tests/python/auto_scheduler/test_auto_scheduler_search_task.py
@@ -114,7 +114,11 @@ def test_search_task_record():
    assert new_task.task_input_names[1] == "test_input_1"

    # Log with version 0.5
    v5_log = """["[\\\"matmul_auto_scheduler_test\\\", 64, 64, 64]", "llvm -keys=cpu", [6, 64, 64, 0, 0, 0, 0, 0], "", 1]"""
    v5_log = (
        """["[\\\"matmul_auto_scheduler_test\\\", 64, 64, 64]", """
        f'"{str(tvm.target.Target(target))}"'
        """, [6, 64, 64, 0, 0, 0, 0, 0], "", 1]"""
    )
    new_task = auto_scheduler._ffi_api.DeserializeSearchTask(v5_log)
    assert task.workload_key == new_task.workload_key
    assert str(task.target) == str(new_task.target)
@@ -125,12 +129,13 @@

def test_recover_measure_input_with_task_input():
    auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
    target = "llvm"

    # Since this file tests search_task, we only check the search task here

    # Log with no task input
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
        func=matmul_auto_scheduler_test, args=(512, 512, 512), target=target
    )
    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
@@ -147,7 +152,7 @@ def test_recover_measure_input_with_task_input():
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test,
        args=(512, 512, 512),
        target="llvm",
        target=target,
        task_inputs={
            "test_input_0": test_input_0,
        },
@@ -170,7 +175,7 @@
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test,
        args=(512, 512, 512),
        target="llvm",
        target=target,
        task_inputs={
            "test_input_0": test_input_0,
            "test_input_1": test_input_1,
@@ -191,7 +196,11 @@
    assert new_task.task_input_names[1] == "test_input_1"

    # Log with version 0.5
    v5_log = """{"i": [["[\\\"matmul_auto_scheduler_test\\\", 512, 512, 512]", "llvm -keys=cpu", [6, 64, 64, 0, 0, 0, 0, 0], "", 1], [[], []]], "r": [[0.1], 0, 0.2, 1], "v": "v0.6"}"""
    v5_log = (
        """{"i": [["[\\\"matmul_auto_scheduler_test\\\", 512, 512, 512]", """
        f'"{str(tvm.target.Target(target))}"'
        """, [6, 64, 64, 0, 0, 0, 0, 0], "", 1], [[], []]], "r": [[0.1], 0, 0.2, 1], "v": "v0.6"}"""
    )
    measure_log = auto_scheduler.measure_record.load_record_from_string(v5_log)
    new_task = measure_log[0].task
    assert task.workload_key == new_task.workload_key
7 changes: 7 additions & 0 deletions tests/python/autotvm/test_autotvm_graph_tuner_core.py
@@ -148,6 +148,7 @@ def _create_data(target, dshape, dtype, layout):
    return net, records, ltf_records, ltf_keys, tasks


@tvm.testing.requires_x86
def test_graph_tuner_layout_transform():
    log_file = "%s/test_tuner.log" % (os.getcwd())
    target = "llvm"
@@ -188,6 +189,7 @@ def test_graph_tuner_layout_transform():
    )


@tvm.testing.requires_x86
def test_graph_tuner_layout_transform_runner():
    log_file = "%s/test_tuner.log" % (os.getcwd())
    target = "llvm"
@@ -231,6 +233,7 @@ def test_graph_tuner_layout_transform_runner():
    )


@tvm.testing.requires_x86
def test_DPTuner_run():
    log_file = "%s/test_tuner.log" % (os.getcwd())
    target = "llvm"
@@ -295,6 +298,7 @@ def test_DPTuner_run():
    assert os.path.isfile(log_file), "No log file with name %s exists." % log_file


@tvm.testing.requires_x86
def test_PBQPTuner_run():
    target = "llvm"
    dtype = "float32"
@@ -355,6 +359,7 @@ def test_PBQPTuner_run():
    )


@tvm.testing.requires_x86
def test_many_sub_graphs():
    target = "llvm"
    dtype = "float32"
@@ -517,6 +522,7 @@ def test_many_sub_graphs():
    )


@tvm.testing.requires_x86
def test_tuple():
    target = "llvm"
    dtype = "float32"
@@ -629,6 +635,7 @@ def test_tuple():
    )


@tvm.testing.requires_x86
def test_triangle_block():
    target = "llvm"
    dtype = "float32"
59 changes: 47 additions & 12 deletions tests/python/frontend/tflite/test_forward.py
@@ -23,7 +23,7 @@
from __future__ import print_function
from functools import partial
from distutils.version import LooseVersion

import platform
import os
import tempfile
import typing
@@ -1092,35 +1092,56 @@ def test_forward_quantized_convolution():
    )

    _test_tflite2_quantized_convolution(
        (1, 16, 10, 10),
        (3, 3),
        2,
        (2, 32, 28, 28),
        (1, 1),
        16,
        data_format="NCWH",
        int_quant_dtype=int_quant_dtype,
        groups=2,
        groups=8,
    )

    if platform.machine() == "aarch64":
        pytest.skip(
            reason=(
                "Grouped convolution type inference error for `arm_cpu`. "
                "See https://github.com/apache/tvm/issues/16532"
            )
        )

    _test_tflite2_quantized_convolution(
        (2, 32, 28, 28),
        (1, 1),
        16,
        (1, 16, 10, 10),
        (3, 3),
        2,
        data_format="NCWH",
        int_quant_dtype=int_quant_dtype,
        groups=8,
        groups=2,
    )


def test_forward_quantized_depthwise_convolution():
    """Test qnn.conv2d depthwise compiled with TVM against TFLite reference."""
    for int_quant_dtype in [tf.int8, tf.int16]:
        _test_tflite2_quantized_depthwise_convolution(
            [1, 8, 8, 128], [1, 1, 128, 1], [1, 1], [1, 1], "SAME", "NHWC", 1, int_quant_dtype
        )
        _test_tflite2_quantized_depthwise_convolution(
            [1, 17, 17, 12], [3, 3, 12, 1], [1, 1], [2, 2], "VALID", "NHWC", 1, int_quant_dtype
        )
        _test_tflite2_quantized_depthwise_convolution(
            [1, 24, 24, 3], [7, 7, 3, 8], [1, 1], [2, 2], "SAME", "NHWC", 8, int_quant_dtype
        )
    _test_tflite2_quantized_depthwise_convolution(
        [1, 8, 8, 128], [1, 1, 128, 1], [1, 1], [1, 1], "SAME", "NHWC", 1, tf.int8
    )

    if platform.machine() == "aarch64":
        pytest.skip(
            reason=(
                "Tensor intrinsic data type mismatch error. "
                "See https://github.com/apache/tvm/issues/16533"
            )
        )

    _test_tflite2_quantized_depthwise_convolution(
        [1, 8, 8, 128], [1, 1, 128, 1], [1, 1], [1, 1], "SAME", "NHWC", 1, tf.int16
    )


def _test_tflite2_quantized_depthwise_convolution(
@@ -5090,6 +5111,10 @@ def test_forward_qnn_mobilenet_v3_net():
    tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)


@pytest.mark.skipif(
    platform.machine() == "aarch64",
    reason="Fails with an output mismatch. See https://github.com/apache/tvm/issues/16534",
)
def test_forward_tflite2_qnn_resnet50():
    """Test the Quantized TFLite version 2.1.0 Resnet50 model."""
    if package_version.parse(tf.VERSION) >= package_version.parse("2.1.0"):
@@ -5186,6 +5211,11 @@ def test_forward_tflite_float16():
    tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)


@pytest.mark.skipif(
    platform.machine() == "aarch64",
    reason="Fails during legalization due to int16 datatype. "
    "See https://github.com/apache/tvm/issues/16535",
)
def test_forward_mobilenet_int16():
    """Test int16 quantized model"""
    # MobilenetV2
@@ -5228,6 +5258,11 @@ def representative_dataset():
    tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)


@pytest.mark.skipif(
    platform.machine() == "aarch64",
    reason="Fails during legalization due to int16 datatype. "
    "See https://github.com/apache/tvm/issues/16535",
)
def test_forward_ds_cnn_int16():
    """Test DS_CNN int16 quantized model"""
    tflite_model_file = download_testdata(
2 changes: 1 addition & 1 deletion tests/python/integration/test_legacy_tuning.py
@@ -353,7 +353,7 @@ def @main(%a : Tensor[(1, 3, 32, 32), float32], %b : Tensor[(3, 3, 5, 5), float3
    tasks = autotvm.task.relay_integration.extract_from_program(
        ir_mod, {}, tvm.target.create("llvm")
    )
    assert len(tasks) == 1, f"Extracted != 1 task from program: {tasks!r}"
    assert len(tasks) >= 1, f"Extracted no tasks from program: {tasks!r}"

    task = tasks[0]
