apache · ekalda · Mar 14, 2024 · Feb 2, 2024 · Feb 5, 2024 · Feb 6, 2024
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -150,7 +150,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw))
                 is_winograd_applicable = (
                     "float" in data.dtype
+                    and "custom" not in data.dtype
                     and "float" in kernel.dtype
+                    and "custom" not in kernel.dtype
                     and kh == 3
                     and kw == 3
                     and stride_h == 1
@@ -315,8 +317,20 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     name="depthwise_conv2d_nchw.x86",
                 )
         elif layout == "NHWC":
-            assert kernel_layout == "HWOI"
-            if target.features.is_aarch64 and target.features.has_asimd:
+            if kernel_layout != "HWOI":
+                logger.warning(
+                    """
+                    depthwise_conv2d with layout NHWC and HWOI
+                    kernel layout is not optimized for arm_cpu target.
+                    """
+                )
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True),
+                    wrap_topi_schedule(conv2d_generic.schedule_depthwise_conv2d_nhwc),
+                    name="depthwise_conv2d_nhwc.generic",
+                )
+
+            elif target.features.is_aarch64 and target.features.has_asimd:
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.compute_depthwise_conv2d_nhwc),
                     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),

diff --git a/python/tvm/topi/arm_cpu/injective.py b/python/tvm/topi/arm_cpu/injective.py
@@ -16,7 +16,6 @@
 # under the License.
 # pylint: disable=invalid-name, unused-variable
 """Schedule for pooling operators"""
-import numpy as np
 import tvm
 from tvm import te
 from ..utils import is_empty_shape
@@ -69,7 +68,8 @@ def schedule_injective(outs):
     if list(s[x].op.axis):
         # do not vectorize for broadcast
         dtype = "uint16" if x.dtype == "bfloat16" else x.dtype
-        (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // np.dtype(dtype).itemsize)
+        itemsize = max(1, tvm.DataType(dtype).bits // 8)
+        (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // itemsize)
         s[x].vectorize(ii)
     tvm.te.schedule.AutoInlineInjective(s)
 

diff --git a/src/target/parsers/cpu.cc b/src/target/parsers/cpu.cc
@@ -28,7 +28,25 @@ namespace target {
 namespace parsers {
 namespace cpu {
 
+Optional<String> DetectSystemTriple() {
+  auto pf = tvm::runtime::Registry::Get("target.llvm_get_system_triple");
+  if (pf->defined()) {
+    return (*pf)();
+  }
+  return {};
+}
+
 TargetJSON ParseTarget(TargetJSON target) {
+  String kind = Downcast<String>(target.Get("kind"));
+  Optional<String> mtriple = Downcast<Optional<String>>(target.Get("mtriple"));
+  Optional<String> mcpu = Downcast<Optional<String>>(target.Get("mcpu"));
+
+  // Try to fill in the blanks by detecting target information from the system
+  if (kind == "llvm" && !mtriple.defined() && !mcpu.defined()) {
+    String system_triple = DetectSystemTriple().value_or("");
+    target.Set("mtriple", system_triple);
+  }
+
   if (mprofile::IsArch(target)) {
     return mprofile::ParseTarget(target);
   }

diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc
@@ -494,10 +494,25 @@ TEST(TargetCreation, DeduplicateKeys) {
   ICHECK_EQ(target->keys.size(), 2U);
   ICHECK_EQ(target->keys[0], "cpu");
   ICHECK_EQ(target->keys[1], "arm_cpu");
-  ICHECK_EQ(target->attrs.size(), 1U);
+  ICHECK_EQ(target->attrs.size(), 2U);
   ICHECK_EQ(target->GetAttr<String>("device"), "arm_cpu");
 }
 
+TEST(TargetCreation, DetectSystemTriple) {
+  Map<String, ObjectRef> config = {
+      {"kind", String("llvm")},
+  };
+
+  Target target = Target(config);
+  ICHECK_EQ(target->kind, TargetKind::Get("llvm").value());
+
+  Optional<String> mtriple = target->GetAttr<String>("mtriple");
+  auto pf = tvm::runtime::Registry::Get("target.llvm_get_system_triple");
+  if (!pf->defined()) {
+    GTEST_SKIP() << "LLVM is not available, skipping test";
+  }
+}
+
 TEST(TargetKindRegistry, ListTargetKinds) {
   Array<String> names = TargetKindRegEntry::ListTargetKinds();
   ICHECK_EQ(names.empty(), false);

diff --git a/tests/python/auto_scheduler/test_auto_scheduler_search_task.py b/tests/python/auto_scheduler/test_auto_scheduler_search_task.py
@@ -114,7 +114,11 @@ def test_search_task_record():
     assert new_task.task_input_names[1] == "test_input_1"
 
     # Log with version 0.5
-    v5_log = """["[\\\"matmul_auto_scheduler_test\\\", 64, 64, 64]", "llvm -keys=cpu", [6, 64, 64, 0, 0, 0, 0, 0], "", 1]"""
+    v5_log = (
+        """["[\\\"matmul_auto_scheduler_test\\\", 64, 64, 64]", """
+        f'"{str(tvm.target.Target(target))}"'
+        """, [6, 64, 64, 0, 0, 0, 0, 0], "", 1]"""
+    )
     new_task = auto_scheduler._ffi_api.DeserializeSearchTask(v5_log)
     assert task.workload_key == new_task.workload_key
     assert str(task.target) == str(new_task.target)
@@ -125,12 +129,13 @@ def test_search_task_record():
 
 def test_recover_measure_input_with_task_input():
     auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
+    target = "llvm"
 
     # Since this file is tests for search_task, we only check the search_task here
 
     # Log with no task input
     task = auto_scheduler.SearchTask(
-        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
+        func=matmul_auto_scheduler_test, args=(512, 512, 512), target=target
     )
     inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
     res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
@@ -147,7 +152,7 @@ def test_recover_measure_input_with_task_input():
     task = auto_scheduler.SearchTask(
         func=matmul_auto_scheduler_test,
         args=(512, 512, 512),
-        target="llvm",
+        target=target,
         task_inputs={
             "test_input_0": test_input_0,
         },
@@ -170,7 +175,7 @@ def test_recover_measure_input_with_task_input():
     task = auto_scheduler.SearchTask(
         func=matmul_auto_scheduler_test,
         args=(512, 512, 512),
-        target="llvm",
+        target=target,
         task_inputs={
             "test_input_0": test_input_0,
             "test_input_1": test_input_1,
@@ -191,7 +196,11 @@ def test_recover_measure_input_with_task_input():
     assert new_task.task_input_names[1] == "test_input_1"
 
     # Log with version 0.5
-    v5_log = """{"i": [["[\\\"matmul_auto_scheduler_test\\\", 512, 512, 512]", "llvm -keys=cpu", [6, 64, 64, 0, 0, 0, 0, 0], "", 1], [[], []]], "r": [[0.1], 0, 0.2, 1], "v": "v0.6"}"""
+    v5_log = (
+        """{"i": [["[\\\"matmul_auto_scheduler_test\\\", 512, 512, 512]", """
+        f'"{str(tvm.target.Target(target))}"'
+        """, [6, 64, 64, 0, 0, 0, 0, 0], "", 1], [[], []]], "r": [[0.1], 0, 0.2, 1], "v": "v0.6"}"""
+    )
     measure_log = auto_scheduler.measure_record.load_record_from_string(v5_log)
     new_task = measure_log[0].task
     assert task.workload_key == new_task.workload_key

diff --git a/tests/python/autotvm/test_autotvm_graph_tuner_core.py b/tests/python/autotvm/test_autotvm_graph_tuner_core.py
@@ -148,6 +148,7 @@ def _create_data(target, dshape, dtype, layout):
     return net, records, ltf_records, ltf_keys, tasks
 
 
+@tvm.testing.requires_x86
 def test_graph_tuner_layout_transform():
     log_file = "%s/test_tuner.log" % (os.getcwd())
     target = "llvm"
@@ -188,6 +189,7 @@ def test_graph_tuner_layout_transform():
         )
 
 
+@tvm.testing.requires_x86
 def test_graph_tuner_layout_transform_runner():
     log_file = "%s/test_tuner.log" % (os.getcwd())
     target = "llvm"
@@ -231,6 +233,7 @@ def test_graph_tuner_layout_transform_runner():
         )
 
 
+@tvm.testing.requires_x86
 def test_DPTuner_run():
     log_file = "%s/test_tuner.log" % (os.getcwd())
     target = "llvm"
@@ -295,6 +298,7 @@ def test_DPTuner_run():
     assert os.path.isfile(log_file), "No log file with name %s exists." % log_file
 
 
+@tvm.testing.requires_x86
 def test_PBQPTuner_run():
     target = "llvm"
     dtype = "float32"
@@ -355,6 +359,7 @@ def test_PBQPTuner_run():
     )
 
 
+@tvm.testing.requires_x86
 def test_many_sub_graphs():
     target = "llvm"
     dtype = "float32"
@@ -517,6 +522,7 @@ def test_many_sub_graphs():
     )
 
 
+@tvm.testing.requires_x86
 def test_tuple():
     target = "llvm"
     dtype = "float32"
@@ -629,6 +635,7 @@ def test_tuple():
     )
 
 
+@tvm.testing.requires_x86
 def test_triangle_block():
     target = "llvm"
     dtype = "float32"

diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
@@ -23,7 +23,7 @@
 from __future__ import print_function
 from functools import partial
 from distutils.version import LooseVersion
-
+import platform
 import os
 import tempfile
 import typing
@@ -1092,35 +1092,56 @@ def test_forward_quantized_convolution():
         )
 
         _test_tflite2_quantized_convolution(
-            (1, 16, 10, 10),
-            (3, 3),
-            2,
+            (2, 32, 28, 28),
+            (1, 1),
+            16,
             data_format="NCWH",
             int_quant_dtype=int_quant_dtype,
-            groups=2,
+            groups=8,
         )
 
+        if platform.machine() == "aarch64":
+            pytest.skip(
+                reason=(
+                    "Grouped convolution type inference error for `arm_cpu`. "
+                    "See https://github.com/apache/tvm/issues/16532"
+                )
+            )
+
         _test_tflite2_quantized_convolution(
-            (2, 32, 28, 28),
-            (1, 1),
-            16,
+            (1, 16, 10, 10),
+            (3, 3),
+            2,
             data_format="NCWH",
             int_quant_dtype=int_quant_dtype,
-            groups=8,
+            groups=2,
         )
 
 
 def test_forward_quantized_depthwise_convolution():
+    """Test qnn.conv2d depthwise compiled with TVM against TFLite reference."""
     for int_quant_dtype in [tf.int8, tf.int16]:
-        _test_tflite2_quantized_depthwise_convolution(
-            [1, 8, 8, 128], [1, 1, 128, 1], [1, 1], [1, 1], "SAME", "NHWC", 1, int_quant_dtype
-        )
         _test_tflite2_quantized_depthwise_convolution(
             [1, 17, 17, 12], [3, 3, 12, 1], [1, 1], [2, 2], "VALID", "NHWC", 1, int_quant_dtype
         )
         _test_tflite2_quantized_depthwise_convolution(
             [1, 24, 24, 3], [7, 7, 3, 8], [1, 1], [2, 2], "SAME", "NHWC", 8, int_quant_dtype
         )
+    _test_tflite2_quantized_depthwise_convolution(
+        [1, 8, 8, 128], [1, 1, 128, 1], [1, 1], [1, 1], "SAME", "NHWC", 1, tf.int8
+    )
+
+    if platform.machine() == "aarch64":
+        pytest.skip(
+            reason=(
+                "Tensor intrinsic data type mismatch error. "
+                "See https://github.com/apache/tvm/issues/16533"
+            )
+        )
+
+    _test_tflite2_quantized_depthwise_convolution(
+        [1, 8, 8, 128], [1, 1, 128, 1], [1, 1], [1, 1], "SAME", "NHWC", 1, tf.int16
+    )
 
 
 def _test_tflite2_quantized_depthwise_convolution(
@@ -5090,6 +5111,10 @@ def test_forward_qnn_mobilenet_v3_net():
     tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Fails with an output mismatch. See https://github.com/apache/tvm/issues/16534",
+)
 def test_forward_tflite2_qnn_resnet50():
     """Test the Quantized TFLite version 2.1.0 Resnet50 model."""
     if package_version.parse(tf.VERSION) >= package_version.parse("2.1.0"):
@@ -5186,6 +5211,11 @@ def test_forward_tflite_float16():
     tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Fails during leagalization due to int16 datatype. "
+    "See https://github.com/apache/tvm/issues/16535",
+)
 def test_forward_mobilenet_int16():
     """Test int16 quantized model"""
     # MobilenetV2
@@ -5228,6 +5258,11 @@ def representative_dataset():
     tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Fails during leagalization due to int16 datatype. "
+    "See https://github.com/apache/tvm/issues/16535",
+)
 def test_forward_ds_cnn_int16():
     """Test DS_CNN int16 quantized model"""
     tflite_model_file = download_testdata(

diff --git a/tests/python/integration/test_legacy_tuning.py b/tests/python/integration/test_legacy_tuning.py
@@ -353,7 +353,7 @@ def @main(%a : Tensor[(1, 3, 32, 32), float32], %b : Tensor[(3, 3, 5, 5), float3
         tasks = autotvm.task.relay_integration.extract_from_program(
             ir_mod, {}, tvm.target.create("llvm")
         )
-        assert len(tasks) == 1, f"Extracted != 1 task from program: {tasks!r}"
+        assert len(tasks) >= 1, f"Extracted no tasks from program: {tasks!r}"
 
         task = tasks[0]