[Autoscheduler][Sparse] Add sparse dense end-to-end model tuning support for x86/ARM CPU & some bug fixes #7635

Merged: 27 commits, Mar 30, 2021
Changes from 17 commits
6 changes: 3 additions & 3 deletions python/tvm/auto_scheduler/measure.py
@@ -775,7 +775,7 @@ def register(myf):
return register


def _prepare_input_map(args):
def prepare_input_map(args):
"""This function deals with special task inputs. Map the input Tensor of a TVM subgraph
to a specific buffer name in the global buffer map.

@@ -861,7 +861,7 @@ def _timed_eval_func(
random_fill = tvm.get_global_func("tvm.contrib.random.random_fill", True)
assert random_fill, "Please make sure USE_RANDOM is ON in the config.cmake"

tensor_input_map = _prepare_input_map(build_res.args) if task_input_names else {}
tensor_input_map = prepare_input_map(build_res.args) if task_input_names else {}
args = []
task_inputs_count = 0
for arg in build_res.args:
@@ -1076,7 +1076,7 @@ def _timed_rpc_run(
random_fill
), "Please make sure USE_RANDOM is ON in the config.cmake on the remote devices"

tensor_input_map = _prepare_input_map(build_res.args) if task_input_names else {}
tensor_input_map = prepare_input_map(build_res.args) if task_input_names else {}
args = []
task_inputs_count = 0
for arg in build_res.args:
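For context, a minimal sketch of how the now-public helper is consumed by the timed evaluation functions above. fill_special_inputs and its placeholder return value are illustrative assumptions; only prepare_input_map and get_task_input_buffer come from this PR.

# Hedged sketch, not part of the diff: replace special task inputs (e.g. sparse weight
# buffers) with their registered data before running a measurement.
from tvm.auto_scheduler.measure import prepare_input_map
from tvm.auto_scheduler.search_task import get_task_input_buffer

def fill_special_inputs(build_args, workload_key, task_input_names):
    # Map each argument tensor of the built subgraph to its special buffer name, if any.
    tensor_input_map = prepare_input_map(build_args) if task_input_names else {}
    args = []
    for arg in build_args:
        if arg in tensor_input_map and tensor_input_map[arg] in task_input_names:
            # Special input: fetch the pre-registered buffer instead of random data.
            args.append(get_task_input_buffer(workload_key, tensor_input_map[arg]))
        else:
            args.append(None)  # placeholder where a randomly filled ndarray would go
    return args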
30 changes: 28 additions & 2 deletions python/tvm/auto_scheduler/relay_integration.py
@@ -142,6 +142,12 @@ def extract_tasks(
# When auto scheduler is used in end to end network, try to apply layout rewrite
# to improve the overall performance
layout_rewrite_option=LayoutRewriteOption.get_target_default(target, True),
task_inputs=(
env.wkl_key_to_input_names[wkl_key]
if wkl_key in env.wkl_key_to_input_names
else None
),
task_inputs_save_to_file=True,
)
)
weights.append(weight)
@@ -166,6 +172,7 @@ def __init__(self, tracing_mode):
self.tracing_mode = tracing_mode
self.relay_disable_build_cache = "false"
self.wkl_key_to_weight = {}
self.wkl_key_to_input_names = {}

def __enter__(self):
TracingEnvironment.current = self
@@ -175,17 +182,30 @@ def __exit__(self, exc_type, exc_val, exc_tb):
TracingEnvironment.current = None

def add_workload_key(self, workload_key):
"""Add the workload key of a search task
"""Add the workload key of a search task.

Parameters
----------
workload_key: str
The workload key of a task
The workload key of a task.
"""
if workload_key not in self.wkl_key_to_weight:
self.wkl_key_to_weight[workload_key] = 0
self.wkl_key_to_weight[workload_key] += 1

def add_workload_input_names(self, workload_key, input_names):
"""Add special task inputs to this workload.

Parameters
----------
workload_key : str
The workload key of a task.

input_names : List[str]
A list of input names.
"""
self.wkl_key_to_input_names[workload_key] = input_names


@tvm._ffi.register_func("auto_scheduler.enter_layout_rewrite")
def enter_layout_rewrite():
@@ -274,6 +294,9 @@ def auto_schedule_topi(outs):
None in the tracing mode so that the fallback topi schedule will be used.
"""
# pylint: disable=import-outside-toplevel
from tvm.auto_scheduler.measure import (
prepare_input_map,
) # lazily import to avoid recursive dependency

io_tensors, has_layout_free, has_complex_op = traverse_to_get_io_tensors(outs)
if not io_tensors: # The compute includes dynamic shapes which are not supported yet.
@@ -305,6 +328,9 @@ def auto_schedule_topi(outs):
# in the task extraction mode
if has_complex_op or env.tracing_mode == TracingMode.EXTRACT_TASK:
env.add_workload_key(key)
input_map = prepare_input_map(io_tensors)
if input_map:
env.add_workload_input_names(key, list(input_map.values()))
elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE:
# in prepare_layout_rewrite mode
if (
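A hedged sketch of the end-to-end flow these changes enable: extract_tasks now records the special input names of each workload (task_inputs) and saves the buffers to disk, so a normal tuning loop picks up the sparse weights automatically. The model, target, and trial budget below are placeholders.

# Hedged usage sketch; mod/params are assumed to come from a relay frontend.
from tvm import auto_scheduler

def tune_network(mod, params, target="llvm", log_file="sparse_tune.json"):
    # task_inputs / task_inputs_save_to_file are filled in internally by extract_tasks.
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # illustrative trial budget
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)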
9 changes: 7 additions & 2 deletions python/tvm/auto_scheduler/search_task.py
@@ -299,13 +299,18 @@ def get_task_input_buffer(workload_key, input_name):
TASK_INPUT_BUFFER_TABLE[workload_key] = {}
input_table = TASK_INPUT_BUFFER_TABLE[workload_key]

if input_name not in input_table.keys():
if input_name not in input_table:
# Try to load buffer data from local file
tensor_from_file = _try_load_buffer_from_file(input_name)
if tensor_from_file:
input_table[input_name] = tensor_from_file

if input_name in input_table.keys():
# Then check the default table; input names extracted from a relay model are stored
# here because the workload_key is not available at that time
if input_name not in input_table:
input_table = TASK_INPUT_BUFFER_TABLE["default"]

if input_name in input_table:
return input_table[input_name]

raise ValueError(
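A small sketch of the lookup order added above: buffers registered under the "default" workload key (as process_params does, since no workload key exists at that point) are found when a task's own table misses. The buffer name and data below are illustrative, not taken from the diff.

# Hedged sketch of the "default" table fallback; name and contents are made up.
import numpy as np
import tvm
from tvm.auto_scheduler.search_task import (
    get_task_input_buffer,
    register_task_input_buffer,
)

name = "sparse_dense_bsr_3072_768_16_1_0.15_W_data"  # hypothetical buffer name
register_task_input_buffer("default", name, tvm.nd.array(np.zeros((8, 16, 1), "float32")))

# A later lookup with a concrete workload key misses its own table and
# falls back to the "default" entries registered above.
buf = get_task_input_buffer("some_workload_key", name)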
23 changes: 23 additions & 0 deletions python/tvm/relay/analysis/sparse_dense.py
@@ -73,6 +73,12 @@ def process_params(expr, params, block_size, sparsity_threshold):
ret : Namedtuple[weight_name: Array[String], weight_shape: Array[Array[IntImm]]]
return names of qualified dense weight and the shape in BSR format
"""

# pylint: disable=import-outside-toplevel
from tvm.auto_scheduler.search_task import (
register_task_input_buffer,
) # lazily import to avoid recursive dependency

memo = SparseAnalysisResult(weight_name=[], weight_shape=[])
weight_names = _search_dense_op_weight(expr)
for name in weight_names:
@@ -92,6 +98,23 @@ def process_params(expr, params, block_size, sparsity_threshold):
params[name + ".data"] = tvm.nd.array(sparse_weight.data)
params[name + ".indices"] = tvm.nd.array(sparse_weight.indices)
params[name + ".indptr"] = tvm.nd.array(sparse_weight.indptr)

prefix = "sparse_dense_bsr_%d_%d_%d_%d_%.2f_" % (
w_np.shape[0],
w_np.shape[1],
block_size[0],
block_size[1],
1 - sparsity,
)
register_task_input_buffer(
"default", prefix + "W_data", tvm.runtime.ndarray.array(sparse_weight.data)
)
register_task_input_buffer(
"default", prefix + "W_indices", tvm.runtime.ndarray.array(sparse_weight.indices)
)
register_task_input_buffer(
"default", prefix + "W_indptr", tvm.runtime.ndarray.array(sparse_weight.indptr)
)

Review thread on the register_task_input_buffer calls:
Contributor: Let's not hard-code it; we can use {name + ".data", name + ".indices", name + ".indptr"}.
Author: The problem is that we cannot get the "name" during measuring.
Contributor: Okay, thanks for the clarification. But if the name is not available, then how does the logic above that builds prefix work (I mean line 98)? It's in the same flow, right? Please let me know in case I am mistaken.
ret = SparseAnalysisResult(
weight_name=tvm.runtime.convert(memo.weight_name),
weight_shape=tvm.runtime.convert(memo.weight_shape),
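A worked example of the buffer-name convention used above, with illustrative numbers. The measured TE subgraph has to rebuild the same prefix from shape information alone (see the "%s_%d_%d_%d_%d_%.2f_" fix in python/tvm/topi/nn/sparse.py below), which is why the name encodes weight shape, block size, and density rather than the relay parameter name.

# Hedged, illustrative numbers only.
n, k = 3072, 768          # dense weight shape: (output units, reduction dim)
bs_r, bs_c = 16, 1        # BSR block size
density = 0.15            # 1 - sparsity
prefix = "sparse_dense_bsr_%d_%d_%d_%d_%.2f_" % (n, k, bs_r, bs_c, density)
assert prefix == "sparse_dense_bsr_3072_768_16_1_0.15_"
# The three BSR arrays are then registered as
#   prefix + "W_data", prefix + "W_indices", prefix + "W_indptr"
# under the "default" workload key.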
2 changes: 1 addition & 1 deletion python/tvm/relay/frontend/tflite.py
@@ -1872,7 +1872,7 @@ def convert_fully_connected(self, op):
out_dtype="int32",
)
else:
out = _op.nn.dense(in_expr, weight_expr)
out = _op.nn.dense(in_expr, weight_expr, units=weight_shape[0])
Review comment (Contributor): One small suggestion: if possible, can we add a test case that covers the issue fixed here? That would help catch future breaks.


# if we have bias
if len(input_tensors) == 3:
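Regarding the test-case suggestion above, a hedged sketch of the kind of check that could cover this: with units=weight_shape[0] the dense call carries an explicit output-unit attribute that downstream passes (such as the sparse_dense conversion) can rely on. The shapes are illustrative, and this sketch does not exercise the TFLite importer itself.

# Hedged sketch only; not a full TFLite regression test.
from tvm import relay

data = relay.var("data", shape=(1, 128), dtype="float32")
weight = relay.var("weight", shape=(64, 128), dtype="float32")
out = relay.nn.dense(data, weight, units=64)

# The units attribute is now populated on the call node.
assert int(out.attrs.units) == 64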
37 changes: 36 additions & 1 deletion python/tvm/relay/op/strategy/arm_cpu.py
@@ -20,6 +20,7 @@
import logging

from tvm import topi
from tvm.auto_scheduler import is_auto_scheduler_enabled
from ....target import arm_isa
from .generic import *
from .. import op as _op
@@ -135,6 +136,38 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
name="conv2d_direct_simd.micro_dev",
)
elif kernel_layout == "HWIO":
if is_auto_scheduler_enabled():
strategy.add_implementation(
wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True),
naive_schedule,
name="conv2d_nhwc.arm_cpu",
plevel=100,
)
judge_winograd_auto_scheduler = False
if len(kernel.shape) == 4:
kernel_h, kernel_w, _, co = get_const_tuple(kernel.shape)
judge_winograd_auto_scheduler = (
"float" in data.dtype
and "float" in kernel.dtype
and kernel_h == 3
and kernel_w == 3
and stride_h == 1
and stride_w == 1
and dilation_h == 1
and dilation_w == 1
and 64 <= co < 512
)
# register auto-scheduler implementations
if judge_winograd_auto_scheduler:
strategy.add_implementation(
wrap_compute_conv2d(
topi.nn.conv2d_winograd_nhwc, need_auto_scheduler_layout=True
),
naive_schedule, # this implementation should never be picked by autotvm
name="conv2d_nhwc.winograd.arm_cpu",
plevel=101,
)

is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm()
has_dot_prod = topi.arm_cpu.arm_utils.is_dotprod_available()
if has_dot_prod and data.dtype in ["int8", "uint8"]:
@@ -198,7 +231,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
elif layout == "NHWC":
assert kernel_layout == "HWOI"
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.compute_depthwise_conv2d_nhwc),
wrap_compute_conv2d(
topi.arm_cpu.compute_depthwise_conv2d_nhwc, need_auto_scheduler_layout=True
),
wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),
name="depthwise_conv2d_nhwc.arm_cpu",
)
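A minimal, hedged sketch of when the auto-scheduler implementations registered above are actually selected: is_auto_scheduler_enabled() returns True only while relay.build runs under a PassContext with relay.backend.use_auto_scheduler set, so the plevel-100/101 entries do not affect the default AutoTVM flow. The log file name is a placeholder for a tuning log produced earlier.

# Hedged sketch; mod/params/target are assumed to come from the tuning flow above.
import tvm
from tvm import auto_scheduler, relay

def build_with_auto_scheduler(mod, params, target, log_file="sparse_tune.json"):
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            return relay.build(mod, target=target, params=params)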
2 changes: 1 addition & 1 deletion python/tvm/relay/op/strategy/x86.py
@@ -196,7 +196,7 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
"depthwise_conv2d NHWC layout is not optimized for x86 with autotvm."
)
strategy.add_implementation(
wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_auto_scheduler_layout=True),
wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc),
name="depthwise_conv2d_nhwc.generic",
)
2 changes: 1 addition & 1 deletion python/tvm/topi/nn/sparse.py
@@ -426,7 +426,7 @@ def _process_inputs(input_tensors, m, n, prefix_init):
density *= i
density /= k * n
density = density.value
sparse_prefix = "%s_%d_%d_%d_%d_%d_%.2f_" % (prefix_init, m, n, k, bs_r, bs_c, density)
sparse_prefix = "%s_%d_%d_%d_%d_%.2f_" % (prefix_init, n, k, bs_r, bs_c, density)

visited = set()

49 changes: 49 additions & 0 deletions python/tvm/topi/sparse/utils.py
@@ -0,0 +1,49 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Some utils for Sparse operation."""


def random_bsr_matrix(m, n, bs_r, bs_c, density, dtype):
"""Generate a random sparse matrix in bsr format.

Returns
-------
scipy.sparse.bsr_matrix
"""
# pylint: disable=import-outside-toplevel
import numpy as np
import itertools
import scipy.sparse as sp

y = np.zeros((m, n), dtype=dtype)
assert m % bs_r == 0
assert n % bs_c == 0
nnz = int(density * m * n)
num_blocks = int(nnz / (bs_r * bs_c)) + 1
candidate_blocks = np.asarray(list(itertools.product(range(0, m, bs_r), range(0, n, bs_c))))
assert candidate_blocks.shape[0] == m // bs_r * n // bs_c
chosen_blocks = candidate_blocks[
np.random.choice(candidate_blocks.shape[0], size=num_blocks, replace=False)
]
# pylint: disable=invalid-name
for (r, c) in chosen_blocks:
y[r : r + bs_r, c : c + bs_c] = np.random.randn(bs_r, bs_c)
s = sp.bsr_matrix(y, blocksize=(bs_r, bs_c))
assert s.data.shape == (num_blocks, bs_r, bs_c)
assert s.indices.shape == (num_blocks,)
assert s.indptr.shape == (m // bs_r + 1,)
return s
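A quick usage sketch of the new helper (import path assumed from the file location); the numbers are illustrative. The returned scipy bsr_matrix carries the data/indices/indptr arrays that sparse_dense consumes and that get registered as task input buffers during tuning.

# Hedged usage sketch.
from tvm.topi.sparse.utils import random_bsr_matrix

w_sp = random_bsr_matrix(3072, 768, 16, 1, density=0.15, dtype="float32")
print(w_sp.data.shape, w_sp.indices.shape, w_sp.indptr.shape)
# -> (num_blocks, 16, 1) (num_blocks,) (3072 // 16 + 1,)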