forked from apache/tvm
Commit
[MetaSchedule] Schedule Rule: Parallelize-Vectorize-Unroll (apache#10033)

Co-authored-by: Junru Shao <junrushao1994@gmail.com>
Co-authored-by: Xiyou Zhou <xiyou@octoml.ai>
Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>
Co-authored-by: Siyuan Feng <Hzfengsy@sjtu.edu.cn>
Co-authored-by: Hongyi Jin <3231950289@qq.com>
Co-authored-by: Wuwei Lin <wuwei@apache.org>
Showing 7 changed files with 347 additions and 7 deletions. Only the three newly added files are reproduced below.
python/tvm/meta_schedule/schedule_rule/parallel_vectorize_unroll.py (64 additions, 0 deletions)
@@ -0,0 +1,64 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Rule that marks parallelize, vectorize and unroll on the root block. The marks will be applied
to each block in a follow-up postprocessor."""
from typing import List, Optional

from tvm._ffi import register_object

from .. import _ffi_api
from .schedule_rule import ScheduleRule


@register_object("meta_schedule.ParallelizeVectorizeUnroll")
class ParallelizeVectorizeUnroll(ScheduleRule):
    """Rule that marks parallelize, vectorize and unroll on the root block. The marks will be
    applied to each block in a follow-up postprocessor.

    Parameters
    ----------
    max_jobs_per_core: int
        The maximum number of jobs to be launched per CPU core. It sets the upper limit of CPU
        parallelism, i.e. `num_cores * max_jobs_per_core`.
        Use -1 to disable parallelism.
    max_vectorize_extent: int
        The maximum extent to be vectorized. It sets the upper limit of the hardware target
        vectorization.
        Use -1 to disable vectorization.
    unroll_max_steps: Optional[List[int]]
        The candidate options for the maximum number of unroll steps to be done.
        Use None to disable unrolling.
    unroll_explicit: bool
        Whether to explicitly unroll the loop, or just add an "unroll" pragma.
    """

    def __init__(
        self,
        max_jobs_per_core: int = 16,
        max_vectorize_extent: int = 16,
        unroll_max_steps: Optional[List[int]] = None,
        unroll_explicit: bool = True,
    ) -> None:
        if unroll_max_steps is None:
            unroll_max_steps = []
        self.__init_handle_by_constructor__(
            _ffi_api.ScheduleRuleParallelizeVectorizeUnroll,  # type: ignore # pylint: disable=no-member
            max_jobs_per_core,
            max_vectorize_extent,
            unroll_max_steps,
            unroll_explicit,
        )
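
For orientation, here is a minimal usage sketch of the binding above. It only constructs the rule; in practice the object is passed to a TuneContext via `sch_rules`, as the test file later in this commit does. The candidate unroll steps are illustrative values, not defaults from this commit.

from tvm.meta_schedule.schedule_rule.parallel_vectorize_unroll import (
    ParallelizeVectorizeUnroll,
)

# Up to 16 jobs per CPU core, vectorize up to 32 lanes, and let the tuner
# sample the maximum unroll step from the given candidates.
rule = ParallelizeVectorizeUnroll(
    max_jobs_per_core=16,
    max_vectorize_extent=32,
    unroll_max_steps=[0, 16, 64, 512],
    unroll_explicit=True,
)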
src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc (129 additions, 0 deletions)
@@ -0,0 +1,129 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
#include "../utils.h"

namespace tvm {
namespace tir {

bool IsRootBlock(const Schedule& sch, const BlockRV& block_rv) {
  StmtSRef block_sref = sch->GetSRef(block_rv);
  return block_sref->parent == nullptr;
}

}  // namespace tir
}  // namespace tvm

namespace tvm {
namespace meta_schedule {

class ParallelizeVectorizeUnrollNode : public ScheduleRuleNode {
 public:
  // Inherited from ScheduleRuleNode
  void InitializeWithTuneContext(const TuneContext& context) final {
    ICHECK(context->target.defined());
    if (this->max_jobs_per_core != -1) {
      Target target = context->target.value();
      this->max_parallel_extent_ = GetTargetNumCores(target) * max_jobs_per_core;
    }
  }

  // Inherited from ScheduleRuleNode
  Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& root_rv) {
    // Currently only mark the root block with annotations.
    if (!tir::IsRootBlock(sch, root_rv)) {
      return {sch};
    }

    // Parallelization
    if (max_jobs_per_core != -1) {
      sch->Annotate(root_rv, tir::attr::meta_schedule_parallel,
                    Integer(this->max_parallel_extent_));
    }
    // Vectorization
    if (max_vectorize_extent != -1) {
      sch->Annotate(root_rv, tir::attr::meta_schedule_vectorize, Integer(max_vectorize_extent));
    }
    // Unroll
    if (!unroll_max_steps.empty()) {
      int n = unroll_max_steps.size();
      double prob = 1.0 / n;
      Array<FloatImm> probs(n, FloatImm(DataType::Float(64), prob));
      PrimExpr max_step = sch->SampleCategorical(unroll_max_steps, probs);
      if (unroll_explicit) {
        sch->Annotate(root_rv, tir::attr::meta_schedule_unroll_explicit, max_step);
      } else {
        sch->Annotate(root_rv, tir::attr::meta_schedule_unroll_implicit, max_step);
      }
    }
    return {sch};
  }

 public:
  /*!
   * \brief The maximum number of jobs to be launched per CPU core. It sets the
   * upper limit of CPU parallelism, i.e. `num_cores * max_jobs_per_core`. Use -1 to disable
   * parallelism.
   */
  int64_t max_jobs_per_core;
  /*!
   * \brief The maximum extent to be vectorized.
   * It sets the upper limit of the hardware target vectorization. Use -1 to disable vectorization.
   */
  int max_vectorize_extent;
  /*!
   * \brief The options of the maximum number of unroll steps to be done.
   * Use an empty array to disable unroll.
   */
  Array<Integer> unroll_max_steps;
  /*! \brief Whether to explicitly unroll the loop, or just add an "unroll" pragma. */
  bool unroll_explicit;
  /*! \brief The number of maximum available jobs in CPU. */
  int64_t max_parallel_extent_;

  void VisitAttrs(tvm::AttrVisitor* v) {
    v->Visit("max_jobs_per_core", &max_jobs_per_core);
    v->Visit("max_vectorize_extent", &max_vectorize_extent);
    v->Visit("unroll_max_steps", &unroll_max_steps);
    v->Visit("unroll_explicit", &unroll_explicit);
    // `max_parallel_extent_` is not visited
  }

  static constexpr const char* _type_key = "meta_schedule.ParallelizeVectorizeUnroll";
  TVM_DECLARE_FINAL_OBJECT_INFO(ParallelizeVectorizeUnrollNode, ScheduleRuleNode);
};

ScheduleRule ScheduleRule::ParallelizeVectorizeUnroll(int max_jobs_per_core,
                                                      int max_vectorize_extent,
                                                      Array<Integer> unroll_max_steps,
                                                      bool unroll_explicit) {
  ObjectPtr<ParallelizeVectorizeUnrollNode> n = make_object<ParallelizeVectorizeUnrollNode>();
  n->max_jobs_per_core = max_jobs_per_core;
  n->max_vectorize_extent = max_vectorize_extent;
  n->unroll_max_steps = unroll_max_steps;
  n->unroll_explicit = unroll_explicit;
  n->max_parallel_extent_ = -1;
  return ScheduleRule(n);
}

TVM_REGISTER_NODE_TYPE(ParallelizeVectorizeUnrollNode);
TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleParallelizeVectorizeUnroll")
    .set_body_typed(ScheduleRule::ParallelizeVectorizeUnroll);

}  // namespace meta_schedule
}  // namespace tvm
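
To make the Apply logic above concrete, here is a hedged Python sketch that replays the same three annotations by hand on a tir.Schedule, using the public `annotate` and `sample_categorical` schedule primitives. The toy `add_one` workload and the concrete numbers (32 cores * 16 jobs per core = 512, four unroll candidates with uniform probability 0.25) are illustrative assumptions, not part of this commit.

from tvm import tir
from tvm.script import tir as T


@T.prim_func
def add_one(a: T.handle, b: T.handle) -> None:
    A = T.match_buffer(a, (1024,), "float32")
    B = T.match_buffer(b, (1024,), "float32")
    for i in T.serial(1024):
        with T.block("add_one"):
            vi = T.axis.spatial(1024, i)
            B[vi] = A[vi] + 1.0


sch = tir.Schedule(add_one)
root = sch.get_block("root")  # the implicit root block wrapping the loop nest
# Parallelization: annotate num_cores * max_jobs_per_core (32 * 16 = 512 here)
sch.annotate(root, ann_key="meta_schedule.parallel", ann_val=512)
# Vectorization: annotate the maximum vectorization extent
sch.annotate(root, ann_key="meta_schedule.vectorize", ann_val=32)
# Unroll: sample one candidate uniformly, then record the sampled choice
max_step = sch.sample_categorical(
    candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25]
)
sch.annotate(root, ann_key="meta_schedule.unroll_explicit", ann_val=max_step)
print(sch.trace)  # a trace in the same shape as the one checked in the test below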
tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py (105 additions, 0 deletions)
@@ -0,0 +1,105 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
import tvm
from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
from tvm.meta_schedule.testing.schedule_rule import parallel_vectorize_unroll
from tvm.meta_schedule.testing.space_generation import check_trace
from tvm.meta_schedule.tune_context import TuneContext
from tvm.script import tir as T
from tvm.target import Target

# fmt: off
# pylint: disable=no-member,invalid-name,unused-variable,no-self-argument,line-too-long,chained-comparison,not-callable,too-many-nested-blocks

@tvm.script.ir_module
class Matmul:
    @T.prim_func
    def main(a: T.handle, b: T.handle, c: T.handle) -> None:
        T.func_attr({"global_symbol": "main"})
        A = T.match_buffer(a, (1024, 1024), "float32")
        B = T.match_buffer(b, (1024, 1024), "float32")
        C = T.match_buffer(c, (1024, 1024), "float32")
        for i, j, k in T.grid(1024, 1024, 1024):
            with T.block("matmul"):
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                with T.init():
                    C[vi, vj] = 0.0
                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]


@tvm.script.ir_module
class ParallelizeVectorizeUnroll:
    @T.prim_func
    def main(a: T.handle, b: T.handle, c: T.handle) -> None:
        T.func_attr({"global_symbol": "main"})
        A = T.match_buffer(a, (1024, 1024), "float32")
        B = T.match_buffer(b, (1024, 1024), "float32")
        C = T.match_buffer(c, (1024, 1024), "float32")
        with T.block("root"):
            T.reads([])
            T.writes([])
            T.block_attr({"meta_schedule.parallel": 128, "meta_schedule.vectorize": 16, "meta_schedule.unroll_explicit": 2})
            for i, j, k in T.grid(1024, 1024, 1024):
                with T.block("matmul"):
                    vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                    with T.init():
                        C[vi, vj] = 0.0
                    C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]

# pylint: enable=no-member,invalid-name,unused-variable,no-self-argument,line-too-long,chained-comparison,not-callable,too-many-nested-blocks
# fmt: on


def _create_context(mod, target, rule):
    ctx = TuneContext(
        mod=mod,
        target=target,
        space_generator=PostOrderApply(),
        sch_rules=[rule],
        task_name="test",
    )
    ctx.space_generator.initialize_with_tune_context(ctx)
    for sch_rule in ctx.sch_rules:
        sch_rule.initialize_with_tune_context(ctx)
    return ctx


def test_parallel_vectorize_unroll():
    expected = [
        [
            'b0 = sch.get_block(name="root", func_name="main")',
            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.parallel", ann_val=512)',
            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.vectorize", ann_val=32)',
            "v1 = sch.sample_categorical(candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25])",
            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.unroll_explicit", ann_val=v1)',
        ]
    ]
    mod = Matmul
    target = Target("llvm --num-cores=32")
    ctx = _create_context(
        mod=mod,
        target=target,
        rule=parallel_vectorize_unroll(target=target),
    )
    spaces = ctx.space_generator.generate_design_space(mod=mod)
    assert len(spaces) == 1
    check_trace(spaces, expected)


if __name__ == "__main__":
    test_parallel_vectorize_unroll()
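
As a quick arithmetic check on the expected trace above, assuming the `parallel_vectorize_unroll` testing helper (whose body is not part of this diff) forwards max_jobs_per_core=16 and the unroll candidates [0, 16, 64, 512]:

num_cores = 32          # from Target("llvm --num-cores=32")
max_jobs_per_core = 16  # assumed value forwarded by the testing helper
candidates = [0, 16, 64, 512]

assert num_cores * max_jobs_per_core == 512                   # meta_schedule.parallel
assert [1 / len(candidates)] * len(candidates) == [0.25] * 4  # uniform sampling probs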