Commit d76cf77

echuraev authored and ylc committed
[OpenCL] Add vectorization to cuda conv2d_nhwc schedule (apache#8636)
* Add vectorization to cuda conv2d_nhwc schedule. Adding vectorization significantly improved performance: about a 6-7x boost.
* Apply comment
* Move schedule to topi/gpu dir
* Add vectorization to inner loop
* Update values of vectorization factor
1 parent 980d4a8 commit d76cf77

File tree: 7 files changed, +62 -32 lines changed


python/tvm/relay/op/strategy/cuda.py
Lines changed: 3 additions & 3 deletions

@@ -183,9 +183,9 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
         elif layout == "NHWC":
             assert kernel_layout == "HWIO"
             strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
-                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
-                name="conv2d_nhwc.cuda",
+                wrap_compute_conv2d(topi.gpu.conv2d_nhwc),
+                wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc),
+                name="conv2d_nhwc.gpu",
             )

             N, H, W, _ = get_const_tuple(data.shape)

python/tvm/relay/op/strategy/rocm.py
Lines changed: 3 additions & 3 deletions

@@ -69,9 +69,9 @@ def conv2d_strategy_rocm(attrs, inputs, out_type, target):
         elif layout == "NHWC":
             assert kernel_layout == "HWIO"
             strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
-                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
-                name="conv2d_nhwc.cuda",
+                wrap_compute_conv2d(topi.gpu.conv2d_nhwc),
+                wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc),
+                name="conv2d_nhwc.gpu",
             )
             N, H, W, _ = get_const_tuple(data.shape)
             KH, KW, CI, CO = get_const_tuple(kernel.shape)
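Both strategy hooks now route NHWC conv2d through the renamed conv2d_nhwc.gpu implementation. A minimal sketch of exercising that dispatch path end to end through Relay (shapes are illustrative, and a CUDA-enabled TVM build is assumed):

```python
import tvm
from tvm import relay

# NHWC data with an HWIO kernel: the layout combination rewired above.
data = relay.var("data", shape=(1, 56, 56, 64), dtype="float32")
weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="float32")
out = relay.nn.conv2d(
    data,
    weight,
    channels=64,
    kernel_size=(3, 3),
    padding=(1, 1),
    data_layout="NHWC",
    kernel_layout="HWIO",
)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], out))

# Building for "cuda" (or "rocm") now selects the compute/schedule pair
# registered under "conv2d_nhwc.gpu" in python/tvm/topi/gpu/conv2d.py.
lib = relay.build(mod, target="cuda")
```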

python/tvm/topi/cuda/conv2d.py
Lines changed: 0 additions & 21 deletions

@@ -25,7 +25,6 @@
 from ..nn.utils import get_pad_tuple
 from ..utils import get_const_tuple, traverse_inline
 from .conv2d_direct import schedule_direct_cuda
-from .conv2d_nhwc import schedule_conv2d_nhwc_direct


 @autotvm.register_topi_compute("conv2d_nchw.cuda")
@@ -48,26 +47,6 @@ def _callback(op):
     return s


-@autotvm.register_topi_compute("conv2d_nhwc.cuda")
-def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
-    """Compute conv2d with NHWC layout"""
-    return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc.cuda")
-def schedule_conv2d_nhwc(cfg, outs):
-    """Create the schedule for conv2d_nhwc"""
-    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if op.tag == "conv2d_nhwc":
-            schedule_conv2d_nhwc_direct(cfg, s, op.output(0))
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
 @autotvm.register_topi_compute("conv2d_cudnn.cuda")
 def conv2d_cudnn(
     cfg, data, kernel, strides, padding, dilation, groups=1, layout="NCHW", out_dtype="float32"

python/tvm/topi/gpu/__init__.py
Lines changed: 1 addition & 0 deletions

@@ -18,3 +18,4 @@
 # pylint: disable=redefined-builtin, wildcard-import
 """GPU specific declaration and schedules."""
 from .dense import *
+from .conv2d import *
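With this wildcard re-export in place, the new definitions become reachable under the topi.gpu namespace, which is how the strategy files above refer to them:

```python
from tvm import topi

# Both names are defined in python/tvm/topi/gpu/conv2d.py and exposed
# through the wildcard import added above.
print(topi.gpu.conv2d_nhwc)
print(topi.gpu.schedule_conv2d_nhwc)
```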

python/tvm/topi/gpu/conv2d.py
Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument
+"""Schedule for conv2d operator"""
+from tvm import te, autotvm
+
+from .. import nn
+from ..utils import traverse_inline
+from .conv2d_nhwc import schedule_conv2d_nhwc_direct
+
+
+@autotvm.register_topi_compute("conv2d_nhwc.gpu")
+def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
+    """Compute conv2d with NHWC layout"""
+    return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
+
+
+@autotvm.register_topi_schedule("conv2d_nhwc.gpu")
+def schedule_conv2d_nhwc(cfg, outs):
+    """Create the schedule for conv2d_nhwc"""
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    s = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == "conv2d_nhwc":
+            schedule_conv2d_nhwc_direct(cfg, s, op.output(0))
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
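For reference, a minimal sketch of driving these entry points directly with TE placeholders (shapes, strides, and the cuda target are illustrative; with no tuning log available, autotvm falls back to a default config):

```python
import tvm
from tvm import te, topi

# Illustrative NHWC input and HWIO kernel.
data = te.placeholder((1, 56, 56, 64), name="data")
kernel = te.placeholder((3, 3, 64, 64), name="kernel")

with tvm.target.Target("cuda"):
    # The autotvm wrappers inject cfg; callers pass only the op arguments.
    conv = topi.gpu.conv2d_nhwc(data, kernel, strides=1, padding=1, dilation=1)
    s = topi.gpu.schedule_conv2d_nhwc([conv])

# Inspect the scheduled IR, including the vectorized inner loops.
print(tvm.lower(s, [data, kernel, conv], simple_mode=True))
```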

python/tvm/topi/cuda/conv2d_nhwc.py renamed to python/tvm/topi/gpu/conv2d_nhwc.py
Lines changed: 11 additions & 4 deletions

@@ -54,12 +54,13 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
     cfg.define_knob("vthread_n", [1] if dynamic_batch else [1, 2])
     cfg.define_knob("vthread_c", [1, 2])
     cfg.define_knob("step", [16, 3, 32, 64])
+    cfg.define_knob("vectorize", [1, 2, 4, 8])

     # fallback support
     target = tvm.target.Target.current()
     if cfg.is_fallback:
         ref_log = autotvm.tophub.load_reference_log(
-            target.kind.name, target.model, "conv2d_nhwc.cuda"
+            target.kind.name, target.model, "conv2d_nhwc.gpu"
         )
         cfg.fallback_with_reference_log(ref_log)

@@ -70,6 +71,7 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
     vthread_n = cfg["vthread_n"].val
     vthread_c = cfg["vthread_c"].val
     step = cfg["step"].val
+    vec_factor = cfg["vectorize"].val
     block_factor_c = tile_c * num_thread_c * vthread_c

     offset = 8
@@ -85,15 +87,17 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
     thread_yz = te.thread_axis((0, vthread_n), "vthread", name="vy")

     # Schedule for output
-    ni, hi, wi, fi = s[output].op.axis
-    bx = s[output].fuse(hi, wi)
+    ni, _, wi, fi = s[output].op.axis
+    bx = wi
+    fi, vec = s[output].split(fi, factor=vec_factor)
+    s[output].vectorize(vec)
     tx, fi = s[output].split(fi, factor=tile_c)
     txz, tx = s[output].split(tx, factor=num_thread_c)
     bz, txz = s[output].split(txz, factor=vthread_c)
     ty, ni = s[output].split(ni, factor=tile_n)
     tyz, ty = s[output].split(ty, factor=num_thread_n)
     by, tyz = s[output].split(tyz, factor=vthread_n)
-    s[output].reorder(bx, by, bz, tyz, txz, ty, tx, ni, fi)
+    s[output].reorder(bx, by, bz, tyz, txz, ty, tx, ni, fi, vec)
     s[output].bind(bz, block_z)
     s[output].bind(by, block_y)
     s[output].bind(bx, block_x)
@@ -106,6 +110,7 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
     ni, yi, xi, fi = s[OL].op.axis
     ry, rx, rc = s[OL].op.reduce_axis
     rco, rci = s[OL].split(rc, factor=step)
+    s[OL].vectorize(fi)
     s[OL].reorder(rco, ry, rx, rci, ni, fi)

     s[AA].compute_at(s[OL], rx)
@@ -125,6 +130,8 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
     _, _, ic, o = s[WW].op.axis
     t = s[WW].fuse(ic, o)
     s[WW].storage_align(ic, W_align - 1, W_align)
+    t, vec = s[WW].split(t, factor=vec_factor)
+    s[WW].vectorize(vec)
     ty, tx = s[WW].split(t, factor=num_thread_c)
     _, ty = s[WW].split(ty, factor=num_thread_n)
     s[WW].bind(tx, thread_x)
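The core of the change is the split-then-vectorize pattern applied to the innermost channel axis. A standalone sketch of that pattern on a toy elementwise op (the names and the factor of 4 are illustrative, not from this commit):

```python
import tvm
from tvm import te

# Toy 1-D op to demonstrate split + vectorize in isolation.
n = 1024
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] * 2.0, name="B")

s = te.create_schedule(B.op)
# Split the axis so the inner part has a fixed length, then mark it as
# vector lanes, mirroring s[output].split(fi, factor=vec_factor) followed
# by s[output].vectorize(vec) in the schedule above.
outer, vec = s[B].split(B.op.axis[0], factor=4)
s[B].vectorize(vec)

# The lowered IR now contains ramp()-based vector loads and stores.
print(tvm.lower(s, [A, B], simple_mode=True))
```

On OpenCL backends these vector axes lower to native vector types (e.g. float4), which is consistent with the large speedup reported in the commit message.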

tests/python/topi/python/test_topi_conv2d_nhwc.py
Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@

 _conv2d_nhwc_implement = {
     "generic": (topi.nn.conv2d_nhwc, topi.generic.schedule_conv2d_nhwc),
-    "gpu": (topi.cuda.conv2d_nhwc, topi.cuda.schedule_conv2d_nhwc),
+    "gpu": (topi.gpu.conv2d_nhwc, topi.gpu.schedule_conv2d_nhwc),
     "cpu": (topi.nn.conv2d_nhwc, topi.x86.schedule_conv2d_nhwc),
     "arm_cpu": (
         topi.arm_cpu.conv2d_nhwc_spatial_pack,
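The tests resolve this table with tvm.topi.testing.dispatch, which walks the target's keys until one matches an entry (a sketch of that lookup, assuming the dispatch helper used elsewhere in these topi tests):

```python
import tvm
import tvm.topi.testing

# A CUDA target also carries the "gpu" key, so this lookup resolves to
# the topi.gpu entry updated in this commit.
target = tvm.target.Target("cuda")
fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv2d_nhwc_implement)
```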
