Skip to content

Commit

Permalink
[OpenCL] Add vectorization to cuda conv2d_nhwc schedule (#8636)
Browse files Browse the repository at this point in the history
* Add vectorization to cuda conv2d_nhwc schedule

Adding vectorization significantly improved performance. About 6-7x
boost.

* Apply comment

* Move schedule to topi/gpu dir

* Add vectorization to inner loop

* Update values of vectorization factor
  • Loading branch information
echuraev authored Sep 30, 2021
1 parent dda75fe commit 719d2f6
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 32 deletions.
6 changes: 3 additions & 3 deletions python/tvm/relay/op/strategy/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,9 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
elif layout == "NHWC":
assert kernel_layout == "HWIO"
strategy.add_implementation(
wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
name="conv2d_nhwc.cuda",
wrap_compute_conv2d(topi.gpu.conv2d_nhwc),
wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc),
name="conv2d_nhwc.gpu",
)

N, H, W, _ = get_const_tuple(data.shape)
Expand Down
6 changes: 3 additions & 3 deletions python/tvm/relay/op/strategy/rocm.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ def conv2d_strategy_rocm(attrs, inputs, out_type, target):
elif layout == "NHWC":
assert kernel_layout == "HWIO"
strategy.add_implementation(
wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
name="conv2d_nhwc.cuda",
wrap_compute_conv2d(topi.gpu.conv2d_nhwc),
wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc),
name="conv2d_nhwc.gpu",
)
N, H, W, _ = get_const_tuple(data.shape)
KH, KW, CI, CO = get_const_tuple(kernel.shape)
Expand Down
21 changes: 0 additions & 21 deletions python/tvm/topi/cuda/conv2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from ..nn.utils import get_pad_tuple
from ..utils import get_const_tuple, traverse_inline
from .conv2d_direct import schedule_direct_cuda
from .conv2d_nhwc import schedule_conv2d_nhwc_direct


@autotvm.register_topi_compute("conv2d_nchw.cuda")
Expand All @@ -48,26 +47,6 @@ def _callback(op):
return s


@autotvm.register_topi_compute("conv2d_nhwc.cuda")
def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
"""Compute conv2d with NHWC layout"""
return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)


@autotvm.register_topi_schedule("conv2d_nhwc.cuda")
def schedule_conv2d_nhwc(cfg, outs):
"""Create the schedule for conv2d_nhwc"""
outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
s = te.create_schedule([x.op for x in outs])

def _callback(op):
if op.tag == "conv2d_nhwc":
schedule_conv2d_nhwc_direct(cfg, s, op.output(0))

traverse_inline(s, outs[0].op, _callback)
return s


@autotvm.register_topi_compute("conv2d_cudnn.cuda")
def conv2d_cudnn(
cfg, data, kernel, strides, padding, dilation, groups=1, layout="NCHW", out_dtype="float32"
Expand Down
1 change: 1 addition & 0 deletions python/tvm/topi/gpu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@
# pylint: disable=redefined-builtin, wildcard-import
"""GPU specific declaration and schedules."""
from .dense import *
from .conv2d import *
43 changes: 43 additions & 0 deletions python/tvm/topi/gpu/conv2d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, unused-argument
"""Schedule for conv2d operator"""
from tvm import te, autotvm

from .. import nn
from ..utils import traverse_inline
from .conv2d_nhwc import schedule_conv2d_nhwc_direct


@autotvm.register_topi_compute("conv2d_nhwc.gpu")
def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
"""Compute conv2d with NHWC layout"""
return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)


@autotvm.register_topi_schedule("conv2d_nhwc.gpu")
def schedule_conv2d_nhwc(cfg, outs):
"""Create the schedule for conv2d_nhwc"""
outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
s = te.create_schedule([x.op for x in outs])

def _callback(op):
if op.tag == "conv2d_nhwc":
schedule_conv2d_nhwc_direct(cfg, s, op.output(0))

traverse_inline(s, outs[0].op, _callback)
return s
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,13 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
cfg.define_knob("vthread_n", [1] if dynamic_batch else [1, 2])
cfg.define_knob("vthread_c", [1, 2])
cfg.define_knob("step", [16, 3, 32, 64])
cfg.define_knob("vectorize", [1, 2, 4, 8])

# fallback support
target = tvm.target.Target.current()
if cfg.is_fallback:
ref_log = autotvm.tophub.load_reference_log(
target.kind.name, target.model, "conv2d_nhwc.cuda"
target.kind.name, target.model, "conv2d_nhwc.gpu"
)
cfg.fallback_with_reference_log(ref_log)

Expand All @@ -70,6 +71,7 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
vthread_n = cfg["vthread_n"].val
vthread_c = cfg["vthread_c"].val
step = cfg["step"].val
vec_factor = cfg["vectorize"].val
block_factor_c = tile_c * num_thread_c * vthread_c

offset = 8
Expand All @@ -85,15 +87,17 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
thread_yz = te.thread_axis((0, vthread_n), "vthread", name="vy")

# Schedule for output
ni, hi, wi, fi = s[output].op.axis
bx = s[output].fuse(hi, wi)
ni, _, wi, fi = s[output].op.axis
bx = wi
fi, vec = s[output].split(fi, factor=vec_factor)
s[output].vectorize(vec)
tx, fi = s[output].split(fi, factor=tile_c)
txz, tx = s[output].split(tx, factor=num_thread_c)
bz, txz = s[output].split(txz, factor=vthread_c)
ty, ni = s[output].split(ni, factor=tile_n)
tyz, ty = s[output].split(ty, factor=num_thread_n)
by, tyz = s[output].split(tyz, factor=vthread_n)
s[output].reorder(bx, by, bz, tyz, txz, ty, tx, ni, fi)
s[output].reorder(bx, by, bz, tyz, txz, ty, tx, ni, fi, vec)
s[output].bind(bz, block_z)
s[output].bind(by, block_y)
s[output].bind(bx, block_x)
Expand All @@ -106,6 +110,7 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
ni, yi, xi, fi = s[OL].op.axis
ry, rx, rc = s[OL].op.reduce_axis
rco, rci = s[OL].split(rc, factor=step)
s[OL].vectorize(fi)
s[OL].reorder(rco, ry, rx, rci, ni, fi)

s[AA].compute_at(s[OL], rx)
Expand All @@ -125,6 +130,8 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
_, _, ic, o = s[WW].op.axis
t = s[WW].fuse(ic, o)
s[WW].storage_align(ic, W_align - 1, W_align)
t, vec = s[WW].split(t, factor=vec_factor)
s[WW].vectorize(vec)
ty, tx = s[WW].split(t, factor=num_thread_c)
_, ty = s[WW].split(ty, factor=num_thread_n)
s[WW].bind(tx, thread_x)
Expand Down
2 changes: 1 addition & 1 deletion tests/python/topi/python/test_topi_conv2d_nhwc.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

_conv2d_nhwc_implement = {
"generic": (topi.nn.conv2d_nhwc, topi.generic.schedule_conv2d_nhwc),
"gpu": (topi.cuda.conv2d_nhwc, topi.cuda.schedule_conv2d_nhwc),
"gpu": (topi.gpu.conv2d_nhwc, topi.gpu.schedule_conv2d_nhwc),
"cpu": (topi.nn.conv2d_nhwc, topi.x86.schedule_conv2d_nhwc),
"arm_cpu": (
topi.arm_cpu.conv2d_nhwc_spatial_pack,
Expand Down

0 comments on commit 719d2f6

Please sign in to comment.