# the version of each package
PACKAGE_VERSION = {
- 'arm_cpu': "v0.06",
+ 'arm_cpu': "v0.07",
'llvm': "v0.04",
- 'cuda': "v0.08",
+ 'cuda': "v0.09",
'rocm': "v0.05",
'opencl': "v0.04",
'mali': "v0.06",
'intel_graphics': "v0.02",
-
- 'vta': "v0.08",
+ 'vta': "v0.09",
'amd_apu': "v0.01",
}
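# --- illustrative sketch, not part of the diff ---
# Why output_padding exists: a strided convolution maps several input
# widths to the same output width, so the transposed operator needs a
# hint to pick the right inverse shape. Hypothetical helper:
def conv_out_width(i, k, s, p):
    return (i + 2 * p - k) // s + 1

# Widths 35 and 36 both give 18 under k=3, s=2, p=1, so inverting an
# 18-wide output is ambiguous; output_padding in [0, stride) selects
# among the (in - 1) * s - 2 * p + k + output_padding candidates.
assert conv_out_width(35, 3, 2, 1) == conv_out_width(36, 3, 2, 1) == 18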
reg.register_strategy("nn.conv2d_transpose", strategy.conv2d_transpose_strategy)
reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE)
+
@reg.register_legalize("nn.conv2d_transpose")
def legalize_conv2d_transpose(attrs, inputs, types):
"""Legalize conv2d_transpose op.
Layout of the output, by default, out_layout is the same as data_layout
output_padding : Tuple[int], optional
- Additional zero-padding to be added to one side of the output.
+ Used to disambiguate the output shape.
out_dtype : str, optional
Specifies the output data type for mixed precision conv2d.
Layout of the output, by default, out_layout is the same as data_layout
output_padding : Tuple[int], optional
- Additional zero-padding to be added to one side of the output.
+ Used to disambiguate the output shape.
out_dtype : str, optional
Specifies the output data type for mixed precision conv2d.
out_dtype = attrs.out_dtype
out_dtype = (inputs[0].dtype if out_dtype in ("same", "")
else out_dtype)
- out = topi_compute(
- inputs[0], inputs[1], strides, padding, out_dtype)
output_padding = get_const_tuple(attrs.output_padding)
- out = topi.nn.pad(out, [0, 0, 0, 0],
- [0, 0, output_padding[0], output_padding[1]])
+ out = topi_compute(
+ inputs[0], inputs[1], strides, padding, out_dtype, output_padding)
return [out]
return compute_conv2d_transpose
strides = get_const_tuple(attrs.strides)
out_dtype = attrs.out_dtype
out_dtype = (inputs[0].dtype if out_dtype in ("same", "") else out_dtype)
- out = topi_compute(inputs[0], inputs[1], strides, padding, out_dtype)
output_padding = get_const_tuple(attrs.output_padding)
- out = topi.nn.pad(out, [0, 0, 0], [0, 0, output_padding[0]])
+ out = topi_compute(inputs[0], inputs[1], strides, padding, out_dtype, output_padding)
return [out]
return _compute_conv1d_transpose
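# --- sketch: the shape rule the updated tests below assume ---
# With symmetric padding, out = (in - 1) * stride - 2 * pad + kernel + opad;
# a quick check with the hypothetical numbers used by the tests:
def transpose_out(i, k, s, p, opad):
    return (i - 1) * s - 2 * p + k + opad

assert transpose_out(18, 3, 2, 1, 1) == 36  # hence oshape (1, 10, 36, 36)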
def test_conv2d_transpose_nchw_run():
dshape = (1, 3, 18, 18)
kshape = (3, 10, 3, 3)
- oshape = (1, 10, 37, 37)
+ oshape = (1, 10, 36, 36)
x = relay.var("x", shape=dshape)
w = relay.var("w")
y = relay.nn.conv2d_transpose(x, w,
channels=10, kernel_size=(3,3), strides=(2,2),
- padding=(1,1), output_padding=(2, 2))
+ padding=(1,1), output_padding=(1, 1))
func = relay.Function([x, w], y)
dtype = "float32"
data = np.random.uniform(size=dshape).astype(dtype)
kernel = np.random.uniform(size=kshape).astype(dtype)
- c_np = topi.testing.conv2d_transpose_nchw_python(
- data, kernel, 2, 1)
- d_np = np.zeros(shape=oshape)
- d_np[:,:,0:c_np.shape[2],0:c_np.shape[3]] = c_np
- ref_res = d_np
+ ref_res = topi.testing.conv2d_transpose_nchw_python(
+ data, kernel, 2, 1, (1, 1))
for target, ctx in ctx_list():
intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
def test_conv2d_transpose_nhwc_run():
dshape_nhwc = (1, 18, 18, 3)
kshape_hwoi = (3, 3, 10, 3)
- oshape_nhwc = (1, 37, 37, 10)
+ oshape_nhwc = (1, 36, 36, 10)
x = relay.var("x", shape=dshape_nhwc)
w = relay.var("w")
# kshape and kernel_layout should have swapped IO.
# kshape is HWOI and kernel_layout is HWIO
y = relay.nn.conv2d_transpose(x, w,
channels=10, kernel_size=(3, 3), strides=(2, 2),
- padding=(1, 1), output_padding=(2, 2),
+ padding=(1, 1), output_padding=(1, 1),
data_layout="NHWC", kernel_layout="HWIO")
func = relay.Function([x, w], y)
dtype = "float32"
data = np.random.uniform(size=dshape_nhwc).astype(dtype)
kernel = np.random.uniform(size=kshape_hwoi).astype(dtype)
# use true kshape layout here - HWOI
- c_np = topi.testing.conv2d_transpose_nhwc_python(data, kernel, 'HWOI', 2, 1)
- d_np = np.zeros(shape=oshape_nhwc)
- d_np[:,0:c_np.shape[1],0:c_np.shape[2],:] = c_np
+
+ ref_res = topi.testing.conv2d_transpose_nhwc_python(data, kernel, 'HWOI',
+ 2, 1, output_padding=(1, 1))
+
+ for target, ctx in ctx_list():
+ intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+ op_res1 = intrp1.evaluate(func)(data, kernel)
+ tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
def test_conv1d_transpose_ncw_run():
dshape = (1, 3, 18)
kshape = (3, 10, 3)
- oshape = (1, 10, 37)
+ oshape = (1, 10, 36)
x = relay.var("x", shape=dshape)
w = relay.var("w")
y = relay.nn.conv1d_transpose(x, w,
channels=10, kernel_size=(3,), strides=(2,),
- padding=(1,), output_padding=(2,))
+ padding=(1,), output_padding=(1,))
func = relay.Function([x, w], y)
dtype = "float32"
data = np.random.uniform(size=dshape).astype(dtype)
kernel = np.random.uniform(size=kshape).astype(dtype)
- c_np = topi.testing.conv1d_transpose_ncw_python(
- data, kernel, 2, 1)
- d_np = np.zeros(shape=oshape)
- d_np[:,:,0:c_np.shape[2]] = c_np
- ref_res = d_np
+ ref_res = topi.testing.conv1d_transpose_ncw_python(
+ data, kernel, 2, 1, output_padding=(1,))
for target, ctx in ctx_list():
intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
from ..util import get_const_tuple, traverse_inline
from .conv2d_spatial_pack import schedule_conv2d_spatial_pack_nchw
+
+
@autotvm.register_topi_compute("conv2d_transpose_nchw.arm_cpu")
-def conv2d_transpose_nchw(cfg, Input, Filter, strides, padding, out_dtype):
+def conv2d_transpose_nchw(cfg, Input, Filter, strides, padding, out_dtype,
+ output_padding):
"""Transposed 2D convolution nchw forward operator.
Parameters
out_dtype: str
The output data type. This is used for mixed precision.
+ output_padding : tuple of ints
+ Used to get the right output shape when computing gradients.
+
Returns
-------
Output : tvm.te.Tensor
4-D with shape [batch, out_channel, out_height, out_width]
"""
- return _decl_spatial_pack(cfg, Input, Filter, strides, padding, "NCHW", out_dtype, 2)
+ return _decl_spatial_pack(cfg, Input, Filter, strides, padding, "NCHW", out_dtype, 2,
+ output_padding)
-def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
+def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile,
+ output_padding):
assert layout == "NCHW", "Only support NCHW"
out_dtype = out_dtype or data.dtype
N, CI, IH, IW = get_const_tuple(data.shape)
_, CO, KH, KW = get_const_tuple(kernel.shape)
+ HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+ opad_h, opad_w = output_padding
+ assert opad_h < HSTR and opad_w < WSTR
pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
- bpad_top, bpad_bottom = KH - 1 - pad_top, KH - 1 - pad_bottom
- bpad_left, bpad_right = KW - 1 - pad_left, KW - 1 - pad_right
- HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+ bpad_top, bpad_bottom = KH - 1 - pad_top, KH - 1 - pad_bottom + opad_h
+ bpad_left, bpad_right = KW - 1 - pad_left, KW - 1 - pad_right + opad_w
- OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH
- OW = (IW - 1) * WSTR - pad_left - pad_right + KW
+ OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH + opad_h
+ OW = (IW - 1) * WSTR - pad_left - pad_right + KW + opad_w
dilated_input = dilate(data, [1, 1, HSTR, WSTR])
data_pad = pad(dilated_input, [0, 0, bpad_top, bpad_left], [0, 0, bpad_bottom, bpad_right])
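# Sanity check of the back-padding arithmetic above (hypothetical sizes):
# with IH=18, KH=3, HSTR=2, pad=(1, 1) and opad_h=1 the dilated input has
# (18 - 1) * 2 + 1 = 35 rows, bpad_top = 1 and bpad_bottom = 2, so the
# padded height is 38 and a stride-1 KHxKW window yields 38 - 3 + 1 = 36,
# matching the OH formula above.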
from ..util import get_const_tuple, traverse_inline
@autotvm.task.register_topi_compute("conv1d_transpose_nchw.cuda")
-def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype):
+def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype,
+ output_padding):
"""Transposed 1D convolution ncw forward operator.
Parameters
string: ['VALID', 'SAME']
out_dtype: str
The output type. This is used in mixed precision
+ output_padding : int or a tuple of one int
+ Used to disambiguate the output shape.
Returns
-------
"""
if isinstance(stride, (tuple, list)):
stride = stride[0]
+ if isinstance(output_padding, (tuple, list)):
+ output_padding = output_padding[0]
+ assert output_padding < stride
cfg.stride = stride
+ cfg.output_padding = output_padding
batch, inp_channels, inp_width = get_const_tuple(data.shape)
_, out_channels, kernel_size = get_const_tuple(kernel.shape)
pad_left, pad_right = nn.get_pad_tuple1d(padding, kernel_size)
- out_width = (inp_width - 1) * stride + kernel_size - pad_left - pad_right
+ out_width = (inp_width - 1) * stride + kernel_size - pad_left - pad_right + output_padding
pad_left = kernel_size - 1 - pad_left
- pad_right = kernel_size - 1 - pad_right
+ pad_right = kernel_size - 1 - pad_right + output_padding
dilated_width = stride * (inp_width - 1) + 1
data = te.compute(
(batch, inp_channels, pad_left + dilated_width + pad_right),
from ..util import get_const_tuple, traverse_inline
+
@autotvm.register_topi_compute("conv2d_transpose_nchw.cuda")
-def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype):
+def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype,
+ output_padding):
"""Transposed 2D convolution nchw forward operator.
Parameters
Padding size, or ['VALID', 'SAME']
out_dtype: str
The output type. This is used in mixed precision
+ output_padding : tuple of two ints
+ Used to disambiguate the output shape.
Returns
-------
batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape)
_, out_channels, kernel_height, kernel_width = get_const_tuple(kernel.shape)
stride_height, stride_width = stride
+ outpad_height, outpad_width = output_padding
+ assert outpad_height < stride_height and outpad_width < stride_width
cfg.stride = stride
pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple(
padding, (kernel_height, kernel_width))
out_width = (inp_width - 1) * stride_width + \
- kernel_width - pad_left - pad_right
+ kernel_width - pad_left - pad_right + outpad_width
pad_left = kernel_width - 1 - pad_left
- pad_right = kernel_width - 1 - pad_right
+ pad_right = kernel_width - 1 - pad_right + outpad_width
dilated_width = stride_width * (inp_width - 1) + 1
out_height = (inp_height - 1) * stride_height + \
- kernel_height - pad_top - pad_bottom
+ kernel_height - pad_top - pad_bottom + outpad_height
pad_top = kernel_height - 1 - pad_top
- pad_bottom = kernel_height - 1 - pad_bottom
+ pad_bottom = kernel_height - 1 - pad_bottom + outpad_height
dilated_height = stride_height * (inp_height - 1) + 1
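# Note: pad_right/pad_bottom must carry the output padding so the padded,
# dilated data is large enough for out_width/out_height; e.g. inp_width=18,
# kernel_width=3, stride_width=2, padding 1, outpad_width=1 gives a padded
# width of 35 + 1 + 2 = 38 and a stride-1 window count of 38 - 3 + 1 = 36,
# which equals out_width.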
from .util import get_pad_tuple1d
-def conv1d_transpose_ncw(data, kernel, stride, padding, out_dtype):
+def conv1d_transpose_ncw(data, kernel, stride, padding, out_dtype,
+ output_padding):
"""Transposed 1D convolution ncw forward operator.
Parameters
out_dtype : str
The output data type. This is used for mixed precision.
+ output_padding : int or a tuple of one int
+ Used to recover the actual output shape when more than one is
+ possible. Must be smaller than the stride.
+
Returns
-------
output : tvm.te.Tensor
3-D with shape [batch, out_channel, out_width]
+
"""
# dilate and pad
if isinstance(stride, (tuple, list)):
stride = stride[0]
+ if isinstance(output_padding, (tuple, list)):
+ output_padding = output_padding[0]
batch, channels_in, data_width = data.shape
_, channels_out, kernel_width = kernel.shape
+ assert output_padding < stride
channels_out = simplify(channels_out)
data = dilate(data, [1, 1, stride], name='data_dilate')
pad_left, pad_right = get_pad_tuple1d(padding, (kernel_width,))
pad_left = kernel_width - 1 - pad_left
- pad_right = kernel_width - 1 - pad_right
+ pad_right = kernel_width - 1 - pad_right + output_padding
data = pad(data, [0, 0, pad_left], [0, 0, pad_right], name='data_pad')
# transpose kernel, switch kernel layout to IOW
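# A hedged numpy sketch of that kernel transform (the actual te.compute is
# elided here; this is the usual transposed-conv flip, for intuition only):
import numpy as np
kernel_iow = np.arange(24.0).reshape(3, 2, 4)            # (in_c, out_c, kw)
kernel_oiw = kernel_iow.transpose(1, 0, 2)[:, :, ::-1]   # (out_c, in_c, kw)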
from ..util import simplify
-def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype):
+
+def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype,
+ output_padding):
"""Transposed 2D convolution nchw forward operator.
Parameters
out_dtype : str
The output data type. This is used for mixed precision.
+ output_padding : tuple of ints
+ Used to get the right output shape for gradients.
+
Returns
-------
Output : tvm.te.Tensor
4-D with shape [batch, out_channel, out_height, out_width]
"""
- return declaration_conv2d_transpose_impl(Input, Filter, strides, padding, out_dtype)
+ return declaration_conv2d_transpose_impl(Input, Filter, strides, padding, out_dtype,
+ output_padding=output_padding)
-def conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype):
+def conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype, output_padding):
"""Preprocess data and kernel to make the compute pattern
of conv2d_transpose the same as conv2d"""
batch, in_c, in_h, in_w = data.shape
_, out_c, filter_h, filter_w = kernel.shape
stride_h, stride_w = strides
+ opad_h, opad_w = output_padding
+ assert opad_h < stride_h and opad_w < stride_w
# dilate data
data_dilate = dilate(data, [1, 1, stride_h, stride_w], name='data_dilate')
# pad data
fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (filter_h, filter_w))
bpad_top = filter_h - 1 - fpad_top
- bpad_bottom = filter_h - 1 - fpad_bottom
+ bpad_bottom = filter_h - 1 - fpad_bottom + opad_h
bpad_left = filter_w - 1 - fpad_left
- bpad_right = filter_w - 1 - fpad_right
+ bpad_right = filter_w - 1 - fpad_right + opad_w
data_pad = pad(data_dilate, \
[0, 0, bpad_top, bpad_left], \
[0, 0, bpad_bottom, bpad_right], \
return data_pad, kernel_transform
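# Minimal numpy model of the dilate-then-pad preprocessing above
# (assumed sizes, 1-D for brevity; not the topi code itself):
import numpy as np

def dilate_row(x, stride):
    # insert stride-1 zeros between neighbouring elements
    out = np.zeros((len(x) - 1) * stride + 1, dtype=x.dtype)
    out[::stride] = x
    return out

row = np.array([1., 2., 3., 4.])
k, p, opad = 3, 1, 1
padded = np.pad(dilate_row(row, 2), (k - 1 - p, k - 1 - p + opad))
# a stride-1 window count over the padded row matches the shape rule:
assert len(padded) - k + 1 == (len(row) - 1) * 2 - 2 * p + k + opad  # == 8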
-def declaration_conv2d_transpose_impl(data, kernel, strides, padding, out_dtype):
+def declaration_conv2d_transpose_impl(data, kernel, strides, padding, out_dtype, output_padding):
"""Implementation of conv2d transpose"""
data_pad, kernel_transform = \
- conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype)
+ conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype, output_padding)
batch, in_c, in_h, in_w = data_pad.shape
out_c, _, filter_h, filter_w = kernel_transform.shape
- stride_h, stride_w = strides
# convolution stage
out_c = simplify(out_c)
- out_h = simplify(in_h - filter_h + 1)
- out_w = simplify(in_w - filter_w + 1)
- dc = te.reduce_axis((0, in_c), name='dc')
- dh = te.reduce_axis((0, filter_h), name='dh')
- dw = te.reduce_axis((0, filter_w), name='dw')
+
+ out_h = simplify(in_h - filter_h + 1 + output_padding[0])
+ out_w = simplify(in_w - filter_w + 1 + output_padding[1])
+ dc = te.reduce_axis((0, in_c), name='dc')
+ dh = te.reduce_axis((0, filter_h), name='dh')
+ dw = te.reduce_axis((0, filter_w), name='dw')
Output = te.compute(
(batch, out_c, out_h, out_w),
import topi
from topi.nn.util import get_pad_tuple1d
-def conv1d_transpose_ncw_python(a_np, w_np, stride, padding):
+def conv1d_transpose_ncw_python(a_np, w_np, stride, padding, output_padding):
"""Transposed 1D convolution operator in NCW layout.
Parameters
tuple of 2 ints for left and right padding, or
['VALID', 'SAME']
+ output_padding : tuple of ints
+ Used to recover the actual output shape in case more than one
+ is possible.
+
Returns
-------
b_np : np.ndarray
3-D with shape [batch, out_channel, out_width]
+
"""
batch, in_c, in_w = a_np.shape
_, out_c, filter_w = w_np.shape
+ opad = output_padding[0]
if isinstance(stride, int):
stride_w = stride
else:
stride_w = stride[0]
+ assert opad < stride_w
fpad_left, fpad_right = get_pad_tuple1d(padding, filter_w)
# dilate stage
dilated_a_np = topi.testing.dilate_python(a_np, [1, 1, stride_w])
# padding stage
bpad_left = filter_w - 1 - fpad_left
- bpad_right = filter_w - 1 - fpad_right
+ bpad_right = filter_w - 1 - fpad_right + opad
padded_a_np = np.zeros((batch, in_c, dilated_a_np.shape[2]+bpad_left+bpad_right))
padded_a_np[:, :, bpad_left:dilated_a_np.shape[2]+bpad_left] = dilated_a_np
# convolution stage
- out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w
+ out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w + opad
b_np = np.zeros((batch, out_c, out_w))
for n in range(batch):
for f in range(out_c):
from topi.nn.util import get_pad_tuple
-def conv2d_transpose_nchw_python(a_np, w_np, stride, padding):
+def conv2d_transpose_nchw_python(a_np, w_np, stride, padding, output_padding):
"""Transposed convolution operator in NCHW layout.
Parameters
padding : int or str
Padding size, or ['VALID', 'SAME']
+ output_padding : int or a list/tuple of two ints
+ Used to disambiguate the output shape.
+
Returns
-------
b_np : np.ndarray
stride_h = stride_w = stride
else:
stride_h, stride_w = stride
+ if isinstance(output_padding, int):
+ opad_h = opad_w = output_padding
+ else:
+ opad_h, opad_w = output_padding
+ assert opad_h < stride_h and opad_w < stride_w
# dilate stage
dilated_a_np = topi.testing.dilate_python(a_np, [1, 1, stride_h, stride_w])
# padding stage
fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (filter_h, filter_w))
bpad_top = filter_h - 1 - fpad_top
- bpad_bottom = filter_h - 1 - fpad_bottom
+ bpad_bottom = filter_h - 1 - fpad_bottom + opad_h
bpad_left = filter_w - 1 - fpad_left
- bpad_right = filter_w - 1 - fpad_right
+ bpad_right = filter_w - 1 - fpad_right + opad_w
padded_a_np = np.zeros((batch, in_c, dilated_a_np.shape[2]+bpad_top+bpad_bottom, \
dilated_a_np.shape[3]+bpad_left+bpad_right))
padded_a_np[:, :, bpad_top:dilated_a_np.shape[2]+bpad_top, \
bpad_left:dilated_a_np.shape[3]+bpad_left] = dilated_a_np
# convolution stage
- out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
- out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w
+ out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h + opad_h
+ out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w + opad_w
b_np = np.zeros((batch, out_c, out_h, out_w))
for n in range(batch):
for f in range(out_c):
return b_np
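# Usage sketch of the updated reference (sizes mirror the Relay test above):
a = np.random.uniform(size=(1, 3, 18, 18)).astype("float32")
w = np.random.uniform(size=(3, 10, 3, 3)).astype("float32")
assert conv2d_transpose_nchw_python(a, w, 2, 1, (1, 1)).shape == (1, 10, 36, 36)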
-def conv2d_transpose_nhwc_python(a_nhwc, weight, weight_format, stride, padding):
+def conv2d_transpose_nhwc_python(a_nhwc, weight, weight_format, stride, padding,
+ output_padding=(0, 0)):
"""Transposed convolution operator in NHWC layout.
Parameters
else:
raise ValueError('Valid weight_formats are HWIO, HWOI, OIHW or IOHW')
- res_nchw = conv2d_transpose_nchw_python(a_nchw, w_iohw, stride, padding)
+ res_nchw = conv2d_transpose_nchw_python(a_nchw, w_iohw, stride, padding,
+ output_padding=output_padding)
res_nhwc = np.transpose(res_nchw, (0, 2, 3, 1))
return res_nhwc
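# The NHWC wrapper keeps the same semantics by round-tripping through NCHW
# (sizes from the NHWC Relay test above):
data_nhwc = np.random.uniform(size=(1, 18, 18, 3)).astype("float32")
kernel_hwoi = np.random.uniform(size=(3, 3, 10, 3)).astype("float32")
out = conv2d_transpose_nhwc_python(data_nhwc, kernel_hwoi, 'HWOI', 2, 1,
                                   output_padding=(1, 1))
assert out.shape == (1, 36, 36, 10)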
from .. import nn
from .conv2d import conv2d_nchw, schedule_conv2d_nchw
-def conv2d_transpose_nchw(data, kernel, strides, padding, out_dtype):
+
+def conv2d_transpose_nchw(data, kernel, strides, padding, out_dtype, output_padding):
data_pad, kernel_transform = \
- nn.conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype)
+ nn.conv2d_transpose_nchw_preprocess(data, kernel, strides, padding,
+ out_dtype, output_padding)
# reuse conv2d_nchw implementation
return conv2d_nchw(data_pad, kernel_transform, strides=(1, 1),
padding=(0, 0), dilation=(1, 1), out_dtype=out_dtype)
+
def schedule_conv2d_transpose_nchw(outs):
"""Create schedule for tensors"""
outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
"gpu": (topi.cuda.conv1d_transpose_ncw, topi.cuda.schedule_conv1d_transpose_ncw)
}
-def verify_conv1d_transpose_ncw(batch, in_channel, in_size, num_filter, kernel, stride, padding):
+def verify_conv1d_transpose_ncw(batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding):
in_width = in_size
A = te.placeholder((batch, in_channel, in_width), name='A')
W = te.placeholder((in_channel, num_filter, kernel), name='W')
def get_ref_data():
a_np = np.random.uniform(size=a_shape).astype(dtype)
w_np = np.random.uniform(size=w_shape).astype(dtype)
- b_np = topi.testing.conv1d_transpose_ncw_python(a_np, w_np, stride, padding)
+ b_np = topi.testing.conv1d_transpose_ncw_python(a_np, w_np, stride, padding, output_padding)
c_np = np.maximum(b_np, 0)
return a_np, w_np, b_np, c_np
return
with tvm.target.create(device):
fcompute, fschedule = topi.testing.dispatch(device, _conv1d_transpose_ncw_implement)
- B = fcompute(A, W, stride, padding, A.dtype)
+ B = fcompute(A, W, stride, padding, A.dtype, output_padding)
C = topi.nn.relu(B)
s1 = fschedule([B])
s2 = fschedule([C])
def test_conv1d_transpose_ncw():
- verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 1, 0)
- verify_conv1d_transpose_ncw(1, 3, 224, 32, 7, 1, 2)
- verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 2, 1)
- verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 2, 0)
- verify_conv1d_transpose_ncw(1, 32, 32, 128, 5, 1, 0)
- verify_conv1d_transpose_ncw(1, 32, 32, 128, 5, 2, 1)
- verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 1, 256)
- verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 2, 256)
- verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 5, 256)
- verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (0,3))
- verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (1,3))
- verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (2,3))
+ verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 1, 0, (0,))
+ verify_conv1d_transpose_ncw(1, 3, 224, 32, 7, 1, 2, (0,))
+ verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 2, 1, (0,))
+ verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 2, 1, (1,))
+ verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 2, 0, (0,))
+ verify_conv1d_transpose_ncw(1, 32, 32, 128, 5, 1, 0, (0,))
+ verify_conv1d_transpose_ncw(1, 32, 32, 128, 5, 2, 1, (0,))
+ verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 1, 256, (0,))
+ verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 2, 256, (0,))
+ verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 5, 256, (0,))
+ verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 5, 256, (3,))
+ verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (0,3), (0,))
+ verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (1,3), (0,))
+ verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (2,3), (0,))
if __name__ == "__main__":
test_conv1d_transpose_ncw()
from common import get_all_backend
+
_conv2d_transpose_nchw_implement = {
"generic": (topi.nn.conv2d_transpose_nchw, topi.generic.schedule_conv2d_transpose_nchw),
"cpu": (topi.x86.conv2d_transpose_nchw, topi.x86.schedule_conv2d_transpose_nchw),
"hls": (topi.nn.conv2d_transpose_nchw, topi.hls.schedule_conv2d_transpose_nchw),
}
-def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding):
+def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding):
in_height, in_width = in_size
kernel_height, kernel_width = kernel
stride_height, stride_width = stride
def get_ref_data():
a_np = np.random.uniform(size=a_shape).astype(dtype)
w_np = np.random.uniform(size=w_shape).astype(dtype)
- b_np = topi.testing.conv2d_transpose_nchw_python(a_np, w_np, stride, padding)
+ b_np = topi.testing.conv2d_transpose_nchw_python(a_np, w_np, stride, padding, output_padding)
c_np = np.maximum(b_np, 0)
return a_np, w_np, b_np, c_np
B = fcompute(A, W,
[stride_height, stride_width],
[pad_top, pad_left, pad_bottom, pad_right],
- A.dtype)
+ A.dtype, output_padding)
C = topi.nn.relu(B)
s1 = fschedule([B])
s2 = fschedule([C])
def test_conv2d_transpose_nchw():
- verify_conv2d_transpose_nchw(1, 3, (224, 224), 1, (1, 1), (1, 1), (0, 0, 0, 0))
- verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0))
- verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (3, 3), (0, 0, 0, 0))
- verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0))
- verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (2, 2), (1, 1, 1, 1))
- verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (2, 2), (2, 2), (0, 0, 0, 0))
- verify_conv2d_transpose_nchw(1, 32, (32, 32), 128, (5, 5), (1, 1), (0, 0, 0, 0))
- verify_conv2d_transpose_nchw(1, 32, (32, 32), 128, (5, 5), (2, 2), (1, 1, 1, 1))
- verify_conv2d_transpose_nchw(16, 32, (8192, 1), 8, (31, 1), (2, 1), (14, 0, 15, 0))
- verify_conv2d_transpose_nchw(16, 512, (8, 1), 128, (31, 1), (2, 1), (14, 0, 15, 0))
+ verify_conv2d_transpose_nchw(1, 3, (224, 224), 1, (1, 1), (1, 1), (0, 0, 0, 0), (0, 0))
+ verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0), (0, 0))
+ verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (3, 3), (0, 0, 0, 0), (0, 0))
+ verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0), (0, 0))
+ verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (2, 2), (1, 1, 1, 1), (0, 0))
+ verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (2, 2), (1, 1, 1, 1), (1, 0))
+ verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (2, 2), (2, 2), (0, 0, 0, 0), (0, 0))
+ verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (2, 2), (2, 2), (0, 0, 0, 0), (1, 1))
+ verify_conv2d_transpose_nchw(1, 32, (32, 32), 128, (5, 5), (1, 1), (0, 0, 0, 0), (0, 0))
+ verify_conv2d_transpose_nchw(1, 32, (32, 32), 128, (5, 5), (2, 2), (1, 1, 1, 1), (0, 0))
+ verify_conv2d_transpose_nchw(16, 32, (8192, 1), 8, (31, 1), (2, 1), (14, 0, 15, 0), (0, 0))
+ verify_conv2d_transpose_nchw(16, 512, (8, 1), 128, (31, 1), (2, 1), (14, 0, 15, 0), (0, 0))
+ verify_conv2d_transpose_nchw(16, 512, (8, 1), 128, (31, 1), (2, 1), (14, 0, 15, 0), (1, 0))
+
if __name__ == "__main__":
test_conv2d_transpose_nchw()
from ..environment import get_env
@autotvm.register_topi_compute("conv2d_transpose_packed.vta")
-def conv2d_transpose_packed(cfg, data, kernel, strides, padding, out_dtype):
+def conv2d_transpose_packed(cfg, data, kernel, strides, padding, out_dtype,
+ output_padding=(0, 0)):
"""Packed conv2d_transpose compute"""
ishape = get_const_tuple(data.shape)
kshape = get_const_tuple(kernel.shape)
b, c_i, i_h, i_w, t_b, t_ci = ishape
c_o, _, k_h, k_w, t_co, t_ci = kshape
stride_h, stride_w = strides
+ opad_h, opad_w = output_padding
+ # FIXME(tmoreau89): currently IR pass breaks when output padding != (0,0)
+ assert opad_h == 0 and opad_w == 0, "VTA does not support output padding for now"
# derive padding parameters
fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (k_h, k_w))
bpad_top = k_h - 1 - fpad_top
- bpad_bottom = k_h - 1 - fpad_bottom
+ bpad_bottom = k_h - 1 - fpad_bottom + opad_h
bpad_left = k_w - 1 - fpad_left
- bpad_right = k_w - 1 - fpad_right
+ bpad_right = k_w - 1 - fpad_right + opad_w
# padding stage
dilated_input = topi.nn.dilate(data, [1, 1, stride_h, stride_w, 1, 1])
[0, 0, bpad_bottom, bpad_right, 0, 0])
# convolution transpose stage
- out_h = (i_h - 1) * stride_h - fpad_top - fpad_bottom + k_h
- out_w = (i_w - 1) * stride_w - fpad_left - fpad_right + k_w
+ out_h = (i_h - 1) * stride_h - fpad_top - fpad_bottom + k_h + opad_h
+ out_w = (i_w - 1) * stride_w - fpad_left - fpad_right + k_w + opad_w
oshape = (b, c_o, out_h, out_w, t_b, t_co)
d_c = te.reduce_axis((0, c_i), name='d_c')
d_h = te.reduce_axis((0, k_h), name='d_h')
Workload = namedtuple("Conv2DTransposeWorkload",
['batch', 'height', 'width', 'in_filter', 'out_filter',
- 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
+ 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride',
+ 'o_hpad', 'o_wpad'])
+# DCGAN workloads
dcgan_wkls = [
# dcgan
- ('DCGAN.CT1', Workload(env.BATCH, 4, 4, 1024, 512, 4, 4, 1, 1, 2, 2)),
- ('DCGAN.CT2', Workload(env.BATCH, 8, 8, 512, 256, 4, 4, 1, 1, 2, 2)),
- ('DCGAN.CT3', Workload(env.BATCH, 16, 16, 256, 128, 4, 4, 1, 1, 2, 2)),
+ ('DCGAN.CT1', Workload(env.BATCH, 4, 4, 1024, 512, 4, 4, 1, 1, 2, 2, 0, 0)),
+ ('DCGAN.CT2', Workload(env.BATCH, 8, 8, 512, 256, 4, 4, 1, 1, 2, 2, 0, 0)),
+ ('DCGAN.CT3', Workload(env.BATCH, 16, 16, 256, 128, 4, 4, 1, 1, 2, 2, 0, 0)),
]
@tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
return x
-def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding):
+def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding, opadding):
data_shape = (N//env.BATCH, CI//env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN)
kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN)
Filter=kernel,
strides=strides,
padding=padding,
- out_dtype=env.acc_dtype)
+ out_dtype=env.acc_dtype,
+ output_padding=opadding
+ )
res = topi.right_shift(res, env.WGT_WIDTH)
res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
res = topi.cast(res, env.out_dtype)
KW = wl.wkernel
strides = (wl.hstride, wl.wstride)
padding = (wl.hpad, wl.wpad)
+ opadding = (wl.o_hpad, wl.o_wpad)
# Create task
task = autotvm.task.create(
conv2d_transpose,
- args=(N, CI, H, W, CO, KH, KW, strides, padding),
+ args=(N, CI, H, W, CO, KH, KW, strides, padding, opadding),
target=tvm.target.vta(),
target_host=env.target_host,
template_key='direct')
Workload = namedtuple("Conv2DTransposeWorkload",
['batch', 'height', 'width', 'in_filter', 'out_filter',
- 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
+ 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride',
+ 'o_hpad', 'o_wpad'])
# Get batch info from env
env = vta.get_env()
# DCGAN workloads
dcgan_wklds = [
# dcgan
- ('DCGAN.CT1', Workload(env.BATCH, 4, 4, 1024, 512, 4, 4, 1, 1, 2, 2)),
- ('DCGAN.CT2', Workload(env.BATCH, 8, 8, 512, 256, 4, 4, 1, 1, 2, 2)),
- ('DCGAN.CT3', Workload(env.BATCH, 16, 16, 256, 128, 4, 4, 1, 1, 2, 2)),
+ ('DCGAN.CT1', Workload(env.BATCH, 4, 4, 1024, 512, 4, 4, 1, 1, 2, 2, 0, 0)),
+ ('DCGAN.CT2', Workload(env.BATCH, 8, 8, 512, 256, 4, 4, 1, 1, 2, 2, 0, 0)),
+ ('DCGAN.CT3', Workload(env.BATCH, 16, 16, 256, 128, 4, 4, 1, 1, 2, 2, 0, 0)),
]
# FIXME: we need a custom clip operator to circumvent a pattern detection limitation
# Define base computation schedule
with target:
+
res = fcompute(
- data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype)
+ data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype,
+ (wl.o_hpad, wl.o_wpad))
res = topi.right_shift(res, env.WGT_WIDTH)
res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
res = topi.cast(res, env.out_dtype)
print(vta.lower(s, [data, kernel, res], simple_mode=True))
# Derive number of ops
- fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel
- fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel
+ fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel + wl.o_hpad
+ fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel + wl.o_wpad
num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter
# @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
w_np = np.random.randint(w_min, w_max, size=(wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)).astype(kernel.dtype)
r_np = topi.testing.conv2d_transpose_nchw_python(
- a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype), (wl.hstride, wl.wstride), wl.hpad).astype(env.acc_dtype)
+ a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype), (wl.hstride, wl.wstride), wl.hpad, (wl.o_hpad, wl.o_wpad)).astype(env.acc_dtype)
return a_np, w_np, r_np
# Data in original format