idxm(b*VP + bb, nW) * m + nu],
name='d')
- # transform kernel
- if pre_computed:
- U = kernel
+ if autotvm.GLOBAL_SCOPE.in_tuning:
+ VC = cfg['tile_k'].size[-1]
+ kvshape = (KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), CI, VC)
+ U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
else:
- r_kh = te.reduce_axis((0, KH), 'r_kh')
- r_kw = te.reduce_axis((0, KW), 'r_kw')
- U = te.compute((alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk:
- te.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
- G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U')
+ # transform kernel
+ if pre_computed:
+ U = kernel
+ else:
+ r_kh = te.reduce_axis((0, KH), 'r_kh')
+ r_kw = te.reduce_axis((0, KW), 'r_kw')
+ U = te.compute((alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk:
+ te.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
+ G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U')
# transform image
r_eps = te.reduce_axis((0, alpha), 'r_eps')
data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
name='data_vec')
- if pre_packed:
- kernel_vec = kernel
+ if autotvm.GLOBAL_SCOPE.in_tuning:
+ kernel_vec = tvm.te.placeholder(kvshape, kernel.dtype, name="kernel")
else:
- kernel_vec = te.compute(kvshape, lambda co, ci, kh, kw, vc:
- kernel[co*VC+vc][ci][kh][kw],
- name='kernel_vec')
+ if pre_packed:
+ kernel_vec = kernel
+ else:
+ kernel_vec = te.compute(kvshape, lambda co, ci, kh, kw, vc:
+ kernel[co*VC+vc][ci][kh][kw],
+ name='kernel_vec')
ci = te.reduce_axis((0, CI), name='ci')
kh = te.reduce_axis((0, KH), name='kh')
s[data_vec].parallel(h)
if kernel_vec.op.name == 'kernel_vec':
- co, _, _, _, _ = s[kernel_vec].op.axis
- if autotvm.GLOBAL_SCOPE.in_tuning:
- # kernel packing will be pre-computed during compilation, so we skip
- # this part to make tuning records correct
- s[kernel_vec].pragma(co, 'debug_skip_region')
- else:
+ if not autotvm.GLOBAL_SCOPE.in_tuning:
+ co, _, _, _, _ = s[kernel_vec].op.axis
s[kernel_vec].parallel(co)
elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose': # for conv2d transpose
co, _, _, _, _ = s[kernel_vec].op.axis
data_vec = te.compute(dvshape, lambda n, oho, owo, ohi, owi, ic:
data_pad[n][oho*OHI*HSTR+ohi][owo*OWI*WSTR+owi][ic],
name='data_vec')
- kernel_vec = te.compute(kvshape, lambda oco, kh, kw, ic, oci: \
- kernel[kh][kw][ic][oco*OCI+oci],
- name='kernel_vec')
+
+ if autotvm.GLOBAL_SCOPE.in_tuning:
+ kernel_vec = tvm.te.placeholder(kvshape, kernel.dtype, name="kernel")
+ else:
+ kernel_vec = te.compute(kvshape, lambda oco, kh, kw, ic, oci: \
+ kernel[kh][kw][ic][oco*OCI+oci],
+ name='kernel_vec')
ic = te.reduce_axis((0, IC), name='ic')
kh = te.reduce_axis((0, KH), name='kh')
s[kernel_vec].compute_at(s[conv], compat_axis)
s[data_vec].compute_at(s[conv], compat_axis)
- # schedule kernel pack
- oco, kh, kw, ic, oci = kernel_vec.op.axis
- s[kernel_vec].vectorize(oci)
- s[kernel_vec].unroll(ic)
- if cfg['compat'].val == 2:
- s[kernel_vec].parallel(oco)
+ if not autotvm.GLOBAL_SCOPE.in_tuning:
+ # schedule kernel pack
+ oco, kh, kw, ic, oci = kernel_vec.op.axis
+ s[kernel_vec].vectorize(oci)
+ s[kernel_vec].unroll(ic)
+ if cfg['compat'].val == 2:
+ s[kernel_vec].parallel(oco)
# schedule data pack
if data_vec.op.name == 'data_vec_undilated':
s[data_vec].unroll(vw)
if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
- if autotvm.GLOBAL_SCOPE.in_tuning:
- # kernel packing will be pre-computed during compilation, so we skip
- # this part to make tuning records correct
- s[kernel_vec].pragma(s[kernel_vec].op.axis[0], 'debug_skip_region')
- else:
+ if not autotvm.GLOBAL_SCOPE.in_tuning:
max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
co, ci, kh, kw, vc = s[kernel_vec].op.axis
fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
data_pad[n][c][h][w],
name='d')
- if pre_computed:
- U = kernel
+ if autotvm.GLOBAL_SCOPE.in_tuning:
+ VC = cfg['tile_k'].size[-1]
+ kvshape = (KH + tile_size - 1, KW + tile_size - 1, tvm.tir.indexdiv(CO, VC), CI, VC)
+ U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
else:
- U = _decl_winograd_kernel_transform(kernel, tile_size, G)
+ if pre_computed:
+ U = kernel
+ else:
+ U = _decl_winograd_kernel_transform(kernel, tile_size, G)
# V [alpha * alpha, C, P_round)
# Perform the image transform
s[G].compute_inline()
eps, _, _, _ = s[U].op.axis
y, _, _, _ = s[padded_kernel].op.axis
- if autotvm.GLOBAL_SCOPE.in_tuning:
- # Kernel transformation will be pre-computed during compilation, so we skip
- # this part to make tuning records correct
- s[U].pragma(eps, 'debug_skip_region')
- s[padded_kernel].pragma(y, 'debug_skip_region')
- else:
+ if not autotvm.GLOBAL_SCOPE.in_tuning:
# Pad kernel
y, x, ky, kx = s[padded_kernel].op.axis
s[padded_kernel].unroll(ky)
s[data_vec].unroll(vw)
if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
- co, ci, kh, kw, vc = s[kernel_vec].op.axis
- if autotvm.GLOBAL_SCOPE.in_tuning:
- # Directly use modified data layout placeholder.
- kvshape = (co // vc, ci, kh, kw, vc)
- kernel_vec = tvm.te.placeholder(kvshape, kernel_vec.dtype, name="kernel")
- s[kernel_vec] = kernel_vec
- else:
+ if not autotvm.GLOBAL_SCOPE.in_tuning:
max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
+ co, ci, kh, kw, vc = s[kernel_vec].op.axis
fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
fused, vec = s[kernel_vec].split(fused, VC)
bb, tt = s[kernel_vec].split(fused, max_threads)
data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps]
[(b*bnb+bb) % nW * m + nu], tvm.tir.const(0, data_pad.dtype)), name='d')
- # transform kernel
- if pre_computed:
- U = kernel
+ if autotvm.GLOBAL_SCOPE.in_tuning:
+ VC = cfg['tile_k'].size[-1]
+ kvshape = (KH + tile_size - 1, KW + tile_size - 1, tvm.tir.indexdiv(CO, VC), CI, VC)
+ U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
else:
- r_kh = te.reduce_axis((0, KH), 'r_kh')
- r_kw = te.reduce_axis((0, KW), 'r_kw')
- U = te.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco:
- te.sum(kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
- axis=[r_kh, r_kw]), name='U')
+ # transform kernel
+ if pre_computed:
+ U = kernel
+ else:
+ r_kh = te.reduce_axis((0, KH), 'r_kh')
+ r_kw = te.reduce_axis((0, KW), 'r_kw')
+ U = te.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco:
+ te.sum(kernel[co * bna + vco][ci][r_kh][r_kw] *
+ G[eps][r_kh] * G[nu][r_kw],
+ axis=[r_kh, r_kw]), name='U')
# transform image
r_a = te.reduce_axis((0, alpha), 'r_a')