add dilation in x86 NCHWc depthwise conv support (#4962) (#6267)
author	wjliu <wjliu1998@gmail.com>
Fri, 14 Aug 2020 05:17:38 +0000 (13:17 +0800)
committer	GitHub <noreply@github.com>
Fri, 14 Aug 2020 05:17:38 +0000 (14:17 +0900)
python/tvm/topi/x86/depthwise_conv2d.py
tests/python/frontend/pytorch/test_forward.py
tests/python/topi/python/test_topi_depthwise_conv2d.py

python/tvm/topi/x86/depthwise_conv2d.py
index 0976c33..acbe0f7 100644
@@ -122,13 +122,18 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation,
 
     strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
     HSTR, WSTR = strides
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (filter_height, filter_width))
 
     dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
-    assert (dh, dw) == (1, 1), "Does not support dilation"
 
-    out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1
-    out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1
+    dilated_kernel_h = (filter_height - 1) * dh + 1
+    dilated_kernel_w = (filter_width - 1) * dw + 1
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
+    HPAD = pad_top + pad_down
+    WPAD = pad_left + pad_right
+
+    out_height = (in_height + HPAD - dilated_kernel_h) // HSTR + 1
+    out_width = (in_width + WPAD - dilated_kernel_w) // WSTR + 1
 
     cfg.define_split("tile_ic", in_channel, num_outputs=2)
     cfg.define_split("tile_oc", out_channel, num_outputs=2)
@@ -140,7 +145,7 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation,
         te.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype),
         te.placeholder((out_channel, channel_multiplier, filter_height, filter_width),
                        dtype=kernel.dtype),
-        strides, padding, out_dtype)
+        strides, (pad_top, pad_down), out_dtype)
     if cfg.is_fallback:
         _fallback_schedule(cfg, wkl)
 
@@ -172,6 +177,7 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation,
     else:
         data_pad = data
 
+
     # depthconv stage
     idxdiv = tvm.tir.indexdiv
     idxmod = tvm.tir.indexmod
@@ -184,7 +190,7 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation,
             (data_pad[
                 b,
                 idxdiv(idxdiv(oco * out_channel_block + oci, channel_multiplier), in_channel_block),
-                oh*HSTR+kh, ow*WSTR+kw,
+                oh*HSTR+kh*dh, ow*WSTR+kw*dw,
                 idxmod(idxdiv(oco * out_channel_block + oci, channel_multiplier), in_channel_block)]
              .astype(out_dtype) *
              kernel[oco, 0, kh, kw, 0, oci].astype(out_dtype)),
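The indexing change above (oh*HSTR+kh*dh, ow*WSTR+kw*dw) is the whole of the dilation support in the compute body: stride applies to the output index, dilation to the kernel index. A plain-NumPy NCHW reference with the same access pattern (a sketch for illustration only; it does not use the blocked NCHWc layout):

    import numpy as np

    def depthwise_conv2d_nchw_ref(data_pad, kernel, HSTR, WSTR, dh, dw):
        # data_pad: (N, C, H, W), already padded; kernel: (C, 1, KH, KW); channel_multiplier == 1.
        n, c, h, w = data_pad.shape
        _, _, kh, kw = kernel.shape
        out_h = (h - ((kh - 1) * dh + 1)) // HSTR + 1
        out_w = (w - ((kw - 1) * dw + 1)) // WSTR + 1
        out = np.zeros((n, c, out_h, out_w), dtype=data_pad.dtype)
        for oh in range(out_h):
            for ow in range(out_w):
                for i in range(kh):
                    for j in range(kw):
                        # Same pattern as the compute above: stride on the output
                        # index, dilation on the kernel index.
                        out[:, :, oh, ow] += (
                            data_pad[:, :, oh * HSTR + i * dh, ow * WSTR + j * dw]
                            * kernel[:, 0, i, j])
        return out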
tests/python/frontend/pytorch/test_forward.py
index ae03a70..88203f5 100644
@@ -1552,12 +1552,7 @@ def test_segmentaton_models():
     inp = [torch.rand((1, 3, 300, 300), dtype=torch.float)]
 
     verify_model(SegmentationModelWrapper(fcn.eval()), inp, atol=1e-4, rtol=1e-4)
-
-    # depthwise + dilated covolution not supported on x86
-    # see https://github.com/apache/incubator-tvm/issues/4962
-    cuda_ctx = ("cuda", tvm.gpu(0))
-    if cuda_ctx[1].exist:
-        verify_model(SegmentationModelWrapper(deeplab.eval()), inp, [cuda_ctx], atol=1e-4, rtol=1e-4)
+    verify_model(SegmentationModelWrapper(deeplab.eval()), inp, atol=1e-4, rtol=1e-4)
 
 
 def test_3d_models():
tests/python/topi/python/test_topi_depthwise_conv2d.py
index 93a166d..5497e11 100644
@@ -269,7 +269,6 @@ def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_m
     filter_width = filter_height
     stride_h = stride_w = stride
 
-    assert dilation == 1, "depthwise_conv2d_NCHWc currently does not support dilation."
     assert channel_multiplier == 1, "depthwise_conv2d_NCHWc currently does not support channel multiplier > 1."
     pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
     padding_args = (pad_h, pad_w)
@@ -307,7 +306,7 @@ def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_m
             # declare
             DepthwiseConv2d = topi.x86.depthwise_conv2d_NCHWc(Input, Filter,
                                                               (stride_h, stride_w),
-                                                              padding_args,
+                                                              padding,
                                                               (dilation, dilation),
                                                               in_layout,
                                                               out_layout, dtype)
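Forwarding the original padding (rather than the pre-resolved padding_args) matters here: padding_args was computed from the undilated 3x3 kernel, while the operator now resolves "SAME"/"VALID" against the dilated kernel extent. A hedged illustration with a hypothetical helper mirroring the stride-1 "SAME" convention (total padding = kernel extent - 1):

    def same_pad_total(filter_size, dilation):
        # Hypothetical helper for illustration: dilated kernel extent minus one.
        dilated = (filter_size - 1) * dilation + 1
        return dilated - 1

    # 3x3 kernel: "SAME" needs 2 pad pixels per axis without dilation,
    # but 4 per axis with dilation=2, so the pre-resolved numbers would be wrong.
    assert same_pad_total(3, 1) == 2
    assert same_pad_total(3, 2) == 4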
@@ -330,8 +329,9 @@ def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_m
             input_np = np.random.uniform(size=input_shape).astype(dtype)
             filter_np = np.random.uniform(size=filter_shape).astype(dtype)
             # correctness with scipy
+            dw_np = tvm.topi.testing.dilate_python(filter_np, (1, 1, dilation, dilation)).astype(dtype)
             depthwise_conv2d_scipy = tvm.topi.testing.depthwise_conv2d_python_nchw(
-                input_np, filter_np, stride, padding)
+                input_np, dw_np, stride, padding)
             relu_scipy = np.maximum(depthwise_conv2d_scipy, 0)
             return (_transform_data(input_np, ic_block),
                     _transform_kernel(filter_np, oc_block),
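The scipy reference stays dilation-free because the kernel is pre-dilated: dilate_python inserts dilation - 1 zeros between kernel taps, which is equivalent to running the original kernel with dilated indexing. A NumPy sketch of that zero-insertion (illustrative only; the actual tvm.topi.testing helper handles arbitrary rank via its strides tuple):

    import numpy as np

    def dilate_kernel_hw(filter_np, dilation):
        # filter_np: (O, M, KH, KW); insert (dilation - 1) zeros between taps on H and W.
        o, m, kh, kw = filter_np.shape
        dkh = (kh - 1) * dilation + 1
        dkw = (kw - 1) * dilation + 1
        out = np.zeros((o, m, dkh, dkw), dtype=filter_np.dtype)
        out[:, :, ::dilation, ::dilation] = filter_np
        return out

    # A 3x3 kernel with dilation=2 becomes a 5x5 kernel whose nonzero taps
    # sit on a 2-strided grid.
    k = np.random.uniform(size=(8, 1, 3, 3)).astype("float32")
    assert dilate_kernel_hw(k, 2).shape == (8, 1, 5, 5)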
@@ -390,6 +390,7 @@ def test_depthwise_conv2d():
     # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
 
     # NCHW[x]c
+    depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME", dilation=2)
     depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID")