[IE CLDNN] Fix fused ops in 1x1 conv fsv16 kernel (#1948)
author: Jedrzej Hajduczenia <jedrzej.hajduczenia@intel.com>
Wed, 26 Aug 2020 17:58:51 +0000 (19:58 +0200)
committer: GitHub <noreply@github.com>
Wed, 26 Aug 2020 17:58:51 +0000 (20:58 +0300)
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl

index bcb6a1d45b513709ade6e513f6510ba30c76c446..c3b10842ad2e7e7e16128392e087e74c17df73a5 100644 (file)
@@ -165,6 +165,7 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
     jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
 
     bool padded_output = params.output.X().pad.Total() != 0;
+    bool non_unit_fused_op_spatial = false;
 
     // Set padded_output to true when fused inputs have paddings to have correct blocked loads
     for (auto& fused_op : params.fused_ops) {
@@ -172,10 +173,17 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
             if (t.PitchesDifferFromLogicalDims()) {
                 padded_output = true;
             }
+            if ((t.X().v > 1) ||
+                (t.Y().v > 1) ||
+                (t.Z().v > 1) ||
+                (t.W().v > 1)) {
+                non_unit_fused_op_spatial = true;
+            }
         }
     }
 
     jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output));
+    jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial));
 
     jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
     jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.output.X().v, blockWidth)));
index 155ed590e73113723d0fc81f212d59a8bc805548..7e103cf2f02477a26e947130c416f3f255c46589 100644 (file)
@@ -207,7 +207,7 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
     else
 #endif
     {
-#if !PADDED_OUTPUT
+#if !PADDED_OUTPUT && !NON_UNIT_FUSED_OP_SPATIAL
         if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
 #else
         if (x + X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % X_BLOCK_SIZE == 0) {