[IE CLDNN] Fix concat in place with conv interactions (#1714)
author    Konrad Dobros <konrad.dobros@intel.com>
Mon, 24 Aug 2020 14:37:46 +0000 (16:37 +0200)
committer GitHub <noreply@github.com>
Mon, 24 Aug 2020 14:37:46 +0000 (17:37 +0300)
This change fixes the in-place concatenation optimization in cases where it
interacts with a convolution that uses physical padding.
One such case is when an input to the optimized concatenation is also an
input to a convolution, so it needs padding to enable the optimized
convolution implementation.
Previously the padding of all concatenation inputs was overridden so that
only the concatenation axis was padded.
This change fixes the issue by propagating the required padding across the
concatenation's inputs and output.
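
For illustration, the sketch below shows the idea behind the new padding
propagation. It is a minimal, self-contained approximation using stand-in
types (Pad, Input, max_pad, propagate_concat_padding) rather than the actual
clDNN padding/program_node API, so the names and signatures here are
assumptions made for this example only:

#include <algorithm>
#include <cstddef>
#include <vector>

// Simplified stand-ins for clDNN's padding/layout types (illustrative only).
struct Pad {
    std::vector<int> lower;  // pad-before, per axis
    std::vector<int> upper;  // pad-after, per axis
};

struct Input {
    std::vector<int> size;   // tensor extent, per axis
    Pad pad;                 // padding currently required by the producing node
};

// Element-wise maximum of two paddings; the fix propagates the largest padding
// required by any input (e.g. a convolution's physical padding) to all of them.
static Pad max_pad(const Pad& a, const Pad& b) {
    Pad r = a;
    for (std::size_t i = 0; i < r.lower.size(); ++i) {
        r.lower[i] = std::max(r.lower[i], b.lower[i]);
        r.upper[i] = std::max(r.upper[i], b.upper[i]);
    }
    return r;
}

// Sketch of in-place concat padding propagation along `axis`: returns the
// padding each input should adopt so that all inputs write directly into the
// concatenation's shared output buffer.
std::vector<Pad> propagate_concat_padding(const std::vector<Input>& inputs,
                                          Pad out_pad, int axis) {
    // 1. The shared padding is the maximum over the output padding and all
    //    input paddings (previously only the concat axis was padded).
    for (const auto& in : inputs)
        out_pad = max_pad(out_pad, in.pad);

    std::vector<int> lower = out_pad.lower;
    std::vector<int> upper = out_pad.upper;

    // 2. Along the concat axis, lower/upper padding places each input at its
    //    offset inside the shared buffer; other axes keep the shared padding.
    int total = 0;
    for (const auto& in : inputs)
        total += in.size[axis];
    upper[axis] += total;

    std::vector<Pad> per_input;
    for (const auto& in : inputs) {
        upper[axis] -= in.size[axis];   // shrink upper pad to the end of this input
        per_input.push_back({lower, upper});
        lower[axis] += in.size[axis];   // move lower pad past this input
    }
    return per_input;
}

In the real pass the resulting shared padding also becomes the concatenation's
own output padding, and dependencies that are themselves optimized-out
concatenations are revisited (cascade adjustment).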

13 files changed:
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_1x1.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_depthwise.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_1x1.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_depthwise.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp
inference-engine/thirdparty/clDNN/src/include/program_helpers.h
inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp

index fc3e332..dca5348 100644 (file)
@@ -139,6 +139,10 @@ bool ConvolutionKernel_fs_byx_fsv32::Validate(const Params& p, const optional_pa
     if (cp.output.Feature().pad.before % fsv != 0)
         return false;
 
+    // Input feature padding must be multiple of fsv to keep block alignment
+    if (cp.inputs[0].Feature().pad.before % fsv != 0)
+        return false;
+
     return true;
 }
 
index 67c0b3e..5533baa 100644 (file)
@@ -142,6 +142,10 @@ bool ConvolutionKernel_fs_byx_fsv32_1x1::Validate(const Params& p, const optiona
     if (cp.output.Feature().pad.before % fsv != 0)
         return false;
 
+    // Input feature padding must be multiple of fsv to keep block alignment
+    if (cp.inputs[0].Feature().pad.before % fsv != 0)
+        return false;
+
     return true;
 }
 
index dc3f414..cbb3999 100644 (file)
@@ -146,6 +146,10 @@ bool ConvolutionKernel_fs_byx_fsv32_depthwise::Validate(const Params& p, const o
     if (cp.output.Feature().pad.before % fsv != 0)
         return false;
 
+    // Input feature padding must be multiple of fsv to keep block alignment
+    if (cp.inputs[0].Feature().pad.before % fsv != 0)
+        return false;
+
     return true;
 }
 
index b963162..25ccfe1 100644 (file)
@@ -68,6 +68,9 @@ bool PoolingKerneGPU_fs_b_yx_fsv32::Validate(const Params& p, const optional_par
     if (pp.output.Feature().pad.before % 32 != 0)
         return false;
 
+    if (pp.inputs[0].Feature().pad.before % 32 != 0)
+        return false;
+
     return true;
 }
 
index 0764640..a3afa68 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 
 #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
 #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
 
 // In some cases input padding may be bigger than needed, those variables describe the offset into padding.
 #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X)
@@ -103,7 +104,7 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
     uint input_offset = oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X;
     input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING;
     input_offset += INPUT0_PAD_BEFORE_FEATURE_NUM * INPUT0_FEATURE_PITCH;
-    input_offset += b * INPUT0_BATCH_PITCH;
+    input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * INPUT0_BATCH_PITCH;
 
     uint weight_offset = 0;
     weight_offset += fs * FILTER_SIZE_X * FILTER_SIZE_Y * ALIGNED_IFM_NUM * FSV;
@@ -243,12 +244,19 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
 
     // ========================================================================
     // Store results:
+    // Calculate offset to first output element
+    const uint out_pitch_x = FSV;
+    const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING;
+    const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING;
+    const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING;
+
     const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV);
+
     uint output_offset = 0;
-    output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV;
-    output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING;
-    output_offset += b  * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING;
-    output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM;
+    output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x;
+    output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y;
+    output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b;
+    output_offset += (pad_before_fs + fs) * out_pitch_fs;
 
     const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM;
     const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X;
@@ -309,5 +317,6 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
 
 #undef OUTPUT_SIZE_X_WITH_PADDING
 #undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
 
 #undef INPUT_BLOCK_WIDTH_EL_CNT
index bed7518..a6c7bbf 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 
 #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
 #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
+#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
 
 #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
 #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
 
 // In some cases input padding may be bigger than needed, those variables describe the offset into padding.
 #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X)
@@ -74,10 +76,17 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
         out[out_i] = UNIT_VAL_ZERO;
     }
 
+    // Calculate offset to first input data element
+    const uint in_pitch_x = FSV;
+    const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING;
+    const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING;
+    const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING;
+
     uint input_offset = 0;
-    input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV;
-    input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV;
-    input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
+    input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x;
+    input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y;
+    input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b;
+    input_offset += (INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs;
 
     uint weight_offset = 0;
     weight_offset += fs * FILTER_SIZE_X * FILTER_SIZE_Y * ALIGNED_IFM_NUM * FSV;
@@ -108,7 +117,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
             // ====================================================================
 
             // Move temporary input offset to next row
-            tmp_input_offset += DILATION_SIZE_Y * INPUT0_SIZE_X_WITH_PADDING * FSV;
+            tmp_input_offset += DILATION_SIZE_Y * in_pitch_y;
 
             uint tmp_weight_offset = weight_offset;
 
@@ -146,7 +155,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
             weight_offset += FILTER_SIZE_X * FSV;
         }
         // Move input offset to next input feature slice
-        input_offset += INPUT0_BATCH_NUM * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
+        input_offset += in_pitch_fs;
         // Move weight offset to next input feature slice (FSV input features)
         //  minus offset added by moving FILTER_SIZE_Y times to new row
         weight_offset += FSV * FILTER_SIZE_Y * FILTER_SIZE_X * FSV // FSV * input filter feature pitch
@@ -190,13 +199,19 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
 
     // ========================================================================
     // Store results:
+    // Calculate offset to first output element
+    const uint out_pitch_x = FSV;
+    const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING;
+    const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING;
+    const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING;
+
     const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV);
 
     uint output_offset = 0;
-    output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV;
-    output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING;
-    output_offset += b  * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING;
-    output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM;
+    output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x;
+    output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y;
+    output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b;
+    output_offset += (fs + pad_before_fs) * out_pitch_fs;
 
     const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM;
     const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X;
@@ -243,6 +258,8 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
 
 #undef INPUT0_SIZE_X_WITH_PADDING
 #undef INPUT0_SIZE_Y_WITH_PADDING
+#undef INPUT0_SIZE_B_WITH_PADDING
 
 #undef OUTPUT_SIZE_X_WITH_PADDING
 #undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
index a82af38..7e9f8e7 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 
 #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
 #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
+#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
 
 #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
 #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
 
 // In some cases input padding may be bigger than needed, those variables describe the offset into padding.
 #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X)
@@ -73,10 +75,17 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
         out[out_i] = UNIT_VAL_ZERO;
     }
 
+    // Calculate offset to first input data element
+    const uint in_pitch_x = FSV;
+    const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING;
+    const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING;
+    const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING;
+
     uint input_offset = 0;
-    input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV;
-    input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV;
-    input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
+    input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x;
+    input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y;
+    input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b;
+    input_offset += (INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs;
 
     uint weight_offset = 0;
     weight_offset += fs * ALIGNED_IFM_NUM * FSV;
@@ -119,11 +128,11 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
                 }
             }
             // Move temporary input offset to next strided row
-            tmp_input_offset += INPUT0_SIZE_X_WITH_PADDING * FSV * STRIDE_SIZE_Y;
+            tmp_input_offset += in_pitch_y * STRIDE_SIZE_Y;
         }
         // ========================================================================
         // Move input offset to next input feature slice
-        input_offset += INPUT0_BATCH_NUM * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
+        input_offset += in_pitch_fs;
 
     }
     // ========================================================================
@@ -170,12 +179,19 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
 
     // ========================================================================
     // Store results:
+    // Calculate offset to first output element
+    const uint out_pitch_x = FSV;
+    const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING;
+    const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING;
+    const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING;
+
     const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV);
+
     uint output_offset = 0;
-    output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV;
-    output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING;
-    output_offset += b  * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING;
-    output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM;
+    output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x;
+    output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y;
+    output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM)  * out_pitch_b;
+    output_offset += (pad_before_fs + fs) * out_pitch_fs;
 
     const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM;
     const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X;
@@ -201,7 +217,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
                 UNIT_BLOCK_WRITE2(output, output_offset + out_x * FSV, tmp_write);
             }
             // Move output offset to next row
-            output_offset += FSV * OUTPUT_SIZE_X_WITH_PADDING;
+            output_offset += out_pitch_y;
         }
     }
     else
@@ -225,7 +241,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
                 }
             }
             // Move output offset to next row
-            output_offset += FSV * OUTPUT_SIZE_X_WITH_PADDING;
+            output_offset += out_pitch_y;
         }
     }
     // ========================================================================
@@ -235,6 +251,8 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
 
 #undef INPUT0_SIZE_X_WITH_PADDING
 #undef INPUT0_SIZE_Y_WITH_PADDING
+#undef INPUT0_SIZE_B_WITH_PADDING
 
 #undef OUTPUT_SIZE_X_WITH_PADDING
 #undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
index 7131d35..45c4897 100644 (file)
 
 #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
 #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
+#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
 
 #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
 #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
 
 // In some cases input padding may be bigger than needed, those variables describe the offset into padding.
 #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X)
@@ -72,11 +74,17 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
         out[out_i] = UNIT_VAL_ZERO;
     }
 
+    // Calculate offset to first input data element
+    const uint in_pitch_x = FSV;
+    const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING;
+    const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING;
+    const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING;
+
     uint input_offset = 0;
-    input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV;
-    input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV;
-    input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
-    input_offset += fs * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV * INPUT0_BATCH_NUM;
+    input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x;
+    input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y;
+    input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b;
+    input_offset += (fs + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs;
 
     uint weight_offset = 0;
 
@@ -105,7 +113,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
         // ====================================================================
 
         // Move temporary input offset to next row
-        tmp_input_offset += DILATION_SIZE_Y * INPUT0_SIZE_X_WITH_PADDING * FSV;
+        tmp_input_offset += DILATION_SIZE_Y * in_pitch_y;
 
         uint tmp_weight_offset = weight_offset;
 
@@ -174,13 +182,19 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
 
     // ========================================================================
     // Store results:
+    // Calculate offset to first output element
+    const uint out_pitch_x = FSV;
+    const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING;
+    const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING;
+    const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING;
+
     const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV);
 
     uint output_offset = 0;
-    output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV;
-    output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING;
-    output_offset += b  * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING;
-    output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM;
+    output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x;
+    output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y;
+    output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM)  * out_pitch_b;
+    output_offset += (pad_before_fs + fs) * out_pitch_fs;
 
     const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM;
     const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X;
@@ -227,6 +241,8 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
 
 #undef INPUT0_SIZE_X_WITH_PADDING
 #undef INPUT0_SIZE_Y_WITH_PADDING
+#undef INPUT0_SIZE_B_WITH_PADDING
 
 #undef OUTPUT_SIZE_X_WITH_PADDING
 #undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
index 7c98ee7..bec60b1 100644 (file)
 
 #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
 #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
+#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
+
 #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
 #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
 
 // Kernel works only for sub_group size of 16 with 32 features slice size and process 2 features per WI
 #define REQD_SUB_GROUP_SIZE 16
@@ -79,14 +82,18 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
     const uint x_pitch = REQD_FEATURE_SLICE_SIZE;                        // difference in location between (x+1) and (x)
     const uint y_pitch = x_pitch * INPUT0_SIZE_X_WITH_PADDING;           // difference in location between (y+1) and (y)
     const uint b_pitch = y_pitch * INPUT0_SIZE_Y_WITH_PADDING;           // difference in location between (b+1) and (b)
-    const uint fs_pitch = b_pitch * INPUT0_BATCH_NUM;                    // difference in location between (fs+1) and (fs)
+    const uint fs_pitch = b_pitch * INPUT0_SIZE_B_WITH_PADDING;          // difference in location between (fs+1) and (fs)
 
     const int offset_x = (int)out_x*STRIDE_SIZE_X - PADDING_SIZE_X;
     const int offset_y = (int)out_y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
 
-    const size_t padding_offset = INPUT0_PAD_BEFORE_SIZE_X * x_pitch + INPUT0_PAD_BEFORE_SIZE_Y * y_pitch;
+    const size_t padding_offset = INPUT0_PAD_BEFORE_SIZE_X * x_pitch +
+                                  INPUT0_PAD_BEFORE_SIZE_Y * y_pitch +
+                                  INPUT0_PAD_BEFORE_BATCH_NUM * b_pitch +
+                                  INPUT0_PAD_BEFORE_FEATURE_NUM / REQD_FEATURE_SLICE_SIZE * fs_pitch;
     const size_t fs_offset = fs * fs_pitch; // locate beginning of feature tile
     const size_t b_offset = b * b_pitch;   // locate beginning of batch
+
 #ifdef CHECK_BOUNDRY
     if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
         offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
@@ -152,15 +159,14 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
     const size_t out_x_pitch = REQD_FEATURE_SLICE_SIZE;
     const size_t out_y_pitch = out_x_pitch * OUTPUT_SIZE_X_WITH_PADDING;
     const size_t out_b_pitch = out_y_pitch * OUTPUT_SIZE_Y_WITH_PADDING;
-    const size_t out_fs_pitch = out_b_pitch * OUTPUT_BATCH_NUM;
+    const size_t out_fs_pitch = out_b_pitch * OUTPUT_SIZE_B_WITH_PADDING;
 
     const size_t out_pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / REQD_FEATURE_SLICE_SIZE);
     const size_t out_x_offset = (out_x + OUTPUT_PAD_BEFORE_SIZE_X) * out_x_pitch;
     const size_t out_y_offset = (out_y + OUTPUT_PAD_BEFORE_SIZE_Y) * out_y_pitch;
-    const size_t out_b_offset = b * out_b_pitch;
+    const size_t out_b_offset = (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_b_pitch;
     const size_t out_fs_offset = (fs + out_pad_before_fs) * out_fs_pitch;
 
-
     const size_t output_offset = out_fs_offset + out_b_offset + out_y_offset + out_x_offset;
 
     const bool full_f = OUTPUT_FEATURE_NUM % REQD_FEATURE_SLICE_SIZE == 0 ||
@@ -204,3 +210,15 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
 
 #undef OUTPUT_VEC2
 #undef TO_OUTPUT_VEC2
+
+#undef INPUT0_SIZE_X_WITH_PADDING
+#undef INPUT0_SIZE_Y_WITH_PADDING
+#undef INPUT0_SIZE_B_WITH_PADDING
+
+#undef OUTPUT_SIZE_X_WITH_PADDING
+#undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
+
+#undef REQD_SUB_GROUP_SIZE
+#undef REQD_FEATURE_SLICE_SIZE
+#undef REQD_FEATURES_PER_WORK_ITEM
index 362e9a7..bda6833 100644 (file)
@@ -448,8 +448,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
             definitions.push_back({ safe_index_func_name, safe_index_func_val });
             definitions.push_back({ index_func_name, index_func_val });
         } else {
-            definitions.push_back({ safe_index_func_name, "(f)" });
-            definitions.push_back({ index_func_name, "(f)" });
+            definitions.push_back({ safe_index_func_name, "(" + std::to_string(_tensor.Feature().pad.before) + " + (f))" });
+            definitions.push_back({ index_func_name, "(" + std::to_string(_tensor.Feature().pad.before) + " + (f))" });
         }
     } else {
         definitions.push_back({ safe_index_func_name, safe_index_func_val });
index d2ce48e..81db4d5 100644 (file)
 
 using namespace cldnn;
 
-// ToDo remove friendship relation from  program_node
-void prepare_buffer_fusing::run(program_impl& p) {
-    bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
-    /*
-    We need to take care of proper ordering by types.
-    1. Concats
-    2. Crops
-    3. Others
-    Concat before crops is needed because of the crop fusing padding requirments.
-    If crop is before concat there can be padding mismtach, since concat changes padding.
-    */
-    auto can_optimize = [](const program_node* node) {
-        if (node->is_output() || (!node->get_fused_activations_funcs().empty())) {
-            return false;
+namespace {
+
+struct concat_noop_optimization : pattern_match_optimization_typed<concat_noop_optimization, concatenation> {
+    // Removes concatenation nodes with single input.
+    using base = pattern_match_optimization_typed<concat_noop_optimization, concatenation>;
+    using base::base;
+
+    bool match(concatenation_node& node);
+    bool optimize(concatenation_node& node);
+};
+
+struct concat_in_place_optimization : pattern_match_optimization_typed<concat_in_place_optimization, concatenation> {
+    // Performs in-place concat optimization.
+    // Padding of predecessors is updated so that they all use a single buffer, which is the concatenation's output.
+    // Then the concatenation can be optimized out, as memory will be correctly filled by the previous nodes.
+    // If one of the dependencies is also an optimized-out concatenation, then a cascade adjustment is performed to update it.
+    // This optimization is expected to be executed in some topological order, as cascade adjustment is performed backwards.
+    using base = pattern_match_optimization_typed<concat_in_place_optimization, concatenation>;
+    using base::base;
+
+    // Runs concat in-place optimization and adds already-optimized concatenations that need re-optimization to `need_reoptimization`.
+    void optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization);
+    bool match(concatenation_node& node);
+    bool optimize(concatenation_node& node) {
+        std::list<concatenation_node*> need_reopt;
+        optimize_cascade(node, need_reopt);
+        while (!need_reopt.empty()) {
+            auto& prop = *need_reopt.front();
+            need_reopt.pop_front();
+            if (match(prop))
+                optimize_cascade(prop, need_reopt);
+            else
+                // TODO: Revert extra padding when cascade adjustment failed.
+                prop.can_be_optimized(false);
         }
-        return true;
-    };
+        return false;  // node not invalidated
+    }
+};
+
+bool concat_noop_optimization::match(concatenation_node& node) {
+    if (node.is_output() && !get_program().is_debug_build())
+        return false;
+    return node.get_dependencies().size() == 1 &&
+        !node.has_fused_primitives() &&
+        node.get_fused_activations_funcs().empty();
+}
 
-    // [1] First try to optimize all concats
-    auto node_itr = p.get_processing_order().begin();
-    while (node_itr != p.get_processing_order().end()) {
-        auto& node = (*node_itr++);
-        if (!can_optimize(node))
-            continue;
-        program_helpers::do_for_types<concatenation>(*node, [&p, is_debug](concatenation_node& node) {
-            // For in place concatenation input layouts and data types must match
-            auto output_format = node.get_output_layout().format;
-            auto output_datatype = node.get_output_layout().data_type;
-            // we need to avoid mixing padded and unpadded buffer
-            bool all_dependencies_padded = true;
-            bool all_dependencies_unpadded = true;
-            for (auto& input : node.get_dependencies()) {
-                if (input->type() == reshape::type_id())
-                    // reshapes should be optimized out
-                    return;
+bool concat_noop_optimization::optimize(concatenation_node& node) {
+    auto& dep = node.get_dependency(0);
+    dep.merge_output_padding(node.get_output_layout().data_padding);
+    prog.extract_and_remove(node);
+    // Node has been removed, so no further optimizations.
+    return true;
+}
 
-                layout l = input->get_output_layout();
-                if (static_cast<bool>(l.data_padding))
-                    all_dependencies_unpadded = false;
-                else
-                    all_dependencies_padded = false;
+bool concat_in_place_optimization::match(concatenation_node& node) {
+    if (node.is_output() && !get_program().is_debug_build())
+        return false;
+    if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty())
+        return false;
 
-                if (output_format != l.format || output_datatype != l.data_type)
-                    return;
+    // For in place concatenation input layouts and data types must match.
+    auto output_format = node.get_output_layout().format;
+    auto output_datatype = node.get_output_layout().data_type;
+    auto concat_axis = node.get_primitive()->axis;
 
-                if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
-                    return;
+    for (auto& input : node.get_dependencies()) {
+        if (input->is_type<reshape>())
+            // reshapes should be optimized out.
+            return false;
 
-                if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
-                    return;
+        layout l = input->get_output_layout();
 
-                if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
-                    (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
-                    return;
+        if (output_format != l.format || output_datatype != l.data_type)
+            return false;
 
-                // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required
-                if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
-                    return;
+        // TODO: Below condition should be moved to program_node::supports_padding.
+        // This however will require updating the algorithm, as it may make cascade adjustment impossible in some cases.
+        // It would however make normal optimizations possible in others, so this is a trade-off to be investigated.
+        if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
+            return false;
 
-                if (l.format == format::bs_fs_yx_bsv16_fsv16)
-                    return;
+        if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
+            return false;
 
-                if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f))
-                    return;
-            }
+        if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
+            (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
+            return false;
 
-            auto concat_axis = node.get_primitive()->axis;
-            auto padd = node.get_output_layout().data_padding;
+        // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required
+        if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
+            return false;
 
-            tensor lower_padd = padd.lower_size();
-            tensor upper_padd = padd.upper_size();
+        if (l.format == format::bs_fs_yx_bsv16_fsv16)
+            return false;
 
-            auto upper_padd_val =
-                node.get_output_layout().get_buffer_size().raw[concat_axis] - lower_padd.raw[concat_axis];
-            tensor lower_padd_offset = lower_padd;
+        if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f))
+            return false;
+    }
 
-            std::list<std::pair<const std::vector<program_node*>, tensor>> stack = {
-                std::make_pair(node.get_dependencies(), tensor(0))};
-            while (!stack.empty()) {
-                auto nodes_list = stack.front();
-                stack.pop_front();
+    auto lower_padd_in_axis = node.get_output_layout().data_padding.lower_size().raw[concat_axis];
+    lower_padd_in_axis = std::max(lower_padd_in_axis,
+                                  node.get_dependency(0).get_output_layout().data_padding.lower_size().raw[concat_axis]);
+
+    // check if concatenation in place can be applied for inputs set
+    size_t idx = 0;
+    for (auto input : node.get_dependencies()) {
+        // reverted condition - if any of this node's inputs is used by more than one primitive
+        // and is not optimized concatenation then do not fuse buffers
+        // todo: we need to add padding support for all optimized kernels to remove this condition
+        if (!input->is_type<pooling>() && !input->is_type<convolution>() &&
+            !input->is_type<activation>() && !input->is_type<deconvolution>() &&
+            !input->is_type<concatenation>() && !input->is_type<crop>() && !input->is_type<scale>() &&
+            !input->is_type<resample>())
+            return false;
 
-                // if concatenation has only one input it does nothing, remove the node
-                if (node.get_dependencies().size() == 1) {
-                    p.extract_and_remove(node);
-                    return;
-                }
+        // if an input is marked as network output, prevent optimizations
+        // which would affect a form of its output (unless debug flag is set),
+        // we also need to restrict input types to those which support padding on all axis
+        if ((input->is_output() && !get_program().is_debug_build()) ||
+            !input->is_padding_supported(concat_axis, lower_padd_in_axis))
+            return false;
 
-                auto cascade_adjustment = nodes_list.second;
-                upper_padd.raw[concat_axis] = upper_padd_val;
-                lower_padd = lower_padd_offset;
-
-                auto lower_padd_in_axis = lower_padd.raw[concat_axis] + cascade_adjustment.raw[concat_axis];
-                auto first_input_format = nodes_list.first[0]->get_output_layout().format;
-
-                // check if concatenation in place can be applied for inputs set
-                for (auto input : nodes_list.first) {
-                    // reverted condition - if any of this node's inputs is used by more than one primitive
-                    // and is not optimized concatenation then do not fuse buffers
-                    // todo: we need add padding support for all optimized kernels to remove this condition
-                    if (!input->is_type<pooling>() && !input->is_type<convolution>() &&
-                        !input->is_type<activation>() && !input->is_type<deconvolution>() &&
-                        !input->is_type<concatenation>() && !input->is_type<crop>() && !input->is_type<scale>() &&
-                        !input->is_type<resample>())
-                        return;
+        // TODO: Investigate if this condition is needed
+        if (input->get_users().size() > 2)
+            return false;
 
-                    // if an input is marked as network output, prevent optimizations
-                    // which would affect a form of its output (unless debug flag is set),
-                    // we also need to restrict input types to those which support padding on all axis
-                    if ((input->is_output() && !is_debug) || input->get_users().size() > 2 ||
-                        !input->is_padding_supported(concat_axis, lower_padd_in_axis))
-                        return;
+        // Check that the input isn't an optimized-out concatenation along a different axis.
+        if (input->is_type<concatenation>() && input->can_be_optimized() &&
+            input->as<concatenation>().get_primitive()->axis != concat_axis)
+            return false;
 
-                    if (input->get_users().size() > 1) {
-                        auto user_count = input->get_users().size();
-                        for (auto& user : input->get_users())
-                            if (user->is_type<concatenation>())
-                                user_count--;
-                        if (user_count != 1)  // user_cout == 0 means that input will be used only by concatenations, so
-                                              // we cannot apply concat in place for it
-                            return;
-                    }
-
-                    // check if all inputs have the same format
-                    if (input->get_output_layout().format != first_input_format)
-                        return;
+        // Check that a non-concatenation input isn't optimized out.
+        if (!input->is_type<concatenation>() && input->can_be_optimized())
+            return false;
 
-                    lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis];
-                }
+        size_t concat_users = 0;
+        for (auto& user : input->get_users())
+            if (user->is_type<concatenation>())
+                concat_users += 1;
 
-                // check if it is worth doing concat in place, in case the following primitive is convolution
-                // with different input padding than concatenation's input users' convolutions,
-                // it is likely that convolution's implementation will be a reference one, due to mismatched padding
-                // and performance gain by doing in place concat is nullified by slower convolution implementation
-                // this should be handled by more advanced tuning mechanism on the topology level
-                auto& users = node.get_users();
-                if (users.size() == 1) {
-                    auto& user = users.front();
-                    if (node.get_output_layout().format == format::bfyx && user->type() == convolution::type_id()) {
-                        auto out_input_offsets = user->as<convolution>().get_primitive()->input_offset;
-
-                        std::vector<tensor> in_input_offsets;
-                        for (auto& in_user : nodes_list.first) {
-                            if (in_user->type() == convolution::type_id())
-                                in_input_offsets.push_back(in_user->as<convolution>().get_primitive()->input_offset);
-                        }
-
-                        for (auto& in_input_offset : in_input_offsets) {
-                            if (in_input_offset.spatial[0] != out_input_offsets.spatial[0] &&
-                                in_input_offset.spatial[1] != out_input_offsets.spatial[1])
-                                return;
-                        }
-                    } else if (user->type() == fused_conv_eltwise::type_id()) {
-                        if (!user->as<fused_conv_eltwise>().get_fused_primitives().empty() &&
-                            user->as<fused_conv_eltwise>().get_fused_primitives().begin()->node->is_type<depth_to_space>())
-                            return;
-                    }
-                }
+        // If input is used by more than one concatenation then they may require different paddings.
+        if (concat_users != 1)
+            return false;
 
-                // apply concatenation in place optimization
-                for (auto input : nodes_list.first) {
-                    auto input_lenght = input->get_output_layout().size.raw[concat_axis];
+        auto input_padd = input->get_output_layout().data_padding;
 
-                    bool optimized_concat_input = false;
-                    if (input->type() == concatenation::type_id() && input->can_be_optimized()) {
-                        if (input->as<concatenation>().get_primitive()->axis != node.get_primitive()->axis)
-                            return;
-                        optimized_concat_input = true;
-                    } else if (input->can_be_optimized()) {
-                        return;
-                    }
+        // Check that there isn't already some padding between inputs in concat axis.
+        // If node has already been optimized we skip this check - this is just cascade adjustment.
+        if (!node.can_be_optimized()) {
+            if (idx != node.get_dependencies().size() && input_padd.upper_size().raw[concat_axis] != 0)
+                return false;
+            if (idx != 0 && input_padd.lower_size().raw[concat_axis] != 0)
+                return false;
+        }
 
-                    // shrink upper pad so it points at the end of the input's buffer
-                    //
-                    //   |--- lower padd ---|                    |---------- upper padd -----------|
-                    //   |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
-                    upper_padd.raw[concat_axis] -= input_lenght;
+        lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis];
+        idx += 1;
+    }
 
-                    // adjust padding sizes for cascade concatenations
-                    auto lower_padd_tmp = lower_padd;
-                    lower_padd_tmp.raw[concat_axis] += cascade_adjustment.raw[concat_axis];
-                    auto upper_padd_tmp = upper_padd;
-                    upper_padd_tmp.raw[concat_axis] -= cascade_adjustment.raw[concat_axis];
+    return true;
+}
 
-                    // set new padding for input
-                    input->set_output_padding(padding(lower_padd_tmp.sizes(), upper_padd_tmp.sizes()));
+void concat_in_place_optimization::optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization) {
+    auto concat_axis = node.get_primitive()->axis;
 
-                    // move lower padd further
-                    //
-                    //   |-------------- lower padd -------------|---------- upper padd -----------|
-                    //   |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
+    // Select output padding by propagating all required input paddings.
+    auto padd = node.get_output_layout().data_padding;
+    for (auto input : node.get_dependencies()) {
+        padd = padding::max(padd, input->get_output_layout().data_padding);
+    }
 
-                    lower_padd.raw[concat_axis] += input_lenght;
+    auto lower_padd = padd.lower_size();
+    auto upper_padd = padd.upper_size();
 
-                    if (optimized_concat_input && !input->get_dependencies().empty())
-                        stack.push_back(std::make_pair(input->get_dependencies(),
-                                                       input->get_output_layout().data_padding.lower_size()));
-                }
-            }
+    // For cascade adjustment, override the padding in the concat axis with the output padding.
+    // Otherwise match(...) has already checked that only the first/last input has lower/upper padding.
+    if (node.can_be_optimized()) {
+        lower_padd.raw[concat_axis] = node.get_output_layout().data_padding.lower_size().raw[concat_axis];
+        upper_padd.raw[concat_axis] = node.get_output_layout().data_padding.upper_size().raw[concat_axis];
+    }
+    node.set_output_padding(padding(lower_padd.sizes(), upper_padd.sizes()));
 
-            node.can_be_optimized(true);
-            for (auto dep : node.get_users()) {
-                dep->can_share_buffer(false);
-            }
-            if (!all_dependencies_padded && !all_dependencies_unpadded)
-                node.can_share_buffer(false);
-        });
+    upper_padd.raw[concat_axis] += node.get_output_layout().size.raw[concat_axis];
+
+    // apply concatenation in place optimization
+    for (auto input : node.get_dependencies()) {
+        auto input_lenght = input->get_output_layout().size.raw[concat_axis];
+
+        if (input->is_type<concatenation>() && input->can_be_optimized())
+            need_reoptimization.push_back(&input->as<concatenation>());
+
+        // shrink upper pad so it points at the end of the input's buffer
+        //
+        //   |--- lower padd ---|                    |---------- upper padd -----------|
+        //   |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
+        upper_padd.raw[concat_axis] -= input_lenght;
+
+        // set new padding for input
+        input->set_output_padding(padding(lower_padd.sizes(), upper_padd.sizes()));
+
+        // move lower padd further
+        //
+        //   |-------------- lower padd -------------|---------- upper padd -----------|
+        //   |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
+        lower_padd.raw[concat_axis] += input_lenght;
+    }
+
+    node.can_be_optimized(true);
+    for (auto dep : node.get_users()) {
+        dep->can_share_buffer(false);
     }
+}
+
+}  // namespace
+
+// ToDo remove friendship relation from  program_node
+void prepare_buffer_fusing::run(program_impl& p) {
+    bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
+    /*
+    We need to take care of proper ordering by types.
+    1. Concats
+    2. Crops
+    3. Others
+    Concat before crops is needed because of the crop fusing padding requirements.
+    If crop is before concat there can be a padding mismatch, since concat changes padding.
+    */
+    auto can_optimize = [](const program_node* node) {
+        if (node->is_output() || (!node->get_fused_activations_funcs().empty())) {
+            return false;
+        }
+        return true;
+    };
+
+    // [1] First try to optimize all concats
+    run_node_optimizations<concat_noop_optimization,
+                           concat_in_place_optimization>(p);
 
     // [2] Then try to optimize all crops
-    node_itr = p.get_processing_order().begin();
+    auto node_itr = p.get_processing_order().begin();
     while (node_itr != p.get_processing_order().end()) {
         auto& node = (*node_itr++);
         if (!can_optimize(node))
index 7ec2622..57e56f7 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -135,4 +135,105 @@ struct program_helpers {
     }
     static layout get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split);
 };
+
+// Base class for performing pattern match style optimizations.
+// Uses CRTP idiom, implementing class should be passed as template parameter `Impl`,
+// and overload match and optimize methods.
+template <typename Impl>
+struct pattern_match_optimization {
+    pattern_match_optimization(program_impl& prog)
+        : prog(prog)
+    {}
+
+    // Returns whether optimization can be performed for specified node.
+    bool match(program_node& node) {
+        return static_cast<Impl*>(this)->match(node);
+    }
+    // Returns whether optimization invalidated the node and no further optimizations should execute.
+    bool optimize(program_node& node) {
+        // TODO: Add program optimizer class that would take responsibility of modifying program.
+        //       Then use it to provide more complex control over pattern-matches, ie:
+        //       new node added - run applicable optimizations on it as well;
+        //       node deleted - don't do more optimizations;
+        return static_cast<Impl*>(this)->optimize(node);
+    }
+    // Returns whether optimization invalidated the node and no further optimizations should execute.
+    bool match_and_optimize(program_node& node) {
+        if (!match(node))
+            return false;
+        return optimize(node);
+    }
+
+    program_impl& get_program() { return prog; }
+
+    program_impl& prog;
+};
+
+// Class for pattern-match optimizations that provides support for matching
+// single primitive type `Prim`.
+// Implementing class `Impl` is expected to overload:
+// bool match(typed_program_node<Prim>&)
+// bool optimize(typed_program_node<Prim>&)
+// Uses CRTP idiom, implementing class should be passed as template parameter `Impl`.
+template <typename Impl, typename Prim>
+struct pattern_match_optimization_typed : pattern_match_optimization<pattern_match_optimization_typed<Impl, Prim>> {
+    using base = pattern_match_optimization<pattern_match_optimization_typed<Impl, Prim>>;
+
+    using base::base;
+
+    // Returns whether optimization can be performed for specified node.
+    bool match(program_node& node) {
+        if (!node.is_type<Prim>())
+            return false;
+        return static_cast<Impl*>(this)->match(node.as<Prim>());
+    }
+    // Should be overloaded by implementation class to match specified primitive.
+    bool match(typed_program_node<Prim>& node) {
+        return false;
+    }
+
+    // Returns whether optimization invalidated the node and no further optimizations should execute.
+    bool optimize(program_node& node) {
+        return static_cast<Impl*>(this)->optimize(node.as<Prim>());
+    }
+    // Should be overloaded by implementation class to optimize specified primitive.
+    bool optimize(typed_program_node<Prim>& node) {
+        return false;
+    }
+};
+
+// Runs pattern-match optimizations passed as arguments on `node`.
+inline bool run_node_optimizations(program_node& /*node*/) {
+    return false;
+}
+
+template <typename Opt, typename... Rest>
+bool run_node_optimizations(program_node& node, Opt&& opt, Rest&&... rest) {
+    if (opt.match_and_optimize(node))
+        return true;
+    return run_node_optimizations(node, std::forward<Rest>(rest)...);
+}
+
+// Runs pattern-match optimizations `Opts` on `node`.
+// Optimizations should have constructor with single argument `program_impl&`.
+template <typename... Opts>
+bool run_node_optimizations(program_impl& p, program_node& node) {
+    return run_node_optimizations<Opts...>(node, Opts(p)...);
+}
+
+// Runs specified pattern-match optimizations on whole program, in processing order.
+template <typename... Opts>
+void run_node_optimizations(program_impl& p, Opts&&... opts) {
+    auto it = p.get_processing_order().begin();
+    while (it != p.get_processing_order().end()) {
+        auto node = *it++;
+        run_node_optimizations(*node, std::forward<Opts>(opts)...);
+    }
+}
+
+template <typename... Opts>
+void run_node_optimizations(program_impl& p) {
+    run_node_optimizations(p, Opts(p)...);
+}
+
 }  // namespace cldnn
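
Usage note (not part of the patch): a pass built on these helpers derives from
pattern_match_optimization_typed, overloads match/optimize for its primitive
type, and is applied with run_node_optimizations. The example below is a
hypothetical sketch; remove_noop_reorder and its matching logic are
placeholders, not code from this change:

// Hypothetical example: remove reorder nodes whose input and output layouts match.
struct remove_noop_reorder : pattern_match_optimization_typed<remove_noop_reorder, reorder> {
    using base = pattern_match_optimization_typed<remove_noop_reorder, reorder>;
    using base::base;

    // Match single-input reorders that do not change the layout.
    bool match(typed_program_node<reorder>& node) {
        return node.get_dependencies().size() == 1 &&
               node.get_output_layout() == node.get_dependency(0).get_output_layout();
    }
    // Remove the node; returning true stops further optimizations on it.
    bool optimize(typed_program_node<reorder>& node) {
        get_program().extract_and_remove(node);
        return true;
    }
};

// Applied over the whole program in processing order:
//   run_node_optimizations<remove_noop_reorder>(p);
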
index 0d8bc88..05e32e5 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -431,6 +431,211 @@ TEST(depth_concatenate_f32_gpu, test05_different_formats) {
     }
 }
 
+TEST(depth_concatenate_f32_gpu, test06_padded_input) {
+    // input1 - activation - concatenation - concatenation - reorder
+    //                     /                /
+    // input2 - activation -  convolution* /
+    //
+    // *Convolution has an input offset, so padding should be propagated both back to the reorders and to the second concatenation.
+    // As a result both concatenations should be optimized out and the convolution should use an optimized implementation.
+    const int32_t input_f = 32;
+    const int32_t output_f = 3 * input_f;
+
+    const auto& engine = get_test_engine();
+    auto input1 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+    auto input2 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+
+    auto input1_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+    auto input2_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+    set_values(input1, flatten_4d(format::bfyx, input1_data));
+    set_values(input2, flatten_4d(format::bfyx, input2_data));
+
+    auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, {input_f, input_f, 3, 3} });
+    // Construct weights for convolution that just double input values.
+    VVVVF<FLOAT16> weights_data;
+    weights_data.resize(input_f);
+    for (size_t oi = 0; oi < input_f; ++oi) {
+        weights_data[oi].resize(input_f, VVF<FLOAT16>(3, VF<FLOAT16>(3, FLOAT16(0.f))));
+        weights_data[oi][oi][1][1] = 2.f;
+    }
+    set_values(weights, flatten_4d(format::bfyx, weights_data));
+
+    topology topology;
+    topology.add(input_layout("input1", input1.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+    topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+    topology.add(data("weights", weights));
+    topology.add(convolution("conv", "actv2", { "weights" }, tensor(1), tensor(batch(0), feature(0), spatial(-1, -1, 0, 0))));
+    topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+    topology.add(concatenation("depth2", { "depth1", "conv" }, concatenation::along_f));
+    topology.add(reorder("output", "depth2", format::bfyx, data_types::f32));
+
+    cldnn::build_options options;
+    options.set_option(cldnn::build_option::optimize_data(true));
+    options.set_option(cldnn::build_option::force_implementations({ {"conv", implementation_desc{format::fs_b_yx_fsv32, ""} } }));
+    network network(engine, topology, options);
+
+    network.set_input_data("input1", input1);
+    network.set_input_data("input2", input2);
+
+    auto outputs = network.execute({});
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "output");
+    // Check that all concatenations have been optimized out.
+    auto executed_primitives = network.get_executed_primitives();
+    EXPECT_TRUE(executed_primitives.count("depth1") == 0);
+    EXPECT_TRUE(executed_primitives.count("depth2") == 0);
+    // Check that the convolution was able to use an optimized kernel.
+    for (auto& info : network.get_primitives_info()) {
+        if (info.original_id == "conv") {
+            EXPECT_TRUE(info.kernel_id.find("ref") == std::string::npos) << " selected kernel: " << info.kernel_id;
+        }
+    }
+
+    auto output = outputs.at("output").get_memory();
+    auto output_ptr = output.pointer<float>();
+    ASSERT_EQ(output.count(), output_f);
+    for (size_t i = 0; i < output_f; ++i) {
+        auto& val = output_ptr[i];
+        float ref;
+        if (i < input_f)
+            ref = 0.75f * static_cast<float>(input1_data[0][i % input_f][0][0]);
+        else if (i < 2 * input_f)
+            ref = 0.5f * static_cast<float>(input2_data[0][i % input_f][0][0]);
+        else
+            ref = static_cast<float>(input2_data[0][i % input_f][0][0]);
+
+        EXPECT_EQ(val, ref) << " at i=" << i;
+    }
+}
+
+TEST(depth_concatenate_f32_gpu, test07_padded_output) {
+    // input1 - activation - concatenation - convolution - reorder
+    // input2 - activation /
+    //
+    // *Convolution has an input offset, so padding should be propagated back to the activations.
+    const int32_t input_f = 32;
+    const int32_t output_f = 2 * input_f;
+
+    const auto& engine = get_test_engine();
+    auto input1 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+    auto input2 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+
+    auto input1_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+    auto input2_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+    set_values(input1, flatten_4d(format::bfyx, input1_data));
+    set_values(input2, flatten_4d(format::bfyx, input2_data));
+
+    auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, {output_f, output_f, 3, 3} });
+    // Construct weights for convolution that just double input values.
+    VVVVF<FLOAT16> weights_data;
+    weights_data.resize(output_f);
+    for (size_t oi = 0; oi < output_f; ++oi) {
+        weights_data[oi].resize(output_f, VVF<FLOAT16>(3, VF<FLOAT16>(3, FLOAT16(0.f))));
+        weights_data[oi][oi][1][1] = 2.f;
+    }
+    set_values(weights, flatten_4d(format::bfyx, weights_data));
+
+    topology topology;
+    topology.add(input_layout("input1", input1.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+    topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+    topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+    topology.add(data("weights", weights));
+    topology.add(convolution("conv", "depth1", { "weights" }, tensor(1), tensor(batch(0), feature(0), spatial(-1, -1, 0, 0))));
+    topology.add(reorder("output", "conv", format::bfyx, data_types::f32));
+
+    cldnn::build_options options;
+    options.set_option(cldnn::build_option::optimize_data(true));
+    options.set_option(cldnn::build_option::force_implementations({ {"conv", implementation_desc{format::fs_b_yx_fsv32, ""} } }));
+    network network(engine, topology, options);
+
+    network.set_input_data("input1", input1);
+    network.set_input_data("input2", input2);
+
+    auto outputs = network.execute({});
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "output");
+    // Check that the concatenation has been optimized out.
+    auto executed_primitives = network.get_executed_primitives();
+    EXPECT_TRUE(executed_primitives.count("depth1") == 0);
+    // Check that the convolution was able to use an optimized kernel.
+    for (auto& info : network.get_primitives_info()) {
+        if (info.original_id == "conv") {
+            EXPECT_TRUE(info.kernel_id.find("ref") == std::string::npos) << " selected kernel: " << info.kernel_id;
+        }
+    }
+
+    auto output = outputs.at("output").get_memory();
+    auto output_ptr = output.pointer<float>();
+    ASSERT_EQ(output.count(), output_f);
+    for (size_t i = 0; i < output_f; ++i) {
+        auto& val = output_ptr[i];
+        float ref;
+        if (i < input_f)
+            ref = 1.5f * static_cast<float>(input1_data[0][i % input_f][0][0]);
+        else
+            ref = static_cast<float>(input2_data[0][i % input_f][0][0]);
+
+        EXPECT_EQ(val, ref) << " at i=" << i;
+    }
+}
+
+TEST(depth_concatenate_f32_gpu, test07_concat_is_output) {
+    // input1 - activation - concatenation
+    // input2 - activation /
+    //
+    // As the concatenation is an output, it should not be optimized out.
+    const int32_t input_f = 16;
+    const int32_t output_f = 2 * input_f;
+
+    const auto& engine = get_test_engine();
+    auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, {1, input_f, 1, 1} });
+    auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, {1, input_f, 1, 1} });
+
+    auto input1_data = generate_random_4d<float>(1, input_f, 1, 1, -1, 1);
+    auto input2_data = generate_random_4d<float>(1, input_f, 1, 1, -1, 1);
+    set_values(input1, flatten_4d(format::bfyx, input1_data));
+    set_values(input2, flatten_4d(format::bfyx, input2_data));
+
+    topology topology;
+    topology.add(input_layout("input1", input1.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+    topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+    topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+
+    cldnn::build_options options;
+    options.set_option(cldnn::build_option::optimize_data(true));
+    network network(engine, topology, options);
+
+    network.set_input_data("input1", input1);
+    network.set_input_data("input2", input2);
+
+    auto outputs = network.execute({});
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "depth1");
+    // Check that the concatenation hasn't been optimized out.
+    auto executed_primitives = network.get_executed_primitives();
+    EXPECT_TRUE(executed_primitives.count("depth1") == 1);
+
+    auto output = outputs.at("depth1").get_memory();
+    auto output_ptr = output.pointer<float>();
+    ASSERT_EQ(output.count(), output_f);
+    for (size_t i = 0; i < output_f; ++i) {
+        auto& val = output_ptr[i];
+        float ref;
+        if (i < input_f)
+            ref = 0.75f * input1_data[0][i % input_f][0][0];
+        else
+            ref = 0.5f * input2_data[0][i % input_f][0][0];
+
+        EXPECT_EQ(val, ref) << " at i=" << i;
+    }
+}
+
 TEST(depth_concatenate_f32_gpu, concat_with_different_format_inputs) {
     const auto& engine = get_test_engine();
     build_options build_opt;