[IE CLDNN] fix perf for fsv16 global avg pooling (#878)
author	Vladimir Paramuzov <vladimir.paramuzov@intel.com>
Thu, 11 Jun 2020 17:45:11 +0000 (20:45 +0300)
committer	GitHub <noreply@github.com>
Thu, 11 Jun 2020 17:45:11 +0000 (20:45 +0300)
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.h
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_blocked.cl

index aeb4337..157430a 100644
@@ -53,11 +53,20 @@ size_t PoolingKernel_b_fs_yx_fsv16::GetBlockSize(const pooling_params& params) c
         return 1;
 }
 
+size_t PoolingKernel_b_fs_yx_fsv16::GetSimdSize(const pooling_params& params) const {
+    auto& out = params.output;
+    // Use a smaller SIMD size for global pooling with a small channel count to get more threads
+    if (out.X().v == 1 && out.Y().v == 1 && out.Feature().v < 64)
+        return 8;
+    else
+        return 16;
+}
+
 PoolingKernelBase::DispatchData PoolingKernel_b_fs_yx_fsv16::SetDefault(const pooling_params& params) const {
     DispatchData kd = PoolingKernelBase::SetDefault(params);
 
     const auto& out = params.output;
-    const size_t alignment = 16;
+    const size_t alignment = GetSimdSize(params);
     size_t x_block_size = GetBlockSize(params);
     auto x = out.X().v;
     auto y = out.Y().v;
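
The new GetSimdSize() trades SIMD width for thread count: for global pooling the output is 1x1, so with SIMD 16 a tensor with only a few feature slices dispatches very few subgroups. A rough sketch of that occupancy arithmetic, assuming one subgroup per (feature chunk, batch) pair as in this kernel (the helper below is illustrative, not part of the kernel selector):

    #include <cstddef>
    #include <cstdio>

    // Illustrative only: estimates how many subgroups a global-pooling dispatch
    // produces for a 1x1 output, given the feature count and the chosen SIMD size.
    static size_t subgroup_count(size_t features, size_t batch, size_t simd) {
        const size_t feature_chunks = (features + simd - 1) / simd;  // ceil-div
        return feature_chunks * batch;                               // x * y == 1
    }

    int main() {
        // With 32 features and batch 1: SIMD 16 gives 2 subgroups, SIMD 8 gives 4,
        // which is why the kernel drops to SIMD 8 when Feature() < 64.
        std::printf("simd16: %zu, simd8: %zu\n",
                    subgroup_count(32, 1, 16), subgroup_count(32, 1, 8));
        return 0;
    }
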
@@ -78,7 +87,7 @@ PoolingKernelBase::DispatchData PoolingKernel_b_fs_yx_fsv16::SetDefault(const po
 }
 
 JitConstants PoolingKernel_b_fs_yx_fsv16::GetJitConstants(const pooling_params& params, DispatchData runInfo) const {
-    const size_t alignment = 16;
+    const size_t alignment = GetSimdSize(params);
     size_t x_block_size = GetBlockSize(params);
     auto input = params.inputs[0];
     auto output = params.output;
@@ -86,13 +95,26 @@ JitConstants PoolingKernel_b_fs_yx_fsv16::GetJitConstants(const pooling_params&
 
     size_t input_line_size = params.poolStride.x * (x_block_size - 1) + params.poolSize.x;
 
+    auto acc_type = GetAccumulatorType(params);
+    jit.Merge(MakeTypeJitConstants(acc_type, "ACCUMULATOR"));
+
+    auto can_preload_full_line = [&]() -> bool {
+        const float max_reg_bytes = 128 * 32 * 0.95f;
+        const size_t line_bytes = input_line_size * BytesPerElement(input.GetDType());
+        const size_t acc_bytes = x_block_size * BytesPerElement(acc_type);
+
+        const float req_bytes = static_cast<float>((line_bytes + acc_bytes) * alignment);
+
+        return req_bytes < max_reg_bytes;
+    };
+
+    jit.AddConstant(MakeJitConstant("CAN_PRELOAD_FULL_LINE", can_preload_full_line()));
     jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
     jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", x_block_size));
     jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
     jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", alignment));
     jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, x_block_size)));
     jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
-    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
 
     if (params.output.Feature().v % 16 != 0) {
         jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
@@ -101,7 +123,7 @@ JitConstants PoolingKernel_b_fs_yx_fsv16::GetJitConstants(const pooling_params&
     if (!params.fused_ops.empty()) {
         auto input_dt = GetActivationType(params);
         FusedOpsConfiguration conf_vec = {"_VEC",
-                                         {"b", "(f_block*16)", "y", "x"},
+                                         {"b", "(f_block*FEATURE_SLICE_SIZE + f_val*SUB_GROUP_SIZE)", "y", "x"},
                                          "pool_result",
                                          input_dt,
                                          x_block_size,
@@ -110,7 +132,7 @@ JitConstants PoolingKernel_b_fs_yx_fsv16::GetJitConstants(const pooling_params&
                                          IndexType::TENSOR_COORD,
                                          Tensor::DataChannelName::X};
         FusedOpsConfiguration conf_scalar = {"_SCALAR",
-                                            {"b", "(f_block*16)", "y", "(x+i)"},
+                                            {"b", "(f_block*FEATURE_SLICE_SIZE + f_val*SUB_GROUP_SIZE)", "y", "(x+i)"},
                                             "pool_result[i]",
                                             input_dt,
                                             1,
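
With f_val in play, the fused-ops feature coordinate can no longer be just (f_block*16); it also adds the offset of the sub-slice handled by the current subgroup. A minimal constexpr sketch of that index (the names mirror the kernel macros, the function itself is illustrative):

    // Illustrative recomputation of the feature coordinate passed to fused ops.
    // FEATURE_SLICE_SIZE is 16 for b_fs_yx_fsv16; f_val selects the sub-slice
    // when SUB_GROUP_SIZE < FEATURE_SLICE_SIZE, and is 0 otherwise.
    constexpr int feature_coord(int f_block, int f_val,
                                int feature_slice_size, int sub_group_size) {
        return f_block * feature_slice_size + f_val * sub_group_size;
    }

    // SIMD 16: feature_coord(f_block, 0, 16, 16) == f_block * 16 (the old expression).
    // SIMD 8:  feature_coord(2, 1, 16, 8)        == 40, the upper half of slice 2.
    static_assert(feature_coord(2, 1, 16, 8) == 40, "second half of the third slice");
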
index c20dbc1..92ad468 100644
@@ -39,7 +39,7 @@
     #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #endif
 
-__attribute__((intel_reqd_sub_group_size(16)))
+__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
 KERNEL(pooling_gpu_blocked)(
     const __global INPUT0_TYPE* input,
     __global OUTPUT_TYPE* output
@@ -49,7 +49,13 @@ KERNEL(pooling_gpu_blocked)(
 )
 {
     const int lid = get_sub_group_local_id();
+#if SUB_GROUP_SIZE == 16
     const int f_block = get_group_id(1);
+    const int f_val = 0;
+#else
+    const int f_block = (uint)get_group_id(1) / (FEATURE_SLICE_SIZE / SUB_GROUP_SIZE);
+    const int f_val = (uint)get_group_id(1) % (FEATURE_SLICE_SIZE / SUB_GROUP_SIZE);
+#endif
     const int b = get_global_id(2);
 
     const int xy = get_global_id(0);
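
When SUB_GROUP_SIZE is smaller than the 16-element feature slice, dimension 1 of the dispatch enumerates sub-slices rather than slices, and the kernel recovers (f_block, f_val) from the group id as above. A small host-side sketch of that mapping, assuming FEATURE_SLICE_SIZE == 16 and SUB_GROUP_SIZE == 8 (the helper is illustrative only):

    #include <cassert>

    struct FeatureSplit { unsigned f_block; unsigned f_val; };

    // Mirrors the #else branch above: two subgroups now share each 16-wide slice.
    FeatureSplit split_group_id(unsigned group_id_1,
                                unsigned feature_slice_size = 16,
                                unsigned sub_group_size = 8) {
        const unsigned per_slice = feature_slice_size / sub_group_size;  // 2
        return { group_id_1 / per_slice, group_id_1 % per_slice };
    }

    int main() {
        // Group ids 0..3 cover features 0-7, 8-15, 16-23, 24-31 respectively.
        assert(split_group_id(3).f_block == 1 && split_group_id(3).f_val == 1);
        return 0;
    }
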
@@ -71,7 +77,8 @@ KERNEL(pooling_gpu_blocked)(
     const uint input_offset = b * input_b_pitch +
                               (f_block + input_fs_pad_before) * input_fs_pitch +
                               (INPUT0_PAD_BEFORE_SIZE_Y + input_y) * input_y_pitch +
-                              (INPUT0_PAD_BEFORE_SIZE_X + input_x) * input_x_pitch;
+                              (INPUT0_PAD_BEFORE_SIZE_X + input_x) * input_x_pitch +
+                              f_val * SUB_GROUP_SIZE;
 
     // Output offset calculations:
     const uint output_x_pitch = FEATURE_SLICE_SIZE;
@@ -85,7 +92,8 @@ KERNEL(pooling_gpu_blocked)(
     const uint output_offset = b * output_b_pitch +
                                (f_block + output_fs_pad_before) * output_fs_pitch +
                                (y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
-                               (x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
+                               (x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch +
+                               f_val * SUB_GROUP_SIZE;
 
 
     ACCUMULATOR_VAR_TYPE dst = (ACCUMULATOR_VAR_TYPE)INIT_VAL;
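
Both the input and output offsets gain the same f_val * SUB_GROUP_SIZE term: within one fsv16 slice the values for a given (x, y) are stored contiguously, so the second subgroup of a slice simply starts SUB_GROUP_SIZE elements further in. A tiny worked instance of the output offset for a 1x1 output (values chosen for illustration, not queried from a real tensor):

    #include <cstdio>

    // Illustrative offset math for b_fs_yx_fsv16: the x pitch is the 16-element slice.
    int main() {
        const unsigned feature_slice_size = 16, sub_group_size = 8;
        const unsigned size_x = 1, size_y = 1;                 // global pooling output
        const unsigned x_pitch  = feature_slice_size;
        const unsigned y_pitch  = x_pitch * size_x;
        const unsigned fs_pitch = y_pitch * size_y;
        // Slice 1, second half (f_val == 1), position (x, y) = (0, 0), batch 0:
        const unsigned offset = 1 * fs_pitch + 0 * y_pitch + 0 * x_pitch
                              + 1 * sub_group_size;
        std::printf("offset = %u (start of features 24..31)\n", offset);
        return 0;
    }
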
@@ -122,12 +130,13 @@ KERNEL(pooling_gpu_blocked)(
         if (input_y + kh < 0 || input_y + kh >= INPUT0_SIZE_Y)
             continue;
 
+#if CAN_PRELOAD_FULL_LINE
         INPUT0_TYPE line_cache[INPUT_LINE_SIZE];
         for (int i = 0; i < INPUT_LINE_SIZE; i++) {
             if ((input_x + i) >= 0 && (input_x + i) < INPUT0_SIZE_X)
                 line_cache[i] = DT_INPUT_BLOCK_READ(input, input_offset + kh*input_y_pitch + i*input_x_pitch);
             else
-                #if   defined MAX_POOLING
+                #if defined MAX_POOLING
                     line_cache[i] = INPUT0_VAL_MIN;
                 #elif defined AVG_POOLING
                     line_cache[i] = INPUT0_VAL_ZERO;
@@ -152,35 +161,56 @@ KERNEL(pooling_gpu_blocked)(
             dst += src;
 #endif
         }
-    }
 
+#else // CAN_PRELOAD_FULL_LINE
+        // TODO: try partial preload
+        for (int kw = 0; kw < POOL_SIZE_X; kw++)
+        {
+            INPUT_VAR_TYPE src;
+#if X_BLOCK_SIZE > 1
+            for (int i = 0; i < X_BLOCK_SIZE; i++) {
+                if ((input_x + kw + STRIDE_SIZE_X*i) >= 0 && (input_x + kw + STRIDE_SIZE_X*i) < INPUT0_SIZE_X)
+                    src[i] = DT_INPUT_BLOCK_READ(input, input_offset + kh*input_y_pitch + (kw + STRIDE_SIZE_X*i)*input_x_pitch);
+                else
+                    #if defined MAX_POOLING
+                        src[i] = INPUT0_VAL_MIN;
+                    #elif defined AVG_POOLING
+                        src[i] = INPUT0_VAL_ZERO;
+                    #endif
+            }
+#else
+            src = DT_INPUT_BLOCK_READ(input, input_offset + kh*input_y_pitch + kw*input_x_pitch);
+#endif
+#if defined MAX_POOLING
+            dst = ACCUMULATOR_MAX_FUNC(dst, src);
+#elif defined AVG_POOLING
+            dst += TO_ACCUMULATOR_VAR_TYPE(src);
+#endif
+        }
+#endif // CAN_PRELOAD_FULL_LINE
+    }
     ACTIVATION_VAR_TYPE pool_result;
 
 #if defined MAX_POOLING
-        pool_result = TO_ACTIVATION_VAR_TYPE(dst);
-    #if !HAS_FUSED_OP
-        pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
-    #endif
+    pool_result = TO_ACTIVATION_VAR_TYPE(dst);
 #elif defined AVG_POOLING && (defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER))
-        pool_result = TO_ACTIVATION_VAR_TYPE(dst*scale);
-    #if !HAS_FUSED_OP
-        pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
-    #endif
+    pool_result = TO_ACTIVATION_VAR_TYPE(dst*scale);
 #elif defined AVG_POOLING
-        pool_result = TO_ACTIVATION_VAR_TYPE(dst/(POOL_SIZE_X*POOL_SIZE_Y));
-    #if !HAS_FUSED_OP
-        pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
-    #endif
+    pool_result = TO_ACTIVATION_VAR_TYPE(dst/(POOL_SIZE_X*POOL_SIZE_Y));
+#endif
+
+#if !HAS_FUSED_OPS
+    pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
 #endif
 
     OUTPUT_VAR_TYPE final_result;
 
 #if OUTPUT_LEFTOVERS
-    if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
+    if (f_block*FEATURE_SLICE_SIZE + (f_val + 1)*SUB_GROUP_SIZE >= OUTPUT_FEATURE_NUM) {
         for (int i = 0; i < X_BLOCK_SIZE; i++) {
-            if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
+            if ((f_block*FEATURE_SLICE_SIZE + f_val*SUB_GROUP_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
 #if X_BLOCK_SIZE > 1
-            #if HAS_FUSED_OP
+            #if HAS_FUSED_OPS
                 FUSED_OPS_SCALAR;
                 final_result[i] = FUSED_OPS_RESULT_SCALAR;
             #else
@@ -195,7 +225,6 @@ KERNEL(pooling_gpu_blocked)(
                 final_result = TO_OUTPUT_VAR_TYPE(pool_result);
             #endif
                 output[output_offset + i * output_x_pitch + lid] = final_result;
-
 #endif
             }
         }
@@ -211,6 +240,7 @@ KERNEL(pooling_gpu_blocked)(
                 final_result = TO_OUTPUT_VAR_TYPE(pool_result);
         #endif
 
+#if SUB_GROUP_SIZE == FEATURE_SLICE_SIZE
         #if X_BLOCK_SIZE == 8
                 DT_OUTPUT_BLOCK_WRITE8(output, output_offset, final_result);
         #elif X_BLOCK_SIZE == 4
@@ -220,10 +250,20 @@ KERNEL(pooling_gpu_blocked)(
         #elif X_BLOCK_SIZE == 1
                 DT_OUTPUT_BLOCK_WRITE(output, output_offset, final_result);
         #endif
+#else
+    #if X_BLOCK_SIZE > 1
+        __attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
+        for (int i = 0; i < X_BLOCK_SIZE; i++) {
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, final_result[i]);
+        }
+    #else
+        DT_OUTPUT_BLOCK_WRITE(output, output_offset, final_result);
+    #endif
+#endif
     }
     else
     {
-        const int x_tail = OUTPUT_SIZE_X - x;
+        const int x_tail = OUTPUT_SIZE_X % X_BLOCK_SIZE;
         for (int i = 0; i < x_tail; i++){
 #if X_BLOCK_SIZE > 1
         #if HAS_FUSED_OPS