inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl

   1 // Copyright (c) 2018 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17
  18 #if MAX_POOLING
  19     #define INIT_VAL CHAR_MIN
  20 #elif AVG_POOLING
  21     #define INIT_VAL 0
  22 #else
  23 #error
  24 #endif
  25
  26
  27 inline int FUNC(apply_pooling)(int tmp, int in)
  28 {
  29 #if MAX_POOLING
  30     return max(tmp, in);
  31 #elif AVG_POOLING
  32     return tmp + in;
  33 #endif
  34 }
  35
  36 __attribute__((intel_reqd_sub_group_size(8)))
  37 KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)(
  38     const __global UNIT_TYPE* input,
  39     __global UNIT_TYPE* output)
  40 {
  41     const uint x    = (uint)get_global_id(0);
  42     const uint y    = (uint)get_global_id(1);
  43     const uint bf   = (uint)get_global_id(2);
  44         // we process 4 features per workitem that's why we need to divide it
  45     const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
  46     const uint f    = (get_global_id(2) * 4) % aligned32_features;
  47     const uint b = 4 * ((get_global_id(2) * 4) / aligned32_features);
  48
  49     if (x >= OUTPUT_SIZE_X)
  50     {
  51         return;
  52     }
  53
  54     const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
  55     const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
  56
  57     int4 result[4] = { INIT_VAL };
  58
  59 #ifdef CHECK_BOUNDRY
  60     if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
  61         offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
  62     {
  63         return;
  64     }
  65
  66 #ifdef DYNAMIC_KERNEL_DIVIDER
  67     uint num_elementes = 0;
  68 #endif
  69
  70     const uint batch_and_feature_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, 0, 0);
  71     for(uint j = 0; j < POOL_SIZE_Y; j++)
  72     {
  73         int input_offset_y = offset_y + j;
  74         bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
  75         if(!zero_y)
  76         {
  77             for(uint i = 0; i < POOL_SIZE_X; i++)
  78             {
  79                 int input_offset_x = offset_x + i;
  80                 bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
  81                 if(!zero)
  82                 {
  83                     const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
  84
  85                     int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
  86                     for(uint b = 0; b < 4; b++)
  87                     {
  88                         char4 input_data = as_char4(int_data[b]);
  89                         result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], (int)input_data[0]);
  90                         result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], (int)input_data[1]);
  91                         result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], (int)input_data[2]);
  92                         result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], (int)input_data[3]);
  93
  94                     }
  95
  96 #ifdef DYNAMIC_KERNEL_DIVIDER
  97                     num_elementes++;
  98 #endif
  99                 }
 100             }
 101         }
 102     }
 103 #ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
 104     const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
 105     const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
 106     const uint num_elementes = (hend - offset_y) * (wend - offset_x);
 107 #endif
 108 #else
 109     uint input_idx = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, offset_y, offset_x);
 110
 111     for(uint j = 0; j < POOL_SIZE_Y; j++)
 112     {
 113         for(uint i = 0; i < POOL_SIZE_X; i++)
 114         {
 115             int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
 116             for(uint b = 0; b < 4; b++)
 117             {
 118                 char4 input_data = as_char4(int_data[b]);
 119                 result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], (int)input_data[0]);
 120                 result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], (int)input_data[1]);
 121                 result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], (int)input_data[2]);
 122                 result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], (int)input_data[3]);
 123             }
 124
 125             input_idx += IN_X_PITCH;
 126         }
 127         input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
 128     }
 129
 130 #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
 131     const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y;
 132 #endif
 133 #endif
 134
 135 #if defined AVG_POOLING
 136     #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
 137         for(uint b = 0; b < 4; b++)
 138         {
 139             for(uint i = 0; i < 4; i++)
 140             {
 141                 result[b][i] = convert_int(round(((float)result[b][i] / max(num_elementes, (uint)1)));
 142             }
 143         }
 144     #else
 145         for(uint b = 0; b < 4; b++)
 146         {
 147             for(uint i = 0; i < 4; i++)
 148             {
 149                 result[b][i] = convert_int(round((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
 150             }
 151         }
 152     #endif
 153 #endif
 154
 155     int4 char_result;
 156     for(uint b = 0; b < 4; b++)
 157     {
 158         char4 char_res = as_char4(char_result[b]);
 159         for(uint op = 0; op < 4; op++)
 160         {
 161             char_res[op] = ACTIVATION(convert_char(result[b][op]), NL_M ,NL_N);
 162         }
 163         char_result[b] = as_int(char_res);
 164     }
 165     const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
 166     intel_sub_group_block_write4((__global uint*)(output + output_pos), as_uint4(char_result));
 167 }
 168
 169 #undef INIT_VAL