inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_average_opt.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17
  18 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
  19 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
  20 KERNEL(pooling_gpu_average_opt)(const __global float* input, __global float* output)
  21 {
  22     int local_id = get_local_id(0);
  23     int tile_x = get_global_id(0);
  24     int tile_y = get_global_id(1);
  25     int channel = get_global_id(2);
  26
  27     int start_x = tile_x / SUB_GROUP_SIZE * TILE_WIDTH;
  28     int offset_x = start_x + (tile_x - tile_x / SUB_GROUP_SIZE * SUB_GROUP_SIZE) % TILE_WIDTH;
  29     int offset = INPUT0_SIZE_Y * INPUT0_SIZE_X * channel;
  30
  31     int start_y = tile_y * TILE_HEIGHT;
  32     int end_y = min(INPUT0_SIZE_Y - 1, start_y + TILE_HEIGHT - 1);
  33
  34     // Read 3 lines of SUB_GROUP_SIZE floats.
  35     // The 3 lines start one float before the current (to the left) and one line up:
  36     // For example: SUB_GROUP_SIZE=16
  37     // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  38     // 0 X 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  39     // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  40     // In the diagram above X represents the current work item.
  41
  42     const __global float* base_addr = input + offset + (start_y * INPUT0_SIZE_X + start_x) - 1;
  43
  44     float input_buffer[3];
  45     input_buffer[0] = as_float(intel_sub_group_block_read((const __global uint*)(base_addr - INPUT0_SIZE_X)));
  46     input_buffer[1] = as_float(intel_sub_group_block_read((const __global uint*)(base_addr)));
  47
  48     int first = 0;
  49     int second = 1;
  50     int third = 2;
  51     float res, sum, sum_1, sum_2;
  52
  53     for (int y = start_y; y <= end_y; y++)
  54     {
  55         base_addr += INPUT0_SIZE_X;
  56
  57         input_buffer[third] = as_float(intel_sub_group_block_read((const __global uint*)(base_addr)));
  58
  59 #if INPUT0_SIZE_Y == 1
  60         sum = input_buffer[second];
  61 #else
  62         if (y == 0)
  63         {
  64             sum = input_buffer[second] + input_buffer[third];
  65         }
  66         else if (y == INPUT0_SIZE_Y - 1)
  67         {
  68             sum = input_buffer[first] + input_buffer[second];
  69         }
  70         else
  71         {
  72             sum = input_buffer[first] + input_buffer[second] + input_buffer[third];
  73         }
  74 #endif
  75
  76         sum_1 = intel_sub_group_shuffle_down(sum, 0.f, 1);
  77         sum_2 = intel_sub_group_shuffle_down(sum, 0.f, 2);
  78
  79 #if INPUT0_SIZE_X == 1
  80         res = sum_1 * ONE_OVER_POOL_SIZE;
  81 #else
  82         if (offset_x == 0)
  83         {
  84             res = (sum_1 + sum_2) * ONE_OVER_POOL_SIZE;
  85         }
  86         else if (offset_x == INPUT0_SIZE_X - 1)
  87         {
  88             res = (sum + sum_1) * ONE_OVER_POOL_SIZE;
  89         }
  90         else
  91         {
  92             res = (sum + sum_1 + sum_2) * ONE_OVER_POOL_SIZE;
  93         }
  94 #endif
  95
  96         if ((local_id < TILE_WIDTH) && (offset_x < INPUT0_SIZE_X))
  97         {
  98             output[offset + y * INPUT0_SIZE_X + offset_x] = ACTIVATION(res, NL_M ,NL_N);
  99         }
 100
 101         first = (first + 1) % 3;
 102         second = (second + 1) % 3;
 103         third = (third + 1) % 3;
 104     }
 105
 106 }