1 // Copyright (c) 2016-2017 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 #include "include/include_all.cl"
18 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
19 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
20 KERNEL(pooling_gpu_average_opt)(const __global float* input, __global float* output)
22 int local_id = get_local_id(0);
23 int tile_x = get_global_id(0);
24 int tile_y = get_global_id(1);
25 int channel = get_global_id(2);
27 int start_x = tile_x / SUB_GROUP_SIZE * TILE_WIDTH;
28 int offset_x = start_x + (tile_x - tile_x / SUB_GROUP_SIZE * SUB_GROUP_SIZE) % TILE_WIDTH;
29 int offset = INPUT0_SIZE_Y * INPUT0_SIZE_X * channel;
31 int start_y = tile_y * TILE_HEIGHT;
32 int end_y = min(INPUT0_SIZE_Y - 1, start_y + TILE_HEIGHT - 1);
34 // Read 3 lines of SUB_GROUP_SIZE floats.
35 // The 3 lines start one float before the current (to the left) and one line up:
36 // For example: SUB_GROUP_SIZE=16
37 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
38 // 0 X 2 3 4 5 6 7 8 9 10 11 12 13 14 15
39 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
40 // In the diagram above X represents the current work item.
42 const __global float* base_addr = input + offset + (start_y * INPUT0_SIZE_X + start_x) - 1;
44 float input_buffer[3];
45 input_buffer[0] = as_float(intel_sub_group_block_read((const __global uint*)(base_addr - INPUT0_SIZE_X)));
46 input_buffer[1] = as_float(intel_sub_group_block_read((const __global uint*)(base_addr)));
51 float res, sum, sum_1, sum_2;
53 for (int y = start_y; y <= end_y; y++)
55 base_addr += INPUT0_SIZE_X;
57 input_buffer[third] = as_float(intel_sub_group_block_read((const __global uint*)(base_addr)));
59 #if INPUT0_SIZE_Y == 1
60 sum = input_buffer[second];
64 sum = input_buffer[second] + input_buffer[third];
66 else if (y == INPUT0_SIZE_Y - 1)
68 sum = input_buffer[first] + input_buffer[second];
72 sum = input_buffer[first] + input_buffer[second] + input_buffer[third];
76 sum_1 = intel_sub_group_shuffle_down(sum, 0.f, 1);
77 sum_2 = intel_sub_group_shuffle_down(sum, 0.f, 2);
79 #if INPUT0_SIZE_X == 1
80 res = sum_1 * ONE_OVER_POOL_SIZE;
84 res = (sum_1 + sum_2) * ONE_OVER_POOL_SIZE;
86 else if (offset_x == INPUT0_SIZE_X - 1)
88 res = (sum + sum_1) * ONE_OVER_POOL_SIZE;
92 res = (sum + sum_1 + sum_2) * ONE_OVER_POOL_SIZE;
96 if ((local_id < TILE_WIDTH) && (offset_x < INPUT0_SIZE_X))
98 output[offset + y * INPUT0_SIZE_X + offset_x] = ACTIVATION(res, NL_M ,NL_N);
101 first = (first + 1) % 3;
102 second = (second + 1) % 3;
103 third = (third + 1) % 3;