1 // Copyright (c) 2016-2017 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 #include "include/include_all.cl"
19 // In this kernel we are processing "fyx" as flatten 1D "elements".
20 // As long as we can we use block read/write.
21 // For last SIMD in which we have to write only partial data we use normal read/write to buffer.
24 // must be 8 as long as we use block_read8/write8
25 #define ELEMENTS_PER_WORK_ITEM 8
26 #define WORK_GROUP_SIZE 16
27 #define INPUT0_ELEMENTS_COUNT (INPUT0_LENGTH/INPUT0_BATCH_NUM)
30 #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
31 #define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (byte_offset), as_ushort8(val))
33 #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
34 #define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (byte_offset), as_uint8(val))
37 __attribute__((reqd_work_group_size(1, WORK_GROUP_SIZE, 1)))
38 __attribute__((intel_reqd_sub_group_size(WORK_GROUP_SIZE)))
39 KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __global UNIT_TYPE* output, uint output_offset_in_concat_axis)
41 const uint batch_id = get_group_id(0);
43 // Which pack of 16*8 elements we are processing.
44 uint element_group_id = get_group_id(1);
45 uint element_offset = (uint)get_global_id(1) * ELEMENTS_PER_WORK_ITEM;
47 const uint element_group_offset = element_group_id * WORK_GROUP_SIZE * ELEMENTS_PER_WORK_ITEM;
49 const uint input_offset = INPUT0_OFFSET + element_group_offset + batch_id * INPUT0_BATCH_PITCH;
50 const uint output_batch_offset = batch_id * OUTPUT_BATCH_PITCH;
51 const uint output_offset = OUTPUT_OFFSET + element_group_offset + output_batch_offset + output_offset_in_concat_axis*OUTPUT_PITCHES[CONCAT_AXIS_INDEX];
53 //Check if current group in batch starts from 16-byte aligned pos. If not then move block read to 16-byte aligned position.
54 //Requirement for intel_sub_group_block_write8.
55 uint align_offset = 0;
56 const uint group_start_pos = output_offset;
57 if(group_start_pos % WORK_GROUP_SIZE != 0)
59 uint next_aligned_pos = group_start_pos / WORK_GROUP_SIZE * WORK_GROUP_SIZE + WORK_GROUP_SIZE;
60 align_offset = next_aligned_pos - group_start_pos;
63 if(element_group_offset + align_offset + WORK_GROUP_SIZE * ELEMENTS_PER_WORK_ITEM < INPUT0_ELEMENTS_COUNT)
65 MAKE_VECTOR_TYPE(UNIT_TYPE, 8) in = ALIGNED_BLOCK_READ8(input, input_offset + align_offset);
66 ALIGNED_BLOCK_WRITE8(output, output_offset + align_offset, ACTIVATION(in, NL_M, NL_N));
68 //Fill the values that were missed upon adding align_offset
69 if((align_offset != 0) && (element_offset + output_batch_offset < group_start_pos + align_offset))
71 for(uint i = 0; i < align_offset; i++)
72 output[output_offset + i] = ACTIVATION(input[input_offset + i], NL_M, NL_N);
77 // This is the last SIMD that needs to write only partial data.
78 uint element_offset_in_workitem = element_offset - element_group_offset;
79 for(uint i = 0; i < ELEMENTS_PER_WORK_ITEM; i++)
81 if(element_offset + i >= INPUT0_ELEMENTS_COUNT)
84 output[output_offset + element_offset_in_workitem] = ACTIVATION(input[input_offset + element_offset_in_workitem], NL_M, NL_N);
85 element_offset_in_workitem++;
90 #undef INPUT0_ELEMENTS_COUNT
91 #undef WORK_GROUP_SIZE
92 #undef ELEMENTS_PER_WORK_ITEM
93 #undef ALIGNED_BLOCK_READ8
94 #undef ALIGNED_BLOCK_WRITE8