inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/concatenation_gpu_depth_bfyx_no_pitch.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17
  18 //
  19 // In this kernel we are processing "fyx" as flatten 1D "elements".
  20 // As long as we can we use block read/write.
  21 // For last SIMD in which we have to write only partial data we use normal read/write to buffer.
  22 //
  23
  24 // must be 8 as long as we use block_read8/write8
  25 #define ELEMENTS_PER_WORK_ITEM 8
  26 #define WORK_GROUP_SIZE 16
  27 #define INPUT0_ELEMENTS_COUNT (INPUT0_LENGTH/INPUT0_BATCH_NUM)
  28
  29 #if FP16_UNIT_USED
  30     #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
  31     #define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (byte_offset), as_ushort8(val))
  32 #else
  33     #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
  34     #define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (byte_offset), as_uint8(val))
  35 #endif
  36
  37 __attribute__((reqd_work_group_size(1, WORK_GROUP_SIZE, 1)))
  38 __attribute__((intel_reqd_sub_group_size(WORK_GROUP_SIZE)))
  39 KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __global UNIT_TYPE* output, uint output_offset_in_concat_axis)
  40 {
  41     const uint batch_id = get_group_id(0);
  42
  43     // Which pack of 16*8 elements we are processing.
  44     uint element_group_id = get_group_id(1);
  45     uint element_offset = (uint)get_global_id(1) * ELEMENTS_PER_WORK_ITEM;
  46
  47     const uint element_group_offset = element_group_id * WORK_GROUP_SIZE * ELEMENTS_PER_WORK_ITEM;
  48
  49     const uint input_offset = INPUT0_OFFSET + element_group_offset + batch_id * INPUT0_BATCH_PITCH;
  50     const uint output_batch_offset = batch_id * OUTPUT_BATCH_PITCH;
  51     const uint output_offset = OUTPUT_OFFSET + element_group_offset + output_batch_offset + output_offset_in_concat_axis*OUTPUT_PITCHES[CONCAT_AXIS_INDEX];
  52
  53     //Check if current group in batch starts from 16-byte aligned pos. If not then move block read to 16-byte aligned position.
  54     //Requirement for intel_sub_group_block_write8.
  55     uint align_offset = 0;
  56     const uint group_start_pos = output_offset;
  57     if(group_start_pos % WORK_GROUP_SIZE != 0)
  58     {
  59         uint next_aligned_pos = group_start_pos / WORK_GROUP_SIZE * WORK_GROUP_SIZE + WORK_GROUP_SIZE;
  60         align_offset = next_aligned_pos - group_start_pos;
  61     }
  62
  63     if(element_group_offset + align_offset + WORK_GROUP_SIZE * ELEMENTS_PER_WORK_ITEM < INPUT0_ELEMENTS_COUNT)
  64     {
  65         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) in = ALIGNED_BLOCK_READ8(input, input_offset + align_offset);
  66         ALIGNED_BLOCK_WRITE8(output, output_offset + align_offset, ACTIVATION(in, NL_M, NL_N));
  67
  68         //Fill the values that were missed upon adding align_offset
  69         if((align_offset != 0) && (element_offset + output_batch_offset < group_start_pos + align_offset))
  70         {
  71             for(uint i = 0; i < align_offset; i++)
  72                 output[output_offset + i] = ACTIVATION(input[input_offset + i], NL_M, NL_N);
  73         }
  74     }
  75     else
  76     {
  77         // This is the last SIMD that needs to write only partial data.
  78         uint element_offset_in_workitem = element_offset - element_group_offset;
  79         for(uint i = 0; i < ELEMENTS_PER_WORK_ITEM; i++)
  80         {
  81             if(element_offset + i >= INPUT0_ELEMENTS_COUNT)
  82                 return;
  83
  84             output[output_offset + element_offset_in_workitem] = ACTIVATION(input[input_offset + element_offset_in_workitem], NL_M, NL_N);
  85             element_offset_in_workitem++;
  86         }
  87     }
  88 }
  89
  90 #undef INPUT0_ELEMENTS_COUNT
  91 #undef WORK_GROUP_SIZE
  92 #undef ELEMENTS_PER_WORK_ITEM
  93 #undef ALIGNED_BLOCK_READ8
  94 #undef ALIGNED_BLOCK_WRITE8