inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17
  18 #ifndef DIRECTION
  19 #define DIRECTION 0
  20 #endif
  21
  22 #ifndef SIMD
  23 #define SIMD 16
  24 #endif
  25
  26 // Sums value of result across all subgroups.
  27 #define SUM_ACROSS_SUB_GROUP(val) \
  28  \
  29 { \
  30     val += intel_sub_group_shuffle(val, x+1); \
  31     val += intel_sub_group_shuffle(val, x+2); \
  32     val += intel_sub_group_shuffle(val, x+4); \
  33     val += (SIMD > 8) ? intel_sub_group_shuffle(val, x+8) : 0; \
  34     val += (SIMD > 16) ? intel_sub_group_shuffle(val, x+16) : 0; \
  35 }
  36
  37 // input     = [    batch,  sequence,               1,      input_size ]
  38 // weights   = [        1, direction, 4 * hidden_size,      input_size ]
  39 // recurrent = [        1, direction, 4 * hidden_size,     hidden_size ]
  40 // biases    = [        1,         1,       direction, 4 * hidden_size ] optional
  41 // hidden    = [    batch, direction,               1,     hidden_size ] optional
  42 // tempGEMM  = [    batch, direction,               1, 4 * hidden_size ] output
  43
  44 __attribute__((reqd_work_group_size(SIMD, 1, 1)))
  45 KERNEL(lstm_gemm)(
  46     const __global INPUT0_TYPE* input,
  47     __global OUTPUT_TYPE* output,
  48     const __global WEIGHTS_TYPE* weights
  49 #if HIDDEN_TERM
  50     , const __global OUTPUT_TYPE* hidden,
  51     const __global RECURRENT_TYPE* recurrent
  52 #endif
  53 #if BIAS_TERM
  54     , const __global BIAS_TYPE* biases
  55 #endif
  56     )
  57 {
  58     const uint x = get_local_id(0);
  59     const uint y = get_global_id(1);
  60         const int local_sz = get_local_size(0);
  61         const int weight_num_rows = get_global_size(1);
  62
  63         uint K;
  64         int start_offset;
  65         int end_offset;
  66         int matrix_offset;
  67         int vector_offset;
  68         float4 sum;
  69         float result;
  70
  71         K = INPUT0_SIZE_X;  // Width of  weight matrix
  72         start_offset = GET_DATA_INDEX(WEIGHTS, 0, DIRECTION, y, 0);  // set as the starting offset of the weight matrix
  73         end_offset = start_offset + K;
  74         matrix_offset = start_offset + (x * 4);  // Weight offset for the work item to work on
  75         vector_offset = GET_DATA_INDEX(INPUT0, 0, 0, INPUT_DIRECTION, (x*4));  // Input offset for the work item to work on
  76         sum = (float4)(0.f);
  77         result = 0;
  78         for(; matrix_offset < end_offset; matrix_offset += (local_sz * 4), vector_offset += (local_sz * 4))
  79         {
  80                 float4 mask = (float4) (1 , (matrix_offset + 1) < end_offset , (matrix_offset + 2) < end_offset , (matrix_offset + 3) < end_offset);
  81                 float4 m = (float4) (weights[matrix_offset], weights[matrix_offset + 1], weights[matrix_offset + 2], weights[matrix_offset + 3]);
  82                 m = m * mask;
  83
  84                 const float4 v = (float4) (input[vector_offset], input[vector_offset + 1], input[vector_offset + 2], input[vector_offset + 3]);
  85
  86                 sum = mad(m, v, sum);
  87         }
  88
  89         result = sum.x + sum.y + sum.z + sum.w;
  90
  91 #if HIDDEN_TERM
  92         K = HIDDEN_SIZE_X;  // width of recurrent matrix
  93         start_offset =  GET_DATA_INDEX(RECURRENT, 0, DIRECTION, y, 0);  // set as the starting offset of the recurrent matrix
  94         end_offset = start_offset + K;
  95         matrix_offset = start_offset + (x * 4);  // recurrent offset for the work item to work on
  96         vector_offset = GET_DATA_INDEX(HIDDEN, 0, 0, HIDDEN_DIRECTION, (x*4));  // hidden vector offset for the work item to work on
  97         sum = (float4)(0.f);
  98         for(; matrix_offset < end_offset; matrix_offset += (local_sz * 4), vector_offset += (local_sz * 4))
  99         {
 100                 float4 mask = (float4) (1 , (matrix_offset + 1) < end_offset , (matrix_offset + 2) < end_offset , (matrix_offset + 3) < end_offset);
 101                 float4 m = (float4) (recurrent[matrix_offset], recurrent[matrix_offset + 1], recurrent[matrix_offset + 2], recurrent[matrix_offset + 3]);
 102                 m = m * mask;
 103
 104                 const float4 v = (float4) (hidden[vector_offset], hidden[vector_offset + 1], hidden[vector_offset + 2], hidden[vector_offset + 3]);
 105
 106                 sum = mad(m, v, sum);
 107         }
 108
 109         result += sum.x + sum.y + sum.z + sum.w;
 110 #endif
 111
 112         // Add together partial sums contained in each work item's "result" variable
 113         SUM_ACROSS_SUB_GROUP(result);
 114
 115         if(x == 0)
 116         {
 117                 output[y] = (OUTPUT_TYPE)result;
 118
 119 #if BIAS_TERM
 120                 const uint bias_idx = GET_DATA_INDEX(BIAS, 0, 0, DIRECTION, y);
 121                 float bias = (ACCUMULATOR_TYPE)biases[bias_idx];
 122                 output[y] += (OUTPUT_TYPE)bias;
 123 #endif
 124         }
 125 }
 126
 127 #undef SUM_ACROSS_SUB_GROUP
 128 #undef SIMD