inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/binary_convolution_gpu_1x1.cl

   1 // Copyright (c) 2019 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17
  18 #define OC_BLOCK_SIZE 32
  19
  20 #define GET_WEI(data, id) intel_sub_group_shuffle(data, id)
  21 #define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
  22 #define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint(val))
  23 #define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
  24
  25 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
  26 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
  27 KERNEL(binary_convolution_1x1)(const __global INPUT0_TYPE* input,
  28                                      __global OUTPUT_TYPE* output,
  29                                const __global FILTER_TYPE* weights,
  30 #if HAS_FUSED_OPS_DECLS
  31                                FUSED_OPS_DECLS,
  32 #endif
  33                                uint split_idx)
  34 {
  35     const int xy = get_group_id(0);
  36     const int f_block = get_global_id(1);
  37     const int b = get_global_id(2);
  38     const int lid = get_sub_group_local_id();
  39 #if PADDED_INPUT
  40     const int x = (xy * XY_BLOCK_SIZE + lid) % OUTPUT_SIZE_X;
  41     const int y = (xy * XY_BLOCK_SIZE + lid) / OUTPUT_SIZE_X;
  42     const uint input_offset = INPUT0_OFFSET
  43                             + b*INPUT0_FEATURE_NUM_PACKED*INPUT0_FEATURE_PITCH
  44                             + y*INPUT0_Y_PITCH;
  45 #else
  46     const int x = (xy * XY_BLOCK_SIZE + lid) % OUTPUT_SIZE_X;
  47     const int y = (xy * XY_BLOCK_SIZE + lid) / OUTPUT_SIZE_X;
  48     const uint input_offset = INPUT0_OFFSET
  49                             + b*INPUT0_FEATURE_NUM_PACKED*INPUT0_FEATURE_PITCH
  50                             + xy*XY_BLOCK_SIZE;
  51 #endif
  52     typedef MAKE_VECTOR_TYPE(FILTER_TYPE, 2) wei_t;
  53
  54 #if BINARY_PACKED_OUTPUT
  55     const uint dst_index = OUTPUT_OFFSET
  56                          + b*OUTPUT_FEATURE_NUM_PACKED*OUTPUT_FEATURE_PITCH
  57                          + f_block*OUTPUT_FEATURE_PITCH;
  58 #else
  59     const uint dst_index = OUTPUT_OFFSET
  60                          + b*OUTPUT_BATCH_PITCH
  61                          + f_block*OC_BLOCK_SIZE*OUTPUT_FEATURE_PITCH;
  62 #endif
  63     const uint filter_offset = f_block*OC_BLOCK_SIZE*INPUT0_FEATURE_NUM_PACKED;
  64
  65     int dst_buf[OC_BLOCK_SIZE] = { 0 }; // 32 OC
  66
  67     for (int k = 0; k < INPUT0_FEATURE_NUM_PACKED; ++k)
  68     {
  69         // Load 16 input elements from feature map by subgroup
  70 #if PADDED_INPUT
  71         INPUT0_TYPE src = input[input_offset + k*INPUT0_FEATURE_PITCH + x];
  72 #else
  73         INPUT0_TYPE src = ALIGNED_BLOCK_READ(input, input_offset + k*INPUT0_FEATURE_PITCH);
  74 #endif
  75
  76         // Load 32 OC x 32 ICP. Each WI has lid-th and (lid+16)-th channels
  77         wei_t wei = ALIGNED_BLOCK_READ2(weights, filter_offset + k * OC_BLOCK_SIZE);
  78
  79         // Shuffle 32 OC x 32 ICP of weights in each WI
  80         const wei_t wei0  = GET_WEI(wei, 0);
  81         const wei_t wei1  = GET_WEI(wei, 1);
  82         const wei_t wei2  = GET_WEI(wei, 2);
  83         const wei_t wei3  = GET_WEI(wei, 3);
  84         const wei_t wei4  = GET_WEI(wei, 4);
  85         const wei_t wei5  = GET_WEI(wei, 5);
  86         const wei_t wei6  = GET_WEI(wei, 6);
  87         const wei_t wei7  = GET_WEI(wei, 7);
  88         const wei_t wei8  = GET_WEI(wei, 8);
  89         const wei_t wei9  = GET_WEI(wei, 9);
  90         const wei_t wei10 = GET_WEI(wei, 10);
  91         const wei_t wei11 = GET_WEI(wei, 11);
  92         const wei_t wei12 = GET_WEI(wei, 12);
  93         const wei_t wei13 = GET_WEI(wei, 13);
  94         const wei_t wei14 = GET_WEI(wei, 14);
  95         const wei_t wei15 = GET_WEI(wei, 15);
  96
  97 #if LEFTOVERS_IC
  98         if (k == INPUT0_FEATURE_NUM_PACKED - 1)
  99         {
 100             dst_buf[0]  += popcount((wei0.s0 ^ src) & FILTER_MASK);
 101             dst_buf[1]  += popcount((wei1.s0 ^ src) & FILTER_MASK);
 102             dst_buf[2]  += popcount((wei2.s0 ^ src) & FILTER_MASK);
 103             dst_buf[3]  += popcount((wei3.s0 ^ src) & FILTER_MASK);
 104             dst_buf[4]  += popcount((wei4.s0 ^ src) & FILTER_MASK);
 105             dst_buf[5]  += popcount((wei5.s0 ^ src) & FILTER_MASK);
 106             dst_buf[6]  += popcount((wei6.s0 ^ src) & FILTER_MASK);
 107             dst_buf[7]  += popcount((wei7.s0 ^ src) & FILTER_MASK);
 108             dst_buf[8]  += popcount((wei8.s0 ^ src) & FILTER_MASK);
 109             dst_buf[9]  += popcount((wei9.s0 ^ src) & FILTER_MASK);
 110             dst_buf[10] += popcount((wei10.s0 ^ src) & FILTER_MASK);
 111             dst_buf[11] += popcount((wei11.s0 ^ src) & FILTER_MASK);
 112             dst_buf[12] += popcount((wei12.s0 ^ src) & FILTER_MASK);
 113             dst_buf[13] += popcount((wei13.s0 ^ src) & FILTER_MASK);
 114             dst_buf[14] += popcount((wei14.s0 ^ src) & FILTER_MASK);
 115             dst_buf[15] += popcount((wei15.s0 ^ src) & FILTER_MASK);
 116
 117 #if OUTPUT_FEATURE_NUM > 16
 118             dst_buf[16] += popcount((wei0.s1 ^ src) & FILTER_MASK);
 119             dst_buf[17] += popcount((wei1.s1 ^ src) & FILTER_MASK);
 120             dst_buf[18] += popcount((wei2.s1 ^ src) & FILTER_MASK);
 121             dst_buf[19] += popcount((wei3.s1 ^ src) & FILTER_MASK);
 122             dst_buf[20] += popcount((wei4.s1 ^ src) & FILTER_MASK);
 123             dst_buf[21] += popcount((wei5.s1 ^ src) & FILTER_MASK);
 124             dst_buf[22] += popcount((wei6.s1 ^ src) & FILTER_MASK);
 125             dst_buf[23] += popcount((wei7.s1 ^ src) & FILTER_MASK);
 126             dst_buf[24] += popcount((wei8.s1 ^ src) & FILTER_MASK);
 127             dst_buf[25] += popcount((wei9.s1 ^ src) & FILTER_MASK);
 128             dst_buf[26] += popcount((wei10.s1 ^ src) & FILTER_MASK);
 129             dst_buf[27] += popcount((wei11.s1 ^ src) & FILTER_MASK);
 130             dst_buf[28] += popcount((wei12.s1 ^ src) & FILTER_MASK);
 131             dst_buf[29] += popcount((wei13.s1 ^ src) & FILTER_MASK);
 132             dst_buf[30] += popcount((wei14.s1 ^ src) & FILTER_MASK);
 133             dst_buf[31] += popcount((wei15.s1 ^ src) & FILTER_MASK);
 134 #endif
 135             break;
 136         }
 137 #endif
 138         dst_buf[0]  += popcount(wei0.s0 ^ src);
 139         dst_buf[1]  += popcount(wei1.s0 ^ src);
 140         dst_buf[2]  += popcount(wei2.s0 ^ src);
 141         dst_buf[3]  += popcount(wei3.s0 ^ src);
 142         dst_buf[4]  += popcount(wei4.s0 ^ src);
 143         dst_buf[5]  += popcount(wei5.s0 ^ src);
 144         dst_buf[6]  += popcount(wei6.s0 ^ src);
 145         dst_buf[7]  += popcount(wei7.s0 ^ src);
 146         dst_buf[8]  += popcount(wei8.s0 ^ src);
 147         dst_buf[9]  += popcount(wei9.s0 ^ src);
 148         dst_buf[10] += popcount(wei10.s0 ^ src);
 149         dst_buf[11] += popcount(wei11.s0 ^ src);
 150         dst_buf[12] += popcount(wei12.s0 ^ src);
 151         dst_buf[13] += popcount(wei13.s0 ^ src);
 152         dst_buf[14] += popcount(wei14.s0 ^ src);
 153         dst_buf[15] += popcount(wei15.s0 ^ src);
 154
 155 #if OUTPUT_FEATURE_NUM > 16
 156         dst_buf[16] += popcount(wei0.s1 ^ src);
 157         dst_buf[17] += popcount(wei1.s1 ^ src);
 158         dst_buf[18] += popcount(wei2.s1 ^ src);
 159         dst_buf[19] += popcount(wei3.s1 ^ src);
 160         dst_buf[20] += popcount(wei4.s1 ^ src);
 161         dst_buf[21] += popcount(wei5.s1 ^ src);
 162         dst_buf[22] += popcount(wei6.s1 ^ src);
 163         dst_buf[23] += popcount(wei7.s1 ^ src);
 164         dst_buf[24] += popcount(wei8.s1 ^ src);
 165         dst_buf[25] += popcount(wei9.s1 ^ src);
 166         dst_buf[26] += popcount(wei10.s1 ^ src);
 167         dst_buf[27] += popcount(wei11.s1 ^ src);
 168         dst_buf[28] += popcount(wei12.s1 ^ src);
 169         dst_buf[29] += popcount(wei13.s1 ^ src);
 170         dst_buf[30] += popcount(wei14.s1 ^ src);
 171         dst_buf[31] += popcount(wei15.s1 ^ src);
 172 #endif
 173     }
 174
 175     // Load data for fused operations (scales, biases, quantization thresholds, etc)
 176 #if CUSTOM_FUSED_OPS
 177     FUSED_OPS_PREPARE_DATA;
 178 #endif
 179
 180     UNIT_TYPE dst[OC_BLOCK_SIZE];
 181     for (int oc = 0; oc < OC_BLOCK_SIZE; oc++)
 182     {
 183         CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM - 2*dst_buf[oc]);
 184 #if CUSTOM_FUSED_OPS
 185         DO_ELTWISE_FUSED_OPS;
 186 // Don't save floating-point intermediate result, since packed one is already computed
 187 #if !BINARY_PACKED_OUTPUT
 188         dst[oc] = res;
 189 #endif
 190 #elif HAS_FUSED_OPS
 191         FUSED_OPS;
 192         dst[oc] = FINAL_NAME;
 193 #endif
 194
 195     }
 196
 197     bool in_x = x < OUTPUT_SIZE_X;
 198     bool in_y = y < OUTPUT_SIZE_Y;
 199 #if BINARY_PACKED_OUTPUT
 200
 201 #if PADDED_OUTPUT
 202     if (in_x && in_y)
 203         output[dst_index + y*OUTPUT_Y_PITCH + x] = TO_OUTPUT_TYPE(packed_res);
 204 #else
 205     if (xy * XY_BLOCK_SIZE < OUTPUT_SIZE_X*OUTPUT_SIZE_Y)
 206         ALIGNED_BLOCK_WRITE(output, dst_index + xy*XY_BLOCK_SIZE, TO_OUTPUT_TYPE(packed_res));
 207     else if (in_x && in_y)
 208         output[dst_index + y*OUTPUT_Y_PITCH + x] = TO_OUTPUT_TYPE(packed_res);
 209
 210 #endif
 211
 212 #else
 213
 214     for (int oc = 0; oc < OC_BLOCK_SIZE; oc++)
 215     {
 216         bool in_fm = f_block*OC_BLOCK_SIZE + oc < OUTPUT_FEATURE_NUM;
 217         if (in_x && in_y && in_fm)
 218             output[dst_index + oc*OUTPUT_FEATURE_PITCH + y*OUTPUT_Y_PITCH + x] = TO_OUTPUT_TYPE(dst[oc]);
 219     }
 220
 221 #endif
 222
 223 }