inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/detection_output_common.cl

   1 // Copyright (c) 2018 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #define PRIOR_BOX_SIZE 4 // Each prior-box consists of [xmin, ymin, xmax, ymax].
  16 #define OUTPUT_ROW_SIZE 7 // Each detection consists of [image_id, label, confidence, xmin, ymin, xmax, ymax].
  17
  18 #define CODE_TYPE_CORNER 0
  19 #define CODE_TYPE_CENTER_SIZE 1
  20 #define CODE_TYPE_CORNER_SIZE 2
  21
  22 #define HIDDEN_CLASS ((BACKGROUND_LABEL_ID == 0 && SHARE_LOCATION)?  1 : 0)
  23 #define NUM_OF_IMAGES INPUT0_BATCH_NUM
  24 #define NUM_LOC_CLASSES ((SHARE_LOCATION)? 1 : NUM_CLASSES)
  25 #define NUM_CLASSES_OUT ((HIDDEN_CLASS == 1)? NUM_CLASSES - 1 : NUM_CLASSES)
  26 #define NUM_OF_PRIORS (INPUT0_LENGTH / (NUM_OF_IMAGES * NUM_LOC_CLASSES * PRIOR_BOX_SIZE))
  27 #define NUM_OF_ITEMS ((NUM_OF_PRIORS / 256) + 1)
  28 #define NUM_OF_ITERATIONS ((NUM_OF_PRIORS % NUM_OF_ITEMS == 0)? (NUM_OF_PRIORS / NUM_OF_ITEMS) : ((NUM_OF_PRIORS / NUM_OF_ITEMS) + 1))
  29
  30 #define X_SIZE INPUT0_Y_PITCH
  31 #define Y_SIZE (INPUT0_FEATURE_PITCH/INPUT0_Y_PITCH)
  32 #define LOCATION_PADDING (INPUT0_PAD_BEFORE_SIZE_Y * X_SIZE + INPUT0_PAD_BEFORE_SIZE_X)
  33 #define LOC_XY_SIZE_PRODUCT (X_SIZE * Y_SIZE)
  34 #define CONF_PADDING (CONF_PADDING_Y * CONF_SIZE_X + CONF_PADDING_X)
  35 #define CONF_XY_SIZE_PRODUCT (CONF_SIZE_X * CONF_SIZE_Y)
  36
  37 #define NUM_OF_PRIOR_COMPONENTS (NUM_OF_PRIORS * PRIOR_BOX_SIZE)
  38 #define NUM_OF_IMAGE_CONF (INPUT0_LENGTH/NUM_OF_IMAGES/PRIOR_BOX_SIZE)
  39
  40 #define SCORES_COUNT (((TOP_K != -1) && (TOP_K < NUM_OF_PRIORS))? TOP_K : NUM_OF_PRIORS)
  41
  42 #define OUTPUT_OFFSET (((NUM_OF_IMAGES + 15) / 16) * 16)
  43 #define SCORE_OFFSET 2
  44
  45 #define INPUT_OFFSET (((NUM_IMAGES + 15) / 16) * 16)
  46 #define INPUT_BBOXES_COUNT ((INPUT0_LENGTH - INPUT_OFFSET) / OUTPUT_ROW_SIZE)
  47 #define NUM_CLASSES_IN NUM_CLASSES_OUT
  48 #define BBOXES_NUM_BASED_TOP_K (TOP_K * NUM_CLASSES_IN * NUM_IMAGES)
  49 #define INPUT_BBOXES_LENGTH (((TOP_K != -1) && (BBOXES_NUM_BASED_TOP_K < INPUT_BBOXES_COUNT))? BBOXES_NUM_BASED_TOP_K : INPUT_BBOXES_COUNT)
  50 #define NUM_OF_CLASS_BBOXES (INPUT_BBOXES_LENGTH / (NUM_IMAGES * NUM_CLASSES_IN))
  51 #define NUM_OF_IMAGE_BBOXES (INPUT_BBOXES_LENGTH / NUM_IMAGES)
  52 #define NUM_OF_ITEMS_SORT ((NUM_CLASSES_IN / 256) + 1)
  53
  54
  55 // Number of bboxes to keep in output
  56 #define KEEP_BBOXES_NUM ((KEEP_TOP_K < NUM_OF_IMAGE_BBOXES)? KEEP_TOP_K : NUM_OF_IMAGE_BBOXES)
  57
  58 void FUNC(get_decoded_bbox)(UNIT_TYPE* decoded_bbox, __global UNIT_TYPE* input_location, __global UNIT_TYPE* input_prior_box, const uint idx_prior, const uint idx_class, const uint idx_image)
  59 {
  60     const uint prior_offset = idx_prior * PRIOR_INFO_SIZE + PRIOR_COORD_OFFSET;
  61     uint location_offset =
  62         (NUM_LOC_CLASSES * (idx_prior * PRIOR_BOX_SIZE) + idx_image * INPUT0_FEATURE_NUM + idx_class * PRIOR_BOX_SIZE) *
  63         LOC_XY_SIZE_PRODUCT +
  64         LOCATION_PADDING;
  65
  66     UNIT_TYPE prior_bboxes[4] = {
  67         input_prior_box[prior_offset],
  68         input_prior_box[prior_offset + 1],
  69         input_prior_box[prior_offset + 2],
  70         input_prior_box[prior_offset + 3]};
  71
  72     if (!PRIOR_IS_NORMALIZED)
  73     {
  74         prior_bboxes[0] /= IMAGE_WIDTH;
  75         prior_bboxes[1] /= IMAGE_HEIGH;
  76         prior_bboxes[2] /= IMAGE_WIDTH;
  77         prior_bboxes[3] /= IMAGE_HEIGH;
  78     }
  79
  80     if (CODE_TYPE == CODE_TYPE_CORNER)
  81     {
  82         if (VARIANCE_ENCODED_IN_TARGET)
  83         {
  84             // variance is encoded in target, we simply need to add the offset predictions.
  85             for(uint i = 0; i < PRIOR_BOX_SIZE; i++)
  86             {
  87                 decoded_bbox[i] =
  88                     prior_bboxes[i] +
  89                     input_location[location_offset];
  90
  91                 location_offset += LOC_XY_SIZE_PRODUCT;
  92             }
  93         }
  94         else
  95         {
  96             // variance is encoded in bbox, we need to scale the offset accordingly.
  97             for(uint i = 0; i < PRIOR_BOX_SIZE; i++)
  98             {
  99                 decoded_bbox[i] =
 100                     mad(input_prior_box[NUM_OF_PRIOR_COMPONENTS + i], // prior variances are places after prior bboxes
 101                         input_location[location_offset],
 102                         prior_bboxes[i]);
 103
 104                 location_offset += LOC_XY_SIZE_PRODUCT;
 105             }
 106         }
 107     }
 108     else if (CODE_TYPE == CODE_TYPE_CENTER_SIZE)
 109     {
 110         const UNIT_TYPE prior_width = prior_bboxes[2] - prior_bboxes[0];
 111         const UNIT_TYPE prior_height = prior_bboxes[3] - prior_bboxes[1];
 112         const UNIT_TYPE prior_center_x = (prior_bboxes[0] + prior_bboxes[2]) / 2;
 113         const UNIT_TYPE prior_center_y = (prior_bboxes[1] + prior_bboxes[3]) / 2;
 114         const UNIT_TYPE bbox_xmin = input_location[location_offset];
 115         const UNIT_TYPE bbox_ymin = input_location[location_offset + LOC_XY_SIZE_PRODUCT];
 116         const UNIT_TYPE bbox_xmax = input_location[location_offset + 2 * LOC_XY_SIZE_PRODUCT];
 117         const UNIT_TYPE bbox_ymax = input_location[location_offset + 3 * LOC_XY_SIZE_PRODUCT];
 118         UNIT_TYPE decode_bbox_center_x, decode_bbox_center_y;
 119         UNIT_TYPE decode_bbox_width, decode_bbox_height;
 120
 121         if (VARIANCE_ENCODED_IN_TARGET)
 122         {
 123             // variance is encoded in target, we simply need to restore the offset predictions.
 124             decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
 125             decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
 126             decode_bbox_width = (exp(bbox_xmax) * prior_width) / 2;
 127             decode_bbox_height = (exp(bbox_ymax) * prior_height) / 2;
 128         }
 129         else
 130         {
 131             // variance is encoded in bbox, we need to scale the offset accordingly.
 132             decode_bbox_center_x = input_prior_box[NUM_OF_PRIOR_COMPONENTS] * bbox_xmin * prior_width + prior_center_x;
 133             decode_bbox_center_y = input_prior_box[NUM_OF_PRIOR_COMPONENTS + 1] * bbox_ymin * prior_height + prior_center_y;
 134             decode_bbox_width = (exp(input_prior_box[NUM_OF_PRIOR_COMPONENTS + 2] * bbox_xmax) * prior_width) / 2;
 135             decode_bbox_height = (exp(input_prior_box[NUM_OF_PRIOR_COMPONENTS + 3] * bbox_ymax) * prior_height) / 2;
 136         }
 137
 138         decoded_bbox[0] = decode_bbox_center_x - decode_bbox_width;
 139         decoded_bbox[1] = decode_bbox_center_y - decode_bbox_height;
 140         decoded_bbox[2] = decode_bbox_center_x + decode_bbox_width;
 141         decoded_bbox[3] = decode_bbox_center_y + decode_bbox_height;
 142     }
 143     else
 144     {
 145         const UNIT_TYPE prior_width = prior_bboxes[2] - prior_bboxes[0];
 146         const UNIT_TYPE prior_height = prior_bboxes[3] - prior_bboxes[1];
 147         const UNIT_TYPE bbox_xmin = input_location[location_offset];
 148         const UNIT_TYPE bbox_ymin = input_location[location_offset + LOC_XY_SIZE_PRODUCT];
 149         const UNIT_TYPE bbox_xmax = input_location[location_offset + 2 * LOC_XY_SIZE_PRODUCT];
 150         const UNIT_TYPE bbox_ymax = input_location[location_offset + 3 * LOC_XY_SIZE_PRODUCT];
 151
 152         if (VARIANCE_ENCODED_IN_TARGET)
 153         {
 154             // variance is encoded in target, we simply need to add the offset predictions.
 155             decoded_bbox[0] = prior_bboxes[0] + bbox_xmin * prior_width;
 156             decoded_bbox[1] = prior_bboxes[1] + bbox_ymin * prior_height;
 157             decoded_bbox[2] = prior_bboxes[2] + bbox_xmax * prior_width;
 158             decoded_bbox[3] = prior_bboxes[3] + bbox_ymax * prior_height;
 159         }
 160         else
 161         {
 162             // variance is encoded in bbox, we need to scale the offset accordingly.
 163             decoded_bbox[0] = prior_bboxes[0] + input_prior_box[NUM_OF_PRIOR_COMPONENTS] * bbox_xmin * prior_width;
 164             decoded_bbox[1] = prior_bboxes[1] + input_prior_box[NUM_OF_PRIOR_COMPONENTS + 1] * bbox_ymin * prior_height;
 165             decoded_bbox[2] = prior_bboxes[2] + input_prior_box[NUM_OF_PRIOR_COMPONENTS + 2] * bbox_xmax * prior_width;
 166             decoded_bbox[3] = prior_bboxes[3] + input_prior_box[NUM_OF_PRIOR_COMPONENTS + 3] * bbox_ymax * prior_height;
 167         }
 168     }
 169 }
 170
 171 UNIT_TYPE FUNC(get_score)(__global UNIT_TYPE* input_confidence, const uint idx_prior, const uint idx_class, const uint idx_image)
 172 {
 173     const uint confidence_offset =                    // offset in kernel input 'input_confidence'
 174             (idx_prior * NUM_CLASSES + idx_image * NUM_OF_PRIORS * NUM_CLASSES + idx_class) *
 175             CONF_XY_SIZE_PRODUCT +
 176             CONF_PADDING;
 177
 178     return (input_confidence[confidence_offset] > CONFIDENCE_THRESHOLD)? input_confidence[confidence_offset] : 0;
 179 }
 180