inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output.cl

   1 // Copyright (c) 2018 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17 #include "include/detection_output_common.cl"
  18
  19 KERNEL (detection_output)(__global UNIT_TYPE* input_location, __global UNIT_TYPE* output, __global UNIT_TYPE* input_confidence, __global UNIT_TYPE* input_prior_box)
  20 {
  21     const uint idx = get_global_id(0);              // bbox idx
  22     const uint local_id = get_local_id(0) * NUM_OF_ITEMS; // All bboxes from one image in work group
  23     const uint idx_image = idx / NUM_OF_ITERATIONS;  // idx of current image
  24
  25     __local uint indexes[NUM_OF_PRIORS];
  26     __local uint scores_size[NUM_CLASSES * NUM_OF_IMAGES];
  27     __local bool stillSorting;
  28
  29     uint indexes_class_0[NUM_OF_PRIORS];
  30
  31     int last_bbox_in_class = NUM_OF_ITEMS;
  32     bool is_last_bbox_in_class = false;
  33     for (uint it = 0; it < NUM_OF_ITEMS; it ++)
  34     {
  35         if (((local_id + it + 1) % NUM_OF_PRIORS) == 0 )
  36         {
  37             last_bbox_in_class = it;
  38             is_last_bbox_in_class = true;
  39             break;
  40         }
  41     }
  42
  43     for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++)
  44     {
  45         if (idx_class == BACKGROUND_LABEL_ID)
  46         {
  47             continue;
  48         }
  49
  50         for (uint it = 0;  it < NUM_OF_ITEMS; it++)
  51         {
  52             indexes[local_id + it] = local_id + it;
  53         }
  54
  55         stillSorting = true;
  56         barrier(CLK_LOCAL_MEM_FENCE);
  57
  58         bool is_last_bbox_in_image = (is_last_bbox_in_class) && (idx_class == (NUM_CLASSES - 1));
  59
  60         while(stillSorting)
  61         {
  62             barrier(CLK_LOCAL_MEM_FENCE);
  63             stillSorting = false;
  64
  65             for (uint i = 0; i < 2; i++)
  66             {
  67                 for (uint it = 0; it < NUM_OF_ITEMS; it++)
  68                 {
  69                     uint item_id = local_id + it;
  70
  71                     uint idx1 = indexes[item_id];
  72                     uint idx2 = indexes[item_id+1];
  73                     bool perform = false;
  74                     if ((((i % 2) && (item_id % 2)) ||
  75                         ((!(i % 2)) && (!(item_id % 2)))) &&
  76                         (it < last_bbox_in_class))
  77                     {
  78                         perform = true;
  79                     }
  80
  81                     if (perform &&
  82                         (FUNC_CALL(get_score)(input_confidence, idx1, idx_class, idx_image) <
  83                          FUNC_CALL(get_score)(input_confidence, idx2, idx_class, idx_image)))
  84                     {
  85                         indexes[item_id] = idx2;
  86                         indexes[item_id+1] = idx1;
  87                         stillSorting = true;
  88                     }
  89                     barrier(CLK_LOCAL_MEM_FENCE);
  90                 }
  91             }
  92         }
  93
  94         // Do it only once per class in image
  95         if (is_last_bbox_in_class)
  96         {
  97             UNIT_TYPE adaptive_threshold = NMS_THRESHOLD;
  98             uint post_nms_count = 0;
  99             const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
 100             scores_size[idx_class] = 0;
 101
 102             // Do the "keep" algorithm only for classes with confidence greater than CONFIDENCE_THRESHOLD.
 103             // Check first, the biggest one (after sort) element in class.
 104             if (FUNC_CALL(get_score)(input_confidence, indexes[0], idx_class, idx_image) != 0.0f)
 105             {
 106                 for (uint i = 0; i < SCORES_COUNT; i++)
 107                 {
 108                     const uint bb_idx = indexes[i];
 109                     bool keep = true;
 110                     for (uint j = 0; j < post_nms_count; j++)
 111                     {
 112                         if (!keep)
 113                         {
 114                             break;
 115                         }
 116
 117                         UNIT_TYPE overlap = 0.0;
 118                         const uint bb_idx2 = indexes[j];
 119
 120                         UNIT_TYPE decoded_bbox1[4];
 121                         FUNC_CALL(get_decoded_bbox)(decoded_bbox1, input_location, input_prior_box, bb_idx, shared_class, idx_image);
 122                         UNIT_TYPE decoded_bbox2[4];
 123                         FUNC_CALL(get_decoded_bbox)(decoded_bbox2, input_location, input_prior_box, bb_idx2, shared_class, idx_image);
 124                         bool intersecting =
 125                             (decoded_bbox1[0] < decoded_bbox2[2]) &
 126                             (decoded_bbox2[0] < decoded_bbox1[2]) &
 127                             (decoded_bbox1[1] < decoded_bbox2[3]) &
 128                             (decoded_bbox2[1] < decoded_bbox1[3]);
 129
 130                         if (intersecting)
 131                         {
 132                             const UNIT_TYPE intersect_width = min(decoded_bbox1[2], decoded_bbox2[2]) - max(decoded_bbox1[0], decoded_bbox2[0]);
 133                             const UNIT_TYPE intersect_height = min(decoded_bbox1[3], decoded_bbox2[3]) - max(decoded_bbox1[1], decoded_bbox2[1]);
 134                             const UNIT_TYPE intersect_size = intersect_width * intersect_height;
 135                             const UNIT_TYPE bbox1_area = (decoded_bbox1[2] - decoded_bbox1[0]) * (decoded_bbox1[3] - decoded_bbox1[1]);
 136                             const UNIT_TYPE bbox2_area = (decoded_bbox2[2] - decoded_bbox2[0]) * (decoded_bbox2[3] - decoded_bbox2[1]);
 137                             overlap = intersect_size / (bbox1_area + bbox2_area - intersect_size);
 138                         }
 139                         keep = (overlap <= adaptive_threshold);
 140                     }
 141                     if (keep)
 142                     {
 143                         indexes[post_nms_count] = indexes[i];
 144                         ++post_nms_count;
 145                     }
 146                     if ((keep) && (ETA < 1) && (adaptive_threshold > 0.5))
 147                     {
 148                         adaptive_threshold *= ETA;
 149                     }
 150                 }
 151             }
 152             // Write number of scores to global memory, for proper output order in separated work groups
 153             scores_size[idx_class] = post_nms_count;
 154         }
 155
 156         stillSorting = true;
 157         // Wait for scores number from all classes in images
 158         barrier(CLK_LOCAL_MEM_FENCE);
 159
 160         uint output_offset = (idx_image * NUM_CLASSES_OUT + idx_class - HIDDEN_CLASS) * SCORES_COUNT;
 161
 162         for (uint it = 0; it < NUM_OF_ITEMS; it++)
 163         {
 164             const uint local_id_out = local_id + it;
 165
 166             if (local_id_out < scores_size[idx_class])
 167             {
 168                 const uint score_idx = indexes[local_id_out];
 169                 uint bb_idx = indexes[local_id_out];
 170                 const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
 171                 UNIT_TYPE decoded_bbox[4];
 172                 FUNC_CALL(get_decoded_bbox)(decoded_bbox, input_location, input_prior_box, bb_idx, shared_class, idx_image);
 173
 174                 const uint out_idx = (local_id_out + output_offset) * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
 175                 output[out_idx] = TO_UNIT_TYPE(idx_image);
 176                 output[out_idx + 1] = TO_UNIT_TYPE(idx_class);
 177                 output[out_idx + 2] = FUNC_CALL(get_score)(input_confidence, score_idx, idx_class, idx_image);
 178                 output[out_idx + 3] = decoded_bbox[0];
 179                 output[out_idx + 4] = decoded_bbox[1];
 180                 output[out_idx + 5] = decoded_bbox[2];
 181                 output[out_idx + 6] = decoded_bbox[3];
 182             }
 183         }
 184
 185         // If work item is processing last bbox in image (we already know the number of all detections),
 186         // use it to fill rest of keep_top_k items if number of detections is smaller
 187         if (is_last_bbox_in_class)
 188         {
 189             uint out_idx = output_offset + scores_size[idx_class];
 190
 191             uint current_top_k = output_offset + SCORES_COUNT;
 192             for (uint i = out_idx; i < current_top_k; i++)
 193             {
 194                 out_idx = i * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
 195                 output[out_idx] = -1.0;
 196                 output[out_idx + 1] = 0.0;
 197                 output[out_idx + 2] = 0.0;
 198                 output[out_idx + 3] = 0.0;
 199                 output[out_idx + 4] = 0.0;
 200                 output[out_idx + 5] = 0.0;
 201                 output[out_idx + 6] = 0.0;
 202             }
 203         }
 204
 205         // Write number of scores kept in first step of detection output
 206         if (is_last_bbox_in_image)
 207         {
 208             uint scores_sum = 0;
 209             for (uint i = 0; i < NUM_CLASSES; i++)
 210             {
 211                 scores_sum += scores_size[i];
 212             }
 213             output[idx_image] = scores_sum;
 214
 215         }
 216     }
 217 }