Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / detection_output.cl
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15
16 #include "include/include_all.cl"
17 #include "include/detection_output_common.cl"
18
// First stage of detection output. For every class except BACKGROUND_LABEL_ID the work group:
//   1. sorts the prior indexes held in local memory by descending get_score() value
//      (odd-even transposition sort, repeated until a full pass performs no swap),
//   2. lets the work item that owns the last prior slot run greedy non-maximum suppression
//      over the first SCORES_COUNT sorted entries,
//   3. writes the kept detections as rows of 7 values
//      [image, class, score, bbox[0], bbox[1], bbox[2], bbox[3]] (row stride OUTPUT_ROW_SIZE)
//      and fills the remaining rows of the class's SCORES_COUNT slots with image id -1 and zeros.
// While processing class NUM_CLASSES - 1 the per-class kept counts are summed into output[idx_image].
//
// Parameters:
//   input_location   - forwarded to get_decoded_bbox()
//   output           - receives the detection rows and the per-image kept count
//   input_confidence - forwarded to get_score()
//   input_prior_box  - forwarded to get_decoded_bbox()
//
// NUM_OF_ITEMS, NUM_OF_ITERATIONS, NUM_OF_PRIORS, NUM_CLASSES, NUM_OF_IMAGES, SCORES_COUNT and the
// other upper-case names are macros that are not defined in this file.
// NOTE(review): the code assumes every slot local_id + it addressed by the work group lies inside
// indexes[NUM_OF_PRIORS]; this depends on the dispatch sizes chosen by the host - confirm.
KERNEL (detection_output)(__global UNIT_TYPE* input_location, __global UNIT_TYPE* output, __global UNIT_TYPE* input_confidence, __global UNIT_TYPE* input_prior_box)
{
    const uint idx = get_global_id(0);              // bbox idx
    const uint local_id = get_local_id(0) * NUM_OF_ITEMS; // first slot of `indexes` handled by this work item
    const uint idx_image = idx / NUM_OF_ITERATIONS;  // idx of current image

    __local uint indexes[NUM_OF_PRIORS];                    // prior indexes, reordered per class
    __local uint scores_size[NUM_CLASSES * NUM_OF_IMAGES];  // detections kept per class (indexed by class)
    __local bool stillSorting;                              // set when any work item swapped in a pass

    // Find out whether this work item owns the last slot of a class, and at which local
    // iteration it occurs. Slots at or after that iteration never start a compare/swap pair.
    int last_bbox_in_class = NUM_OF_ITEMS;
    bool is_last_bbox_in_class = false;
    for (uint it = 0; it < NUM_OF_ITEMS; it ++)
    {
        if (((local_id + it + 1) % NUM_OF_PRIORS) == 0 )
        {
            last_bbox_in_class = it;
            is_last_bbox_in_class = true;
            break;
        }
    }

    for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++)
    {
        if (idx_class == BACKGROUND_LABEL_ID)
        {
            continue;
        }

        // Start every class from the identity permutation.
        for (uint it = 0;  it < NUM_OF_ITEMS; it++)
        {
            indexes[local_id + it] = local_id + it;
        }

        stillSorting = true;
        barrier(CLK_LOCAL_MEM_FENCE);

        bool is_last_bbox_in_image = (is_last_bbox_in_class) && (idx_class == (NUM_CLASSES - 1));

        while(stillSorting)
        {
            // Every work item has evaluated the loop condition once it reaches this barrier.
            barrier(CLK_LOCAL_MEM_FENCE);
            stillSorting = false;
            // The reset has to be complete in all work items before any of them can set the flag
            // again below; otherwise a late reset could overwrite a "swap happened" notification
            // and end the sort too early.
            barrier(CLK_LOCAL_MEM_FENCE);

            // Phase 0 compares pairs starting at even slots, phase 1 pairs starting at odd slots.
            for (uint i = 0; i < 2; i++)
            {
                for (uint it = 0; it < NUM_OF_ITEMS; it++)
                {
                    const uint item_id = local_id + it;
                    const bool starts_pair = ((i % 2) == (item_id % 2)) && (it < last_bbox_in_class);

                    // Read the pair only when it is eligible: the last slot of a class has no
                    // right-hand neighbour, so indexes[item_id + 1] must not be touched for it.
                    if (starts_pair)
                    {
                        const uint idx1 = indexes[item_id];
                        const uint idx2 = indexes[item_id + 1];
                        if (FUNC_CALL(get_score)(input_confidence, idx1, idx_class, idx_image) <
                            FUNC_CALL(get_score)(input_confidence, idx2, idx_class, idx_image))
                        {
                            indexes[item_id] = idx2;
                            indexes[item_id + 1] = idx1;
                            stillSorting = true;
                        }
                    }
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
            }
        }

        // Do it only once per class in image
        if (is_last_bbox_in_class)
        {
            UNIT_TYPE adaptive_threshold = NMS_THRESHOLD;
            uint post_nms_count = 0;
            const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
            scores_size[idx_class] = 0;

            // Run the "keep" algorithm only when the best (first after sort) score of the class
            // is non-zero.
            if (FUNC_CALL(get_score)(input_confidence, indexes[0], idx_class, idx_image) != 0.0f)
            {
                for (uint i = 0; i < SCORES_COUNT; i++)
                {
                    const uint bb_idx = indexes[i];
                    bool keep = true;

                    // The candidate box does not depend on j, so decode it once per candidate.
                    UNIT_TYPE decoded_bbox1[4];
                    FUNC_CALL(get_decoded_bbox)(decoded_bbox1, input_location, input_prior_box, bb_idx, shared_class, idx_image);

                    // Compare against every detection kept so far (stored in indexes[0..post_nms_count)).
                    for (uint j = 0; j < post_nms_count; j++)
                    {
                        if (!keep)
                        {
                            break;
                        }

                        UNIT_TYPE overlap = 0.0;
                        const uint bb_idx2 = indexes[j];

                        UNIT_TYPE decoded_bbox2[4];
                        FUNC_CALL(get_decoded_bbox)(decoded_bbox2, input_location, input_prior_box, bb_idx2, shared_class, idx_image);
                        bool intersecting =
                            (decoded_bbox1[0] < decoded_bbox2[2]) &
                            (decoded_bbox2[0] < decoded_bbox1[2]) &
                            (decoded_bbox1[1] < decoded_bbox2[3]) &
                            (decoded_bbox2[1] < decoded_bbox1[3]);

                        if (intersecting)
                        {
                            // overlap = intersection area / union area
                            const UNIT_TYPE intersect_width = min(decoded_bbox1[2], decoded_bbox2[2]) - max(decoded_bbox1[0], decoded_bbox2[0]);
                            const UNIT_TYPE intersect_height = min(decoded_bbox1[3], decoded_bbox2[3]) - max(decoded_bbox1[1], decoded_bbox2[1]);
                            const UNIT_TYPE intersect_size = intersect_width * intersect_height;
                            const UNIT_TYPE bbox1_area = (decoded_bbox1[2] - decoded_bbox1[0]) * (decoded_bbox1[3] - decoded_bbox1[1]);
                            const UNIT_TYPE bbox2_area = (decoded_bbox2[2] - decoded_bbox2[0]) * (decoded_bbox2[3] - decoded_bbox2[1]);
                            overlap = intersect_size / (bbox1_area + bbox2_area - intersect_size);
                        }
                        keep = (overlap <= adaptive_threshold);
                    }
                    if (keep)
                    {
                        // Compact kept entries to the front; post_nms_count <= i, so no unread entry is lost.
                        indexes[post_nms_count] = indexes[i];
                        ++post_nms_count;
                    }
                    if ((keep) && (ETA < 1) && (adaptive_threshold > 0.5))
                    {
                        adaptive_threshold *= ETA;
                    }
                }
            }
            // Publish the number of kept detections in local memory so that every work item of
            // the group knows how many output rows of this class are valid.
            scores_size[idx_class] = post_nms_count;
        }

        stillSorting = true;
        // Wait until the kept count and the compacted indexes of this class are visible
        barrier(CLK_LOCAL_MEM_FENCE);

        uint output_offset = (idx_image * NUM_CLASSES_OUT + idx_class - HIDDEN_CLASS) * SCORES_COUNT;

        // Each work item writes the kept detections that fall into its own slots.
        for (uint it = 0; it < NUM_OF_ITEMS; it++)
        {
            const uint local_id_out = local_id + it;

            if (local_id_out < scores_size[idx_class])
            {
                const uint bb_idx = indexes[local_id_out];
                const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
                UNIT_TYPE decoded_bbox[4];
                FUNC_CALL(get_decoded_bbox)(decoded_bbox, input_location, input_prior_box, bb_idx, shared_class, idx_image);

                const uint out_idx = (local_id_out + output_offset) * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
                output[out_idx] = TO_UNIT_TYPE(idx_image);
                output[out_idx + 1] = TO_UNIT_TYPE(idx_class);
                output[out_idx + 2] = FUNC_CALL(get_score)(input_confidence, bb_idx, idx_class, idx_image);
                output[out_idx + 3] = decoded_bbox[0];
                output[out_idx + 4] = decoded_bbox[1];
                output[out_idx + 5] = decoded_bbox[2];
                output[out_idx + 6] = decoded_bbox[3];
            }
        }

        // The work item owning the last slot of the class already knows the number of kept
        // detections; it marks the remaining rows of this class (up to SCORES_COUNT) as empty.
        if (is_last_bbox_in_class)
        {
            uint out_idx = output_offset + scores_size[idx_class];

            uint current_top_k = output_offset + SCORES_COUNT;
            for (uint i = out_idx; i < current_top_k; i++)
            {
                out_idx = i * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
                output[out_idx] = -1.0;
                output[out_idx + 1] = 0.0;
                output[out_idx + 2] = 0.0;
                output[out_idx + 3] = 0.0;
                output[out_idx + 4] = 0.0;
                output[out_idx + 5] = 0.0;
                output[out_idx + 6] = 0.0;
            }
        }

        // Write number of scores kept in first step of detection output
        if (is_last_bbox_in_image)
        {
            uint scores_sum = 0;
            for (uint i = 0; i < NUM_CLASSES; i++)
            {
                // The background class is skipped by the class loop, so its scores_size entry
                // is never written and must not be added.
                if (i == BACKGROUND_LABEL_ID)
                {
                    continue;
                }
                scores_sum += scores_size[i];
            }
            output[idx_image] = scores_sum;
        }
    }
}