1 // Copyright (c) 2018 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 #include "include/include_all.cl"
17 #include "include/detection_output_common.cl"
19 KERNEL (detection_output)(__global UNIT_TYPE* input_location, __global UNIT_TYPE* output, __global UNIT_TYPE* input_confidence, __global UNIT_TYPE* input_prior_box)
21 const uint idx = get_global_id(0); // bbox idx
22 const uint local_id = get_local_id(0) * NUM_OF_ITEMS; // All bboxes from one image in work group
23 const uint idx_image = idx / NUM_OF_ITERATIONS; // idx of current image
25 __local uint indexes[NUM_OF_PRIORS];
26 __local uint scores_size[NUM_CLASSES * NUM_OF_IMAGES];
27 __local bool stillSorting;
29 uint indexes_class_0[NUM_OF_PRIORS];
31 int last_bbox_in_class = NUM_OF_ITEMS;
32 bool is_last_bbox_in_class = false;
33 for (uint it = 0; it < NUM_OF_ITEMS; it ++)
35 if (((local_id + it + 1) % NUM_OF_PRIORS) == 0 )
37 last_bbox_in_class = it;
38 is_last_bbox_in_class = true;
43 for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++)
45 if (idx_class == BACKGROUND_LABEL_ID)
50 for (uint it = 0; it < NUM_OF_ITEMS; it++)
52 indexes[local_id + it] = local_id + it;
56 barrier(CLK_LOCAL_MEM_FENCE);
58 bool is_last_bbox_in_image = (is_last_bbox_in_class) && (idx_class == (NUM_CLASSES - 1));
62 barrier(CLK_LOCAL_MEM_FENCE);
65 for (uint i = 0; i < 2; i++)
67 for (uint it = 0; it < NUM_OF_ITEMS; it++)
69 uint item_id = local_id + it;
71 uint idx1 = indexes[item_id];
72 uint idx2 = indexes[item_id+1];
74 if ((((i % 2) && (item_id % 2)) ||
75 ((!(i % 2)) && (!(item_id % 2)))) &&
76 (it < last_bbox_in_class))
82 (FUNC_CALL(get_score)(input_confidence, idx1, idx_class, idx_image) <
83 FUNC_CALL(get_score)(input_confidence, idx2, idx_class, idx_image)))
85 indexes[item_id] = idx2;
86 indexes[item_id+1] = idx1;
89 barrier(CLK_LOCAL_MEM_FENCE);
94 // Do it only once per class in image
95 if (is_last_bbox_in_class)
97 UNIT_TYPE adaptive_threshold = NMS_THRESHOLD;
98 uint post_nms_count = 0;
99 const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
100 scores_size[idx_class] = 0;
102 // Do the "keep" algorithm only for classes with confidence greater than CONFIDENCE_THRESHOLD.
103 // Check first, the biggest one (after sort) element in class.
104 if (FUNC_CALL(get_score)(input_confidence, indexes[0], idx_class, idx_image) != 0.0f)
106 for (uint i = 0; i < SCORES_COUNT; i++)
108 const uint bb_idx = indexes[i];
110 for (uint j = 0; j < post_nms_count; j++)
117 UNIT_TYPE overlap = 0.0;
118 const uint bb_idx2 = indexes[j];
120 UNIT_TYPE decoded_bbox1[4];
121 FUNC_CALL(get_decoded_bbox)(decoded_bbox1, input_location, input_prior_box, bb_idx, shared_class, idx_image);
122 UNIT_TYPE decoded_bbox2[4];
123 FUNC_CALL(get_decoded_bbox)(decoded_bbox2, input_location, input_prior_box, bb_idx2, shared_class, idx_image);
125 (decoded_bbox1[0] < decoded_bbox2[2]) &
126 (decoded_bbox2[0] < decoded_bbox1[2]) &
127 (decoded_bbox1[1] < decoded_bbox2[3]) &
128 (decoded_bbox2[1] < decoded_bbox1[3]);
132 const UNIT_TYPE intersect_width = min(decoded_bbox1[2], decoded_bbox2[2]) - max(decoded_bbox1[0], decoded_bbox2[0]);
133 const UNIT_TYPE intersect_height = min(decoded_bbox1[3], decoded_bbox2[3]) - max(decoded_bbox1[1], decoded_bbox2[1]);
134 const UNIT_TYPE intersect_size = intersect_width * intersect_height;
135 const UNIT_TYPE bbox1_area = (decoded_bbox1[2] - decoded_bbox1[0]) * (decoded_bbox1[3] - decoded_bbox1[1]);
136 const UNIT_TYPE bbox2_area = (decoded_bbox2[2] - decoded_bbox2[0]) * (decoded_bbox2[3] - decoded_bbox2[1]);
137 overlap = intersect_size / (bbox1_area + bbox2_area - intersect_size);
139 keep = (overlap <= adaptive_threshold);
143 indexes[post_nms_count] = indexes[i];
146 if ((keep) && (ETA < 1) && (adaptive_threshold > 0.5))
148 adaptive_threshold *= ETA;
152 // Write number of scores to global memory, for proper output order in separated work groups
153 scores_size[idx_class] = post_nms_count;
157 // Wait for scores number from all classes in images
158 barrier(CLK_LOCAL_MEM_FENCE);
160 uint output_offset = (idx_image * NUM_CLASSES_OUT + idx_class - HIDDEN_CLASS) * SCORES_COUNT;
162 for (uint it = 0; it < NUM_OF_ITEMS; it++)
164 const uint local_id_out = local_id + it;
166 if (local_id_out < scores_size[idx_class])
168 const uint score_idx = indexes[local_id_out];
169 uint bb_idx = indexes[local_id_out];
170 const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
171 UNIT_TYPE decoded_bbox[4];
172 FUNC_CALL(get_decoded_bbox)(decoded_bbox, input_location, input_prior_box, bb_idx, shared_class, idx_image);
174 const uint out_idx = (local_id_out + output_offset) * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
175 output[out_idx] = TO_UNIT_TYPE(idx_image);
176 output[out_idx + 1] = TO_UNIT_TYPE(idx_class);
177 output[out_idx + 2] = FUNC_CALL(get_score)(input_confidence, score_idx, idx_class, idx_image);
178 output[out_idx + 3] = decoded_bbox[0];
179 output[out_idx + 4] = decoded_bbox[1];
180 output[out_idx + 5] = decoded_bbox[2];
181 output[out_idx + 6] = decoded_bbox[3];
185 // If work item is processing last bbox in image (we already know the number of all detections),
186 // use it to fill rest of keep_top_k items if number of detections is smaller
187 if (is_last_bbox_in_class)
189 uint out_idx = output_offset + scores_size[idx_class];
191 uint current_top_k = output_offset + SCORES_COUNT;
192 for (uint i = out_idx; i < current_top_k; i++)
194 out_idx = i * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
195 output[out_idx] = -1.0;
196 output[out_idx + 1] = 0.0;
197 output[out_idx + 2] = 0.0;
198 output[out_idx + 3] = 0.0;
199 output[out_idx + 4] = 0.0;
200 output[out_idx + 5] = 0.0;
201 output[out_idx + 6] = 0.0;
205 // Write number of scores kept in first step of detection output
206 if (is_last_bbox_in_image)
209 for (uint i = 0; i < NUM_CLASSES; i++)
211 scores_sum += scores_size[i];
213 output[idx_image] = scores_sum;