2 // Copyright (c) 2016-2018 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "proposal_inst.h"
19 #include "implementation_map.h"
20 #include "network_impl.h"
21 #include "engine_impl.h"
22 #include "math_utils.h"
23 #include "error_handler.h"
28 #define EPSILON 0.00001f
30 namespace cldnn { namespace gpu {
34 /****************************************************************************
38 ****************************************************************************/
40 inline const float& clamp(const float & v, const float & lower, const float & upper)
42 return std::max(lower, std::min(v, upper));
45 inline bool hasSingleBatchOutput(const program_node & node)
47 const auto & batch = node.get_output_layout().size.batch;
49 return batch.empty() || (batch.size() == 1 && batch[0] == 1);
56 inline float area() const
58 return std::max(0.f, y1 - y0 + 1.f) * std::max(0.f, x1 - x0 + 1.f);
62 struct delta_t { float shift_x, shift_y, log_w, log_h; };
66 proposal_t() = default;
67 proposal_t(const roi_t& r, const float c, const size_t& o) : roi(r), confidence(c), ord(o) {}
74 inline float float_read_helper(const float* mem)
79 inline float float_read_helper(const half_t* mem)
81 return float16_to_float32(*((uint16_t*)(mem)));
84 inline void float_write_helper(float* mem, float f)
89 inline void float_write_helper(half_t* mem, float f)
91 *mem = (half_t)float32_to_float16(f);
94 /****************************************************************************
98 ****************************************************************************/
100 void sort_and_keep_n_items(std::vector<proposal_t>& proposals, size_t n)
102 auto cmp_fn = [](const proposal_t& a, const proposal_t& b)
104 return (a.confidence > b.confidence);
107 if (proposals.size() > n)
109 std::partial_sort(proposals.begin(), proposals.begin() + n, proposals.end(), cmp_fn);
114 std::sort(proposals.begin(), proposals.end(), cmp_fn);
119 const proposal_inst::anchor& box,
120 const delta_t& delta,
125 float coordinates_offset,
127 bool clip_before_nms)
129 float x0 = box.start_x + anchor_shift_x;
130 float y0 = box.start_y + anchor_shift_y;
131 float x1 = box.end_x + anchor_shift_x;
132 float y1 = box.end_y + anchor_shift_y;
136 x0 = clamp(x0, 0.0f, static_cast<float>(img_w));
137 y0 = clamp(y0, 0.0f, static_cast<float>(img_h));
138 x1 = clamp(x1, 0.0f, static_cast<float>(img_w));
139 y1 = clamp(y1, 0.0f, static_cast<float>(img_h));
142 const float anchor_w = x1 - x0 + coordinates_offset;
143 const float anchor_h = y1 - y0 + coordinates_offset;
144 const float center_x = x0 + 0.5f * anchor_w;
145 const float center_y = y0 + 0.5f * anchor_h;
147 const float pred_center_x = delta.shift_x * anchor_w + center_x;
148 const float pred_center_y = delta.shift_y * anchor_h + center_y;
149 const float half_pred_w = std::exp(delta.log_w) * anchor_w * .5f;
150 const float half_pred_h = std::exp(delta.log_h) * anchor_h * .5f;
152 float new_x0 = pred_center_x - half_pred_w;
153 float new_y0 = pred_center_y - half_pred_h;
154 float new_x1 = pred_center_x + half_pred_w;
155 float new_y1 = pred_center_y + half_pred_h;
159 new_x0 = clamp(new_x0, 0.f, img_w - coordinates_offset);
160 new_y0 = clamp(new_y0, 0.f, img_h - coordinates_offset);
161 new_x1 = clamp(new_x1, 0.f, img_w - coordinates_offset);
162 new_y1 = clamp(new_y1, 0.f, img_h - coordinates_offset);
165 return { new_x0, new_y0, new_x1, new_y1 };
168 std::vector<roi_t> perform_nms(
169 const std::vector<proposal_t>& proposals,
172 float coordinates_offset)
174 std::vector<roi_t> res;
177 for (const auto & prop : proposals)
179 const roi_t& bbox = prop.roi;
181 bool overlaps = std::any_of(res.begin(), res.end(), [&](const roi_t& res_bbox)
183 bool intersecting = (bbox.x0 < res_bbox.x1) & (res_bbox.x0 < bbox.x1) & (bbox.y0 < res_bbox.y1) & (res_bbox.y0 < bbox.y1);
184 float overlap = 0.0f;
187 const float x0 = std::max(bbox.x0, res_bbox.x0);
188 const float y0 = std::max(bbox.y0, res_bbox.y0);
189 const float x1 = std::min(bbox.x1, res_bbox.x1);
190 const float y1 = std::min(bbox.y1, res_bbox.y1);
192 const float intersect_width = std::max(0.0f, x1 - x0 + coordinates_offset);
193 const float intersect_height = std::max(0.0f, y1 - y0 + coordinates_offset);
194 const float intersect_size = intersect_width * intersect_height;
196 const float A_area = (bbox.x1 - bbox.x0 + coordinates_offset) * (bbox.y1 - bbox.y0 + coordinates_offset);
197 const float B_area = (res_bbox.x1 - res_bbox.x0 + coordinates_offset) * (res_bbox.y1 - res_bbox.y0 + coordinates_offset);
199 overlap = intersect_size / (A_area + B_area - intersect_size);
201 return overlap > iou_threshold;
207 if (res.size() == top_n) break;
214 } // anonymous namespace
217 /****************************************************************************
221 ****************************************************************************/
223 struct proposal_gpu : typed_primitive_impl<proposal>
225 const proposal_node& outer;
227 proposal_gpu(const proposal_node& arg)
231 template<typename dtype>
232 void execute(proposal_inst& instance)
234 const std::vector<proposal_inst::anchor>& anchors = instance.get_anchors();
236 size_t anchors_num = anchors.size();
238 auto& cls_scores = instance.dep_memory(proposal_inst::cls_scores_index);
239 auto& bbox_pred = instance.dep_memory(proposal_inst::bbox_pred_index);
240 auto& image_info = instance.dep_memory(proposal_inst::image_info_index);
242 // original input image to the graph (after possible scaling etc.) so that coordinates are valid for it
243 mem_lock<dtype> image_info_ptr{ image_info };
244 const dtype* image_info_mem = image_info_ptr.data();
251 int scaled_min_bbox_size = instance.argument.min_bbox_size;
253 bool swap_xy = instance.argument.swap_xy;
254 bool initial_clip = instance.argument.initial_clip;
255 bool clip_before_nms = instance.argument.clip_before_nms;
256 bool clip_after_nms = instance.argument.clip_after_nms;
257 float coordinates_offset = instance.argument.coordinates_offset;
258 float box_coordinate_scale = instance.argument.box_coordinate_scale;
259 float box_size_scale = instance.argument.box_size_scale;
261 if (image_info.get_layout().size.feature[0] == 4)
263 img_w = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_width_index) + EPSILON);
264 img_h = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_height_index) + EPSILON);
265 min_bbox_x = static_cast<int>(scaled_min_bbox_size * float_read_helper(image_info_mem + 3));
266 min_bbox_y = static_cast<int>(scaled_min_bbox_size * float_read_helper(image_info_mem + 2));
270 img_w = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_width_index) + EPSILON);
271 img_h = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_height_index) + EPSILON);
272 img_z = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_depth_index) + EPSILON);
274 scaled_min_bbox_size *= img_z;
276 min_bbox_x = scaled_min_bbox_size;
277 if (image_info.get_layout().size.feature[0] > proposal_inst::image_info_scale_min_bbox_x)
279 min_bbox_x = static_cast<int>(min_bbox_x * float_read_helper(image_info_mem + proposal_inst::image_info_scale_min_bbox_x));
282 min_bbox_y = scaled_min_bbox_size;
283 if (image_info.get_layout().size.feature[0] > proposal_inst::image_info_scale_min_bbox_y)
285 min_bbox_y = static_cast<int>(min_bbox_y * float_read_helper(image_info_mem + proposal_inst::image_info_scale_min_bbox_y));
291 std::swap(img_w, img_h);
295 const auto& score_size = cls_scores.get_layout().size;
296 int fm_h = score_size.spatial[1];
297 int fm_w = score_size.spatial[0];
299 int fm_sz = fm_w * fm_h;
301 mem_lock<dtype> cls_scores_ptr{ cls_scores };
302 mem_lock<dtype> bbox_pred_ptr{ bbox_pred };
303 const dtype* cls_scores_mem = cls_scores_ptr.data();
304 const dtype* bbox_pred_mem = bbox_pred_ptr.data();
306 for (int n = 0; n < score_size.batch[0]; n++)
308 std::vector<proposal_t> sorted_proposals_confidence;
309 size_t num_proposals = fm_h * fm_w * anchors_num;
310 sorted_proposals_confidence.reserve(num_proposals);
311 for (int y = 0; y < fm_h; ++y)
313 for (int x = 0; x < fm_w; ++x)
315 const int anchor_shift_x = (swap_xy ? y : x) * instance.argument.feature_stride;
316 const int anchor_shift_y = (swap_xy ? x : y) * instance.argument.feature_stride;
317 const int location_index = y * fm_w + x;
319 // we assume proposals are grouped by window location
320 for (unsigned int anchor_index = 0; anchor_index < anchors_num ; anchor_index++)
322 float dx0 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 0)) / box_coordinate_scale;
323 float dy0 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 1)) / box_coordinate_scale;
324 float dx1 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 2)) / box_size_scale;
325 float dy1 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 3)) / box_size_scale;
327 delta_t bbox_delta { dx0, dy0, dx1, dy1 };
329 const roi_t& roi = gen_bbox(anchors[anchor_index], bbox_delta, anchor_shift_x, anchor_shift_y,
330 img_w, img_h, coordinates_offset, initial_clip, clip_before_nms);
332 int bbox_w = (int)(roi.x1 - roi.x0 + coordinates_offset);
333 int bbox_h = (int)(roi.y1 - roi.y0 + coordinates_offset);
335 size_t scores_index = n*num_proposals * 2 + location_index + fm_sz * (anchor_index + anchors_num);
336 float proposal_confidence = (min_bbox_x <= bbox_w)* (min_bbox_y <= bbox_h) * float_read_helper(cls_scores_mem + scores_index);
337 sorted_proposals_confidence.emplace_back(roi, proposal_confidence, sorted_proposals_confidence.size());
342 size_t pre_nms = std::min(instance.argument.pre_nms_topn, (int)sorted_proposals_confidence.size());
343 sort_and_keep_n_items(sorted_proposals_confidence, pre_nms);
344 std::vector<roi_t> res = perform_nms(sorted_proposals_confidence, instance.argument.iou_threshold,
345 instance.argument.post_nms_topn, coordinates_offset);
347 auto& output = instance.output_memory();
349 mem_lock<dtype> output_ptr{ output };
350 dtype* top_data = output_ptr.data() + n*instance.argument.post_nms_topn*5;
352 size_t res_num_rois = res.size();
355 for (size_t i = 0; i < res_num_rois; ++i)
359 res[i].x0 = clamp(res[i].x0, 0.0f, float(img_w));
360 res[i].y0 = clamp(res[i].y0, 0.0f, float(img_h));
361 res[i].x1 = clamp(res[i].x1, 0.0f, float(img_w));
362 res[i].y1 = clamp(res[i].y1, 0.0f, float(img_h));
365 float_write_helper(top_data + 5 * i + 0, float(n));
366 float_write_helper(top_data + 5 * i + 1, res[i].x0 / (instance.argument.normalize ? img_w : 1.0f));
367 float_write_helper(top_data + 5 * i + 2, res[i].y0 / (instance.argument.normalize ? img_h : 1.0f));
368 float_write_helper(top_data + 5 * i + 3, res[i].x1 / (instance.argument.normalize ? img_w : 1.0f));
369 float_write_helper(top_data + 5 * i + 4, res[i].y1 / (instance.argument.normalize ? img_h : 1.0f));
372 for (size_t i = res_num_rois; i < (size_t)instance.argument.post_nms_topn; i++)
374 float_write_helper(top_data + 5*i + 0, -1.0f);
375 float_write_helper(top_data + 5*i + 1, 0.0f);
376 float_write_helper(top_data + 5*i + 2, 0.0f);
377 float_write_helper(top_data + 5*i + 3, 0.0f);
378 float_write_helper(top_data + 5*i + 4, 0.0f);
383 event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, proposal_inst& instance) override
385 for (auto& a : events)
390 auto ev = instance.get_network().get_engine().create_user_event(false);
392 if (instance.dep_memory(proposal_inst::cls_scores_index).get_layout().data_type == data_types::f16)
394 execute<data_type_to_type<data_types::f16>::type>(instance);
398 execute<data_type_to_type<data_types::f32>::type>(instance);
401 dynamic_cast<cldnn::user_event*>(ev.get())->set(); // set as complete
405 static primitive_impl* create(const proposal_node& arg)
407 const layout & l = arg.image_info().get_output_layout();
408 const size_t count = static_cast<size_t>(l.size.feature[0]);
410 //Supported image_info sizes and components meaning:
411 // - image_info[3] = { img_height, img_width, img_depth }
412 // - image_info[4] = { img_height, img_width, scale_min_bbox_y, scale_min_bbox_x }
413 // - image_info[6] = { img_height, img_width, img_depth, scale_min_bbox_y, scale_min_bbox_x, scale_depth_index }
414 if (count != 3 && count != 4 && count != 6) {
415 CLDNN_ERROR_MESSAGE(arg.id(), "image_info must have either 3, 4 or 6 items");
418 return new proposal_gpu(arg);
426 implementation_map<proposal>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), proposal_gpu::create);
427 implementation_map<proposal>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), proposal_gpu::create);