Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / gpu / proposal_gpu.cpp
1 /*
2 // Copyright (c) 2016-2018 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "proposal_inst.h"
18 #include "kernel.h"
19 #include "implementation_map.h"
20 #include "network_impl.h"
21 #include "engine_impl.h"
22 #include "math_utils.h"
23 #include "error_handler.h"
24
25 #include <algorithm>
26 #include <string>
27
28 #define EPSILON 0.00001f
29
30 namespace cldnn { namespace gpu {
31
32 namespace {
33
34     /****************************************************************************
35     *                                                                          *
36     *                              Common Utils                                *
37     *                                                                          *
38     ****************************************************************************/
39
40     inline const float& clamp(const float & v, const float & lower, const float & upper)
41     {
42         return std::max(lower, std::min(v, upper));
43     }
44
45     inline bool hasSingleBatchOutput(const program_node & node)
46     {
47         const auto & batch = node.get_output_layout().size.batch;
48
49         return batch.empty() || (batch.size() == 1 && batch[0] == 1);
50     }
51
52     struct roi_t
53     {
54         float x0, y0, x1, y1;
55
56         inline float area() const
57         {
58             return std::max(0.f, y1 - y0 + 1.f) * std::max(0.f, x1 - x0 + 1.f);
59         }
60     };
61
62     struct delta_t { float shift_x, shift_y, log_w, log_h; };
63
64     struct proposal_t
65     {
66         proposal_t() = default;
67         proposal_t(const roi_t& r, const float c, const size_t& o) : roi(r), confidence(c), ord(o) {}
68
69         roi_t roi;
70         float confidence;
71         size_t ord;
72     };
73
74     inline float float_read_helper(const float* mem)
75     {
76         return *mem;
77     }
78
79     inline float float_read_helper(const half_t* mem)
80     {
81         return float16_to_float32(*((uint16_t*)(mem)));
82     }
83
84     inline void float_write_helper(float* mem, float f)
85     {
86         *mem = f;
87     }
88
89     inline void float_write_helper(half_t* mem, float f)
90     {
91         *mem = (half_t)float32_to_float16(f);
92     }
93
94     /****************************************************************************
95      *                                                                          *
96      *                              Impl Details                                *
97      *                                                                          *
98      ****************************************************************************/
99
100     void sort_and_keep_n_items(std::vector<proposal_t>& proposals, size_t n)
101     {
102         auto cmp_fn = [](const proposal_t& a, const proposal_t& b)
103         {
104             return (a.confidence > b.confidence);
105         };
106
107         if (proposals.size() > n)
108         {
109             std::partial_sort(proposals.begin(), proposals.begin() + n, proposals.end(), cmp_fn);
110             proposals.resize(n);
111         }
112         else
113         {
114             std::sort(proposals.begin(), proposals.end(), cmp_fn);
115         }
116     }
117
118     roi_t gen_bbox(
119             const proposal_inst::anchor& box,
120             const delta_t& delta,
121             int anchor_shift_x,
122             int anchor_shift_y,
123             int img_w,
124             int img_h,
125             float coordinates_offset,
126             bool initial_clip,
127             bool clip_before_nms)
128     {
129         float x0 = box.start_x + anchor_shift_x;
130         float y0 = box.start_y + anchor_shift_y;
131         float x1 = box.end_x + anchor_shift_x;
132         float y1 = box.end_y + anchor_shift_y;
133
134         if (initial_clip)
135         {
136             x0 = clamp(x0, 0.0f, static_cast<float>(img_w));
137             y0 = clamp(y0, 0.0f, static_cast<float>(img_h));
138             x1 = clamp(x1, 0.0f, static_cast<float>(img_w));
139             y1 = clamp(y1, 0.0f, static_cast<float>(img_h));
140         }
141
142         const float anchor_w = x1 - x0 + coordinates_offset;
143         const float anchor_h = y1 - y0 + coordinates_offset;
144         const float center_x = x0 + 0.5f * anchor_w;
145         const float center_y = y0 + 0.5f * anchor_h;
146
147         const float pred_center_x = delta.shift_x * anchor_w + center_x;
148         const float pred_center_y = delta.shift_y * anchor_h + center_y;
149         const float half_pred_w = std::exp(delta.log_w) * anchor_w * .5f;
150         const float half_pred_h = std::exp(delta.log_h) * anchor_h * .5f;
151
152         float new_x0 = pred_center_x - half_pred_w;
153         float new_y0 = pred_center_y - half_pred_h;
154         float new_x1 = pred_center_x + half_pred_w;
155         float new_y1 = pred_center_y + half_pred_h;
156
157         if (clip_before_nms)
158         {
159             new_x0 = clamp(new_x0, 0.f, img_w - coordinates_offset);
160             new_y0 = clamp(new_y0, 0.f, img_h - coordinates_offset);
161             new_x1 = clamp(new_x1, 0.f, img_w - coordinates_offset);
162             new_y1 = clamp(new_y1, 0.f, img_h - coordinates_offset);
163         }
164
165         return { new_x0, new_y0, new_x1, new_y1 };
166     }
167
168     std::vector<roi_t> perform_nms(
169             const std::vector<proposal_t>& proposals,
170             float iou_threshold,
171             size_t top_n,
172             float coordinates_offset)
173     {
174         std::vector<roi_t> res;
175         res.reserve(top_n);
176
177         for (const auto & prop : proposals)
178         {
179             const roi_t& bbox = prop.roi;
180
181             bool overlaps = std::any_of(res.begin(), res.end(), [&](const roi_t& res_bbox)
182             {
183                 bool intersecting = (bbox.x0 < res_bbox.x1) & (res_bbox.x0 < bbox.x1) & (bbox.y0 < res_bbox.y1) & (res_bbox.y0 < bbox.y1);
184                 float overlap = 0.0f;
185                 if (intersecting)
186                 {
187                     const float x0 = std::max(bbox.x0, res_bbox.x0);
188                     const float y0 = std::max(bbox.y0, res_bbox.y0);
189                     const float x1 = std::min(bbox.x1, res_bbox.x1);
190                     const float y1 = std::min(bbox.y1, res_bbox.y1);
191
192                     const float intersect_width = std::max(0.0f, x1 - x0 + coordinates_offset);
193                     const float intersect_height = std::max(0.0f, y1 - y0 + coordinates_offset);
194                     const float intersect_size = intersect_width * intersect_height;
195
196                     const float A_area = (bbox.x1 - bbox.x0 + coordinates_offset) * (bbox.y1 - bbox.y0 + coordinates_offset);
197                     const float B_area = (res_bbox.x1 - res_bbox.x0 + coordinates_offset) * (res_bbox.y1 - res_bbox.y0 + coordinates_offset);
198
199                     overlap = intersect_size / (A_area + B_area - intersect_size);
200                 }
201                 return overlap > iou_threshold;
202             });
203
204             if (!overlaps)
205             {
206                 res.push_back(bbox);
207                 if (res.size() == top_n) break;
208             }
209         }
210
211         res.resize(top_n);
212         return res;
213     }
214 } // anonymous namespace
215
216
217 /****************************************************************************
218 *                                                                          *
219 *                              Proposal Layer                              *
220 *                                                                          *
221 ****************************************************************************/
222
223 struct proposal_gpu : typed_primitive_impl<proposal>
224 {
225     const proposal_node& outer;
226
227     proposal_gpu(const proposal_node& arg)
228             : outer(arg)
229     {}
230
231     template<typename dtype>
232     void execute(proposal_inst& instance)
233     {
234         const std::vector<proposal_inst::anchor>& anchors = instance.get_anchors();
235
236         size_t anchors_num = anchors.size();
237
238         auto& cls_scores = instance.dep_memory(proposal_inst::cls_scores_index);
239         auto& bbox_pred  = instance.dep_memory(proposal_inst::bbox_pred_index);
240         auto& image_info = instance.dep_memory(proposal_inst::image_info_index);
241
242         // original input image to the graph (after possible scaling etc.) so that coordinates are valid for it
243         mem_lock<dtype> image_info_ptr{ image_info };
244         const dtype* image_info_mem = image_info_ptr.data();
245
246         int img_w = 1;
247         int img_h = 1;
248         int img_z = 1;
249         int min_bbox_x = 1;
250         int min_bbox_y = 1;
251         int scaled_min_bbox_size = instance.argument.min_bbox_size;
252
253         bool swap_xy = instance.argument.swap_xy;
254         bool initial_clip = instance.argument.initial_clip;
255         bool clip_before_nms = instance.argument.clip_before_nms;
256         bool clip_after_nms = instance.argument.clip_after_nms;
257         float coordinates_offset = instance.argument.coordinates_offset;
258         float box_coordinate_scale = instance.argument.box_coordinate_scale;
259         float box_size_scale = instance.argument.box_size_scale;
260
261         if (image_info.get_layout().size.feature[0] == 4)
262         {
263             img_w = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_width_index) + EPSILON);
264             img_h = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_height_index) + EPSILON);
265             min_bbox_x = static_cast<int>(scaled_min_bbox_size * float_read_helper(image_info_mem + 3));
266             min_bbox_y = static_cast<int>(scaled_min_bbox_size * float_read_helper(image_info_mem + 2));
267         }
268         else
269         {
270             img_w = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_width_index) + EPSILON);
271             img_h = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_height_index) + EPSILON);
272             img_z = static_cast<int>(float_read_helper(image_info_mem + proposal_inst::image_info_depth_index) + EPSILON);
273
274             scaled_min_bbox_size *= img_z;
275
276             min_bbox_x = scaled_min_bbox_size;
277             if (image_info.get_layout().size.feature[0] > proposal_inst::image_info_scale_min_bbox_x)
278             {
279                 min_bbox_x = static_cast<int>(min_bbox_x * float_read_helper(image_info_mem + proposal_inst::image_info_scale_min_bbox_x));
280             }
281
282             min_bbox_y = scaled_min_bbox_size;
283             if (image_info.get_layout().size.feature[0] > proposal_inst::image_info_scale_min_bbox_y)
284             {
285                 min_bbox_y = static_cast<int>(min_bbox_y * float_read_helper(image_info_mem + proposal_inst::image_info_scale_min_bbox_y));
286             }
287         }
288
289         if (swap_xy)
290         {
291             std::swap(img_w, img_h);
292         }
293
294         // feat map sizes
295         const auto& score_size = cls_scores.get_layout().size;
296         int fm_h = score_size.spatial[1];
297         int fm_w = score_size.spatial[0];
298
299         int fm_sz = fm_w * fm_h;
300
301         mem_lock<dtype> cls_scores_ptr{ cls_scores };
302         mem_lock<dtype> bbox_pred_ptr{ bbox_pred };
303         const dtype* cls_scores_mem = cls_scores_ptr.data();
304         const dtype* bbox_pred_mem  = bbox_pred_ptr.data();
305
306         for (int n = 0; n < score_size.batch[0]; n++)
307         {
308             std::vector<proposal_t> sorted_proposals_confidence;
309             size_t num_proposals = fm_h * fm_w * anchors_num;
310             sorted_proposals_confidence.reserve(num_proposals);
311             for (int y = 0; y < fm_h; ++y)
312             {
313                 for (int x = 0; x < fm_w; ++x)
314                 {
315                     const int anchor_shift_x = (swap_xy ? y : x) * instance.argument.feature_stride;
316                     const int anchor_shift_y = (swap_xy ? x : y) * instance.argument.feature_stride;
317                     const int location_index = y * fm_w + x;
318
319                     // we assume proposals are grouped by window location
320                     for (unsigned int anchor_index = 0; anchor_index < anchors_num ; anchor_index++)
321                     {
322                         float dx0 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 0)) / box_coordinate_scale;
323                         float dy0 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 1)) / box_coordinate_scale;
324                         float dx1 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 2)) / box_size_scale;
325                         float dy1 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 3)) / box_size_scale;
326
327                         delta_t bbox_delta { dx0, dy0, dx1, dy1 };
328
329                         const roi_t& roi = gen_bbox(anchors[anchor_index], bbox_delta, anchor_shift_x, anchor_shift_y,
330                                                     img_w, img_h, coordinates_offset, initial_clip, clip_before_nms);
331
332                         int bbox_w = (int)(roi.x1 - roi.x0 + coordinates_offset);
333                         int bbox_h = (int)(roi.y1 - roi.y0 + coordinates_offset);
334
335                         size_t scores_index = n*num_proposals * 2 + location_index + fm_sz * (anchor_index + anchors_num);
336                         float proposal_confidence = (min_bbox_x <= bbox_w)* (min_bbox_y <= bbox_h) * float_read_helper(cls_scores_mem + scores_index);
337                         sorted_proposals_confidence.emplace_back(roi, proposal_confidence, sorted_proposals_confidence.size());
338                     }
339                 }
340             }
341
342             size_t pre_nms = std::min(instance.argument.pre_nms_topn, (int)sorted_proposals_confidence.size());
343             sort_and_keep_n_items(sorted_proposals_confidence, pre_nms);
344             std::vector<roi_t> res = perform_nms(sorted_proposals_confidence, instance.argument.iou_threshold,
345                                                  instance.argument.post_nms_topn, coordinates_offset);
346
347             auto& output = instance.output_memory();
348
349             mem_lock<dtype> output_ptr{ output };
350             dtype* top_data = output_ptr.data() + n*instance.argument.post_nms_topn*5;
351
352             size_t res_num_rois = res.size();
353
354
355             for (size_t i = 0; i < res_num_rois; ++i)
356             {
357                 if (clip_after_nms)
358                 {
359                     res[i].x0 = clamp(res[i].x0, 0.0f, float(img_w));
360                     res[i].y0 = clamp(res[i].y0, 0.0f, float(img_h));
361                     res[i].x1 = clamp(res[i].x1, 0.0f, float(img_w));
362                     res[i].y1 = clamp(res[i].y1, 0.0f, float(img_h));
363                 }
364
365                 float_write_helper(top_data + 5 * i + 0, float(n));
366                 float_write_helper(top_data + 5 * i + 1, res[i].x0 / (instance.argument.normalize ? img_w : 1.0f));
367                 float_write_helper(top_data + 5 * i + 2, res[i].y0 / (instance.argument.normalize ? img_h : 1.0f));
368                 float_write_helper(top_data + 5 * i + 3, res[i].x1 / (instance.argument.normalize ? img_w : 1.0f));
369                 float_write_helper(top_data + 5 * i + 4, res[i].y1 / (instance.argument.normalize ? img_h : 1.0f));
370             }
371
372             for (size_t i = res_num_rois; i < (size_t)instance.argument.post_nms_topn; i++)
373             {
374                 float_write_helper(top_data + 5*i + 0, -1.0f);
375                 float_write_helper(top_data + 5*i + 1,  0.0f);
376                 float_write_helper(top_data + 5*i + 2,  0.0f);
377                 float_write_helper(top_data + 5*i + 3,  0.0f);
378                 float_write_helper(top_data + 5*i + 4,  0.0f);
379             }
380         }
381     }
382
383     event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, proposal_inst& instance) override
384     {
385         for (auto& a : events)
386         {
387             a->wait();
388         }
389
390         auto ev = instance.get_network().get_engine().create_user_event(false);
391
392         if (instance.dep_memory(proposal_inst::cls_scores_index).get_layout().data_type == data_types::f16)
393         {
394             execute<data_type_to_type<data_types::f16>::type>(instance);
395         }
396         else
397         {
398             execute<data_type_to_type<data_types::f32>::type>(instance);
399         }
400
401         dynamic_cast<cldnn::user_event*>(ev.get())->set(); // set as complete
402         return ev;
403     }
404
405     static primitive_impl* create(const proposal_node& arg)
406     {
407         const layout & l = arg.image_info().get_output_layout();
408         const size_t count = static_cast<size_t>(l.size.feature[0]);
409
410         //Supported image_info sizes and components meaning:
411         // - image_info[3] = { img_height, img_width, img_depth }
412         // - image_info[4] = { img_height, img_width, scale_min_bbox_y, scale_min_bbox_x }
413         // - image_info[6] = { img_height, img_width, img_depth, scale_min_bbox_y, scale_min_bbox_x, scale_depth_index }
414         if (count != 3 && count != 4 && count != 6) {
415             CLDNN_ERROR_MESSAGE(arg.id(), "image_info must have either 3, 4 or 6 items");
416         }
417
418         return new proposal_gpu(arg);
419     }
420 };
421
422 namespace {
423     struct attach {
424         attach()
425         {
426             implementation_map<proposal>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), proposal_gpu::create);
427             implementation_map<proposal>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), proposal_gpu::create);
428         }
429
430         ~attach() {}
431     };
432     attach attach_impl;
433 }
434 } }