From: Dmitry Matveev
Date: Wed, 27 Nov 2019 14:54:17 +0000 (+0300)
Subject: Merge pull request #15753 from dmatveev:dm/ng-5000-security_barrier-interactive_face
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1^2~47
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fb5e7964b366ddd6cc3d90e5edba7650c81e3caf;p=platform%2Fupstream%2Fopencv.git

Merge pull request #15753 from dmatveev:dm/ng-5000-security_barrier-interactive_face

G-API: Introduced Security Barrier & Interactive Face Detection samples

* G-API-NG/Samples: Added samples & relevant changes

  - Security barrier camera sample
  - Age/Gender/Emotions recognition sample
  - GIEBackend now loads CPU extension libraries
  - A couple of API-level workarounds added to deal with cv::Mat/Blob conversions

* G-API-NG/Samples: removed HAVE_INF_ENGINE remnants
---

diff --git a/modules/gapi/include/opencv2/gapi/infer/ie.hpp b/modules/gapi/include/opencv2/gapi/infer/ie.hpp
index f9f5979..6e8c2c3 100644
--- a/modules/gapi/include/opencv2/gapi/infer/ie.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/ie.hpp
@@ -25,6 +25,22 @@ namespace ie {

 GAPI_EXPORTS cv::gapi::GBackend backend();

+/**
+ * Specify how G-API and IE should trait input data
+ *
+ * In OpenCV, the same cv::Mat is used to represent both
+ * image and tensor data. Sometimes those are hardly distinguishable,
+ * so this extra parameter is used to give G-API a hint.
+ *
+ * This hint controls how G-API reinterprets the data when converting
+ * it to IE Blob format (and which layout/etc is assigned to this data).
+ */
+enum class TraitAs: int
+{
+    TENSOR, //!< G-API traits an associated cv::Mat as a raw tensor and passes dimensions as-is
+    IMAGE   //!< G-API traits an associated cv::Mat as an image so creates an "image" blob (NCHW/NHWC, etc)
+};
+
 namespace detail {
     struct ParamDesc {
         std::string model_path;
@@ -35,7 +51,8 @@ namespace detail {
         std::vector<std::string> input_names;
         std::vector<std::string> output_names;

-        std::unordered_map<std::string, cv::Mat> const_inputs;
+        using ConstInput = std::pair<cv::Mat, TraitAs>;
+        std::unordered_map<std::string, ConstInput> const_inputs;

         // NB: nun_* may differ from topology's real input/output port numbers
         // (e.g.
topology's partial execution) @@ -83,8 +100,9 @@ public: } Params& constInput(const std::string &layer_name, - const cv::Mat &data) { - desc.const_inputs[layer_name] = data; + const cv::Mat &data, + TraitAs hint = TraitAs::TENSOR) { + desc.const_inputs[layer_name] = {data, hint}; return *this; } diff --git a/modules/gapi/src/backends/ie/giebackend.cpp b/modules/gapi/src/backends/ie/giebackend.cpp index 3f01bfd..d754a27 100644 --- a/modules/gapi/src/backends/ie/giebackend.cpp +++ b/modules/gapi/src/backends/ie/giebackend.cpp @@ -12,7 +12,7 @@ #ifdef HAVE_INF_ENGINE -#if INF_ENGINE_RELEASE <= 2018050000 +#if INF_ENGINE_RELEASE <= 2019010000 # error G-API IE module supports only OpenVINO IE >= 2019 R1 #endif @@ -26,11 +26,13 @@ #include #include +#include +#include + #include #include #include #include - #include #include "compiler/gobjref.hpp" @@ -66,6 +68,21 @@ inline std::vector toCV(const IE::SizeVector &vsz) { return result; } +inline IE::Layout toIELayout(const std::size_t ndims) { + static const IE::Layout lts[] = { + IE::Layout::SCALAR, + IE::Layout::C, + IE::Layout::NC, + IE::Layout::CHW, + IE::Layout::NCHW, + IE::Layout::NCDHW, + }; + // FIXME: This is not really a good conversion, + // since it may also stand for NHWC/HW/CN/NDHWC data + CV_Assert(ndims < sizeof(lts) / sizeof(lts[0])); + return lts[ndims]; +} + inline IE::Precision toIE(int depth) { switch (depth) { case CV_8U: return IE::Precision::U8; @@ -83,13 +100,16 @@ inline int toCV(IE::Precision prec) { return -1; } -inline IE::TensorDesc toIE(const cv::Mat &mat) { +inline IE::TensorDesc toIE(const cv::Mat &mat, cv::gapi::ie::TraitAs hint) { const auto &sz = mat.size; // NB: For some reason RGB image is 2D image // (since channel component is not counted here). - if (sz.dims() == 2) { + // Note: regular 2D vectors also fall into this category + if (sz.dims() == 2 && hint == cv::gapi::ie::TraitAs::IMAGE) + { // NB: This logic is mainly taken from IE samples + const size_t pixsz = CV_ELEM_SIZE1(mat.type()); const size_t channels = mat.channels(); const size_t height = mat.size().height; const size_t width = mat.size().width; @@ -98,8 +118,8 @@ inline IE::TensorDesc toIE(const cv::Mat &mat) { const size_t strideW = mat.step.buf[1]; const bool is_dense = - strideW == channels && - strideH == channels * width; + strideW == pixsz * channels && + strideH == strideW * width; if (!is_dense) cv::util::throw_error(std::logic_error("Doesn't support conversion" @@ -110,12 +130,11 @@ inline IE::TensorDesc toIE(const cv::Mat &mat) { IE::Layout::NHWC); } - GAPI_Assert(sz.dims() == 4); // NB: Will relax when needed (to known use) - return IE::TensorDesc(toIE(mat.depth()), toIE(sz), IE::Layout::NCHW); + return IE::TensorDesc(toIE(mat.depth()), toIE(sz), toIELayout(sz.dims())); } -inline IE::Blob::Ptr wrapIE(const cv::Mat &mat) { - const auto tDesc = toIE(mat); +inline IE::Blob::Ptr wrapIE(const cv::Mat &mat, cv::gapi::ie::TraitAs hint) { + const auto tDesc = toIE(mat, hint); switch (mat.depth()) { // NB: Seems there's no way to create an untyped (T-less) Blob::Ptr // in IE given only precision via TensorDesc. So we have to do this: @@ -187,15 +206,62 @@ struct IEUnit { } // This method is [supposed to be] called at Island compilation stage + // TODO: Move to a new OpenVINO Core API! 
cv::gimpl::ie::IECompiled compile() const { auto this_plugin = IE::PluginDispatcher().getPluginByDevice(params.device_id); + + // Load extensions (taken from DNN module) + if (params.device_id == "CPU" || params.device_id == "FPGA") + { + const std::string suffixes[] = { "_avx2", "_sse4", ""}; + const bool haveFeature[] = { + cv::checkHardwareSupport(CPU_AVX2), + cv::checkHardwareSupport(CPU_SSE4_2), + true + }; + std::vector candidates; + for (auto &&it : ade::util::zip(ade::util::toRange(suffixes), + ade::util::toRange(haveFeature))) + { + std::string suffix; + bool available = false; + std::tie(suffix, available) = it; + if (!available) continue; +#ifdef _WIN32 + candidates.push_back("cpu_extension" + suffix + ".dll"); +#elif defined(__APPLE__) + candidates.push_back("libcpu_extension" + suffix + ".so"); // built as loadable module + candidates.push_back("libcpu_extension" + suffix + ".dylib"); // built as shared library +#else + candidates.push_back("libcpu_extension" + suffix + ".so"); +#endif // _WIN32 + } + for (auto &&extlib : candidates) + { + try + { + this_plugin.AddExtension(IE::make_so_pointer(extlib)); + CV_LOG_INFO(NULL, "DNN-IE: Loaded extension plugin: " << extlib); + break; + } + catch(...) + { + CV_LOG_WARNING(NULL, "Failed to load IE extension " << extlib); + } + } + } + auto this_network = this_plugin.LoadNetwork(net, {}); // FIXME: 2nd parameter to be // configurable via the API auto this_request = this_network.CreateInferRequest(); // Bind const data to infer request for (auto &&p : params.const_inputs) { - this_request.SetBlob(p.first, wrapIE(p.second)); + // FIXME: SetBlob is known to be inefficient, + // it is worth to make a customizable "initializer" and pass the + // cv::Mat-wrapped blob there to support IE's optimal "GetBlob idiom" + // Still, constant data is to set only once. + this_request.SetBlob(p.first, wrapIE(p.second.first, p.second.second)); } return {this_plugin, this_network, this_request}; @@ -444,7 +510,9 @@ struct Infer: public cv::detail::KernelTag { // (A memory dialog comes to the picture again) const cv::Mat this_mat = to_ocv(ctx.inMat(i)); - IE::Blob::Ptr this_blob = wrapIE(this_mat); + // FIXME: By default here we trait our inputs as images. + // May be we need to make some more intelligence here about it + IE::Blob::Ptr this_blob = wrapIE(this_mat, cv::gapi::ie::TraitAs::IMAGE); iec.this_request.SetBlob(uu.params.input_names[i], this_blob); } iec.this_request.Infer(); @@ -514,7 +582,8 @@ struct InferList: public cv::detail::KernelTag { const auto& in_roi_vec = ctx.inArg(0u).rref(); const cv::Mat this_mat = to_ocv(ctx.inMat(1u)); - IE::Blob::Ptr this_blob = wrapIE(this_mat); + // Since we do a ROI list inference, always assume our input buffer is image + IE::Blob::Ptr this_blob = wrapIE(this_mat, cv::gapi::ie::TraitAs::IMAGE); // FIXME: This could be done ONCE at graph compile stage! 
std::vector< std::vector > cached_dims(uu.params.num_out); @@ -601,10 +670,11 @@ std::vector cv::gapi::ie::util::to_ocv(const InferenceEngine::SizeVector &d } InferenceEngine::Blob::Ptr cv::gapi::ie::util::to_ie(cv::Mat &blob) { - return wrapIE(blob); + return wrapIE(blob, cv::gapi::ie::TraitAs::IMAGE); } -#else +#else // HAVE_INF_ENGINE + cv::gapi::GBackend cv::gapi::ie::backend() { // Still provide this symbol to avoid linking issues util::throw_error(std::runtime_error("G-API has been compiled without OpenVINO IE support")); diff --git a/samples/cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp b/samples/cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp new file mode 100644 index 0000000..e4a5be7 --- /dev/null +++ b/samples/cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp @@ -0,0 +1,352 @@ +#include "opencv2/opencv_modules.hpp" +#if defined(HAVE_OPENCV_GAPI) + +#include +#include + +#include "opencv2/imgproc.hpp" +#include "opencv2/highgui.hpp" + +#include "opencv2/gapi.hpp" +#include "opencv2/gapi/core.hpp" +#include "opencv2/gapi/imgproc.hpp" +#include "opencv2/gapi/infer.hpp" +#include "opencv2/gapi/infer/ie.hpp" +#include "opencv2/gapi/cpu/gcpukernel.hpp" +#include "opencv2/gapi/streaming/cap.hpp" + +namespace { +const std::string about = + "This is an OpenCV-based version of Security Barrier Camera example"; +const std::string keys = + "{ h help | | print this help message }" + "{ input | | Path to an input video file }" + "{ fdm | | IE face detection model IR }" + "{ fdw | | IE face detection model weights }" + "{ fdd | | IE face detection device }" + "{ agem | | IE age/gender recognition model IR }" + "{ agew | | IE age/gender recognition model weights }" + "{ aged | | IE age/gender recognition model device }" + "{ emom | | IE emotions recognition model IR }" + "{ emow | | IE emotions recognition model weights }" + "{ emod | | IE emotions recognition model device }" + "{ pure | | When set, no output is displayed. Useful for benchmarking }"; + +struct Avg { + struct Elapsed { + explicit Elapsed(double ms) : ss(ms/1000.), mm(static_cast(ss)/60) {} + const double ss; + const int mm; + }; + + using MS = std::chrono::duration>; + using TS = std::chrono::time_point; + TS started; + + void start() { started = now(); } + TS now() const { return std::chrono::high_resolution_clock::now(); } + double tick() const { return std::chrono::duration_cast(now() - started).count(); } + Elapsed elapsed() const { return Elapsed{tick()}; } + double fps(std::size_t n) const { return static_cast(n) / (tick() / 1000.); } +}; +std::ostream& operator<<(std::ostream &os, const Avg::Elapsed &e) { + os << e.mm << ':' << (e.ss - 60*e.mm); + return os; +} +} // namespace + +namespace custom { +// Describe networks we use in our program. +// In G-API, topologies act like "operations". Here we define our +// topologies as operations which have inputs and outputs. + +// Every network requires three parameters to define: +// 1) Network's TYPE name - this TYPE is then used as a template +// parameter to generic functions like cv::gapi::infer<>(), +// and is used to define network's configuration (per-backend). +// 2) Network's SIGNATURE - a std::function<>-like record which defines +// networks' input and output parameters (its API) +// 3) Network's IDENTIFIER - a string defining what the network is. +// Must be unique within the pipeline. 
+ +// Note: these definitions are neutral to _how_ the networks are +// executed. The _how_ is defined at graph compilation stage (via parameters), +// not on the graph construction stage. + +// Face detector: takes one Mat, returns another Mat +G_API_NET(Faces, , "face-detector"); + +// Age/Gender recognition - takes one Mat, returns two: +// one for Age and one for Gender. In G-API, multiple-return-value operations +// are defined using std::tuple<>. +using AGInfo = std::tuple; +G_API_NET(AgeGender, , "age-gender-recoginition"); + +// Emotion recognition - takes one Mat, returns another. +G_API_NET(Emotions, , "emotions-recognition"); + +// SSD Post-processing function - this is not a network but a kernel. +// The kernel body is declared separately, this is just an interface. +// This operation takes two Mats (detections and the source image), +// and returns a vector of ROI (filtered by a default threshold). +// Threshold (or a class to select) may become a parameter, but since +// this kernel is custom, it doesn't make a lot of sense. +G_API_OP(PostProc, (cv::GMat, cv::GMat)>, "custom.fd_postproc") { + static cv::GArrayDesc outMeta(const cv::GMatDesc &, const cv::GMatDesc &) { + // This function is required for G-API engine to figure out + // what the output format is, given the input parameters. + // Since the output is an array (with a specific type), + // there's nothing to describe. + return cv::empty_array_desc(); + } +}; + +GAPI_OCV_KERNEL(OCVPostProc, PostProc) { + static void run(const cv::Mat &in_ssd_result, + const cv::Mat &in_frame, + std::vector &out_faces) { + const int MAX_PROPOSALS = 200; + const int OBJECT_SIZE = 7; + const cv::Size upscale = in_frame.size(); + const cv::Rect surface({0,0}, upscale); + + out_faces.clear(); + + const float *data = in_ssd_result.ptr(); + for (int i = 0; i < MAX_PROPOSALS; i++) { + const float image_id = data[i * OBJECT_SIZE + 0]; // batch id + const float confidence = data[i * OBJECT_SIZE + 2]; + const float rc_left = data[i * OBJECT_SIZE + 3]; + const float rc_top = data[i * OBJECT_SIZE + 4]; + const float rc_right = data[i * OBJECT_SIZE + 5]; + const float rc_bottom = data[i * OBJECT_SIZE + 6]; + + if (image_id < 0.f) { // indicates end of detections + break; + } + if (confidence < 0.5f) { // fixme: hard-coded snapshot + continue; + } + + cv::Rect rc; + rc.x = static_cast(rc_left * upscale.width); + rc.y = static_cast(rc_top * upscale.height); + rc.width = static_cast(rc_right * upscale.width) - rc.x; + rc.height = static_cast(rc_bottom * upscale.height) - rc.y; + out_faces.push_back(rc & surface); + } + } +}; +} // namespace custom + +namespace labels { +const std::string genders[] = { + "Female", "Male" +}; +const std::string emotions[] = { + "neutral", "happy", "sad", "surprise", "anger" +}; +namespace { +void DrawResults(cv::Mat &frame, + const std::vector &faces, + const std::vector &out_ages, + const std::vector &out_genders, + const std::vector &out_emotions) { + CV_Assert(faces.size() == out_ages.size()); + CV_Assert(faces.size() == out_genders.size()); + CV_Assert(faces.size() == out_emotions.size()); + + for (auto it = faces.begin(); it != faces.end(); ++it) { + const auto idx = std::distance(faces.begin(), it); + const auto &rc = *it; + + const float *ages_data = out_ages[idx].ptr(); + const float *genders_data = out_genders[idx].ptr(); + const float *emotions_data = out_emotions[idx].ptr(); + const auto gen_id = std::max_element(genders_data, genders_data + 2) - genders_data; + const auto emo_id = 
std::max_element(emotions_data, emotions_data + 5) - emotions_data; + + std::stringstream ss; + ss << static_cast(ages_data[0]*100) + << ' ' + << genders[gen_id] + << ' ' + << emotions[emo_id]; + + const int ATTRIB_OFFSET = 15; + cv::rectangle(frame, rc, {0, 255, 0}, 4); + cv::putText(frame, ss.str(), + cv::Point(rc.x, rc.y - ATTRIB_OFFSET), + cv::FONT_HERSHEY_COMPLEX_SMALL, + 1, + cv::Scalar(0, 0, 255)); + } +} + +void DrawFPS(cv::Mat &frame, std::size_t n, double fps) { + std::ostringstream out; + out << "FRAME " << n << ": " + << std::fixed << std::setprecision(2) << fps + << " FPS (AVG)"; + cv::putText(frame, out.str(), + cv::Point(0, frame.rows), + cv::FONT_HERSHEY_SIMPLEX, + 1, + cv::Scalar(0, 255, 0), + 2); +} +} // anonymous namespace +} // namespace labels + +int main(int argc, char *argv[]) +{ + cv::CommandLineParser cmd(argc, argv, keys); + cmd.about(about); + if (cmd.has("help")) { + cmd.printMessage(); + return 0; + } + const std::string input = cmd.get("input"); + const bool no_show = cmd.get("pure"); + + // Express our processing pipeline. Lambda-based constructor + // is used to keep all temporary objects in a dedicated scope. + cv::GComputation pp([]() { + // Declare an empty GMat - the beginning of the pipeline. + cv::GMat in; + + // Run face detection on the input frame. Result is a single GMat, + // internally representing an 1x1x200x7 SSD output. + // This is a single-patch version of infer: + // - Inference is running on the whole input image; + // - Image is converted and resized to the network's expected format + // automatically. + cv::GMat detections = cv::gapi::infer(in); + + // Parse SSD output to a list of ROI (rectangles) using + // a custom kernel. Note: parsing SSD may become a "standard" kernel. + cv::GArray faces = custom::PostProc::on(detections, in); + + // Now run Age/Gender model on every detected face. This model has two + // outputs (for age and gender respectively). + // A special ROI-list-oriented form of infer<>() is used here: + // - First input argument is the list of rectangles to process, + // - Second one is the image where to take ROI from; + // - Crop/Resize/Layout conversion happens automatically for every image patch + // from the list + // - Inference results are also returned in form of list (GArray<>) + // - Since there're two outputs, infer<> return two arrays (via std::tuple). + cv::GArray ages; + cv::GArray genders; + std::tie(ages, genders) = cv::gapi::infer(faces, in); + + // Recognize emotions on every face. + // ROI-list-oriented infer<>() is used here as well. + // Since custom::Emotions network produce a single output, only one + // GArray<> is returned here. + cv::GArray emotions = cv::gapi::infer(faces, in); + + // Return the decoded frame as a result as well. + // Input matrix can't be specified as output one, so use copy() here + // (this copy will be optimized out in the future). + cv::GMat frame = cv::gapi::copy(in); + + // Now specify the computation's boundaries - our pipeline consumes + // one images and produces five outputs. + return cv::GComputation(cv::GIn(in), + cv::GOut(frame, faces, ages, genders, emotions)); + }); + + // Note: it might be very useful to have dimensions loaded at this point! + // After our computation is defined, specify how it should be executed. + // Execution is defined by inference backends and kernel backends we use to + // compile the pipeline (it is a different step). + + // Declare IE parameters for FaceDetection network. 
Note here custom::Face + // is the type name we specified in GAPI_NETWORK() previously. + // cv::gapi::ie::Params<> is a generic configuration description which is + // specialized to every particular network we use. + // + // OpenCV DNN backend will have its own parmater structure with settings + // relevant to OpenCV DNN module. Same applies to other possible inference + // backends, like cuDNN, etc (:-)) + auto det_net = cv::gapi::ie::Params { + cmd.get("fdm"), // read cmd args: path to topology IR + cmd.get("fdw"), // read cmd args: path to weights + cmd.get("fdd"), // read cmd args: device specifier + }; + + auto age_net = cv::gapi::ie::Params { + cmd.get("agem"), // read cmd args: path to topology IR + cmd.get("agew"), // read cmd args: path to weights + cmd.get("aged"), // read cmd args: device specifier + }.cfgOutputLayers({ "age_conv3", "prob" }); + + auto emo_net = cv::gapi::ie::Params { + cmd.get("emom"), // read cmd args: path to topology IR + cmd.get("emow"), // read cmd args: path to weights + cmd.get("emod"), // read cmd args: device specifier + }; + + // Form a kernel package (with a single OpenCV-based implementation of our + // post-processing) and a network package (holding our three networks).x + auto kernels = cv::gapi::kernels(); + auto networks = cv::gapi::networks(det_net, age_net, emo_net); + + // Compile our pipeline for a specific input image format (TBD - can be relaxed) + // and pass our kernels & networks as parameters. + // This is the place where G-API learns which networks & kernels we're actually + // operating with (the graph description itself known nothing about that). + auto cc = pp.compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size(1280,720)}, + cv::compile_args(kernels, networks)); + + std::cout << "Reading " << input << std::endl; + cc.setSource(cv::gapi::wip::make_src(input)); + + Avg avg; + avg.start(); + cc.start(); + + cv::Mat frame; + std::vector faces; + std::vector out_ages; + std::vector out_genders; + std::vector out_emotions; + std::size_t frames = 0u; + + // Implement different execution policies depending on the display option + // for the best performance. + while (cc.running()) { + auto out_vector = cv::gout(frame, faces, out_ages, out_genders, out_emotions); + if (no_show) { + // This is purely a video processing. No need to balance with UI rendering. + // Use a blocking pull() to obtain data. Break the loop if the stream is over. + if (!cc.pull(std::move(out_vector))) + break; + } else if (!cc.try_pull(std::move(out_vector))) { + // Use a non-blocking try_pull() to obtain data. + // If there's no data, let UI refresh (and handle keypress) + if (cv::waitKey(1) >= 0) break; + else continue; + } + // At this point we have data for sure (obtained in either blocking or non-blocking way). 
+ frames++; + labels::DrawResults(frame, faces, out_ages, out_genders, out_emotions); + labels::DrawFPS(frame, frames, avg.fps(frames)); + if (!no_show) cv::imshow("Out", frame); + } + cc.stop(); + std::cout << "Processed " << frames << " frames in " << avg.elapsed() << std::endl; + + return 0; +} +#else +#include +int main() +{ + std::cerr << "This tutorial code requires G-API module " + "with Inference Engine backend to run" + << std::endl; + return 1; +} +#endif // HAVE_OPECV_GAPI diff --git a/samples/cpp/tutorial_code/gapi/security_barrier_camera/security_barrier_camera.cpp b/samples/cpp/tutorial_code/gapi/security_barrier_camera/security_barrier_camera.cpp new file mode 100644 index 0000000..db72ab9 --- /dev/null +++ b/samples/cpp/tutorial_code/gapi/security_barrier_camera/security_barrier_camera.cpp @@ -0,0 +1,351 @@ +#include "opencv2/opencv_modules.hpp" +#include +#if defined(HAVE_OPENCV_GAPI) + +#include +#include + +#include "opencv2/imgproc.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/gapi.hpp" +#include "opencv2/gapi/core.hpp" +#include "opencv2/gapi/imgproc.hpp" +#include "opencv2/gapi/infer.hpp" +#include "opencv2/gapi/infer/ie.hpp" +#include "opencv2/gapi/cpu/gcpukernel.hpp" +#include "opencv2/gapi/streaming/cap.hpp" +#include "opencv2/highgui.hpp" + +const std::string about = + "This is an OpenCV-based version of Security Barrier Camera example"; +const std::string keys = + "{ h help | | print this help message }" + "{ input | | Path to an input video file }" + "{ detm | | IE vehicle/license plate detection model IR }" + "{ detw | | IE vehicle/license plate detection model weights }" + "{ detd | | IE vehicle/license plate detection model device }" + "{ vehm | | IE vehicle attributes model IR }" + "{ vehw | | IE vehicle attributes model weights }" + "{ vehd | | IE vehicle attributes model device }" + "{ lprm | | IE license plate recognition model IR }" + "{ lprw | | IE license plate recognition model weights }" + "{ lprd | | IE license plate recognition model device }" + "{ pure | | When set, no output is displayed. 
Useful for benchmarking }" + "{ ser | | When set, runs a regular (serial) pipeline }"; + +namespace { +struct Avg { + struct Elapsed { + explicit Elapsed(double ms) : ss(ms/1000.), mm(static_cast(ss)/60) {} + const double ss; + const int mm; + }; + + using MS = std::chrono::duration>; + using TS = std::chrono::time_point; + TS started; + + void start() { started = now(); } + TS now() const { return std::chrono::high_resolution_clock::now(); } + double tick() const { return std::chrono::duration_cast(now() - started).count(); } + Elapsed elapsed() const { return Elapsed{tick()}; } + double fps(std::size_t n) const { return static_cast(n) / (tick() / 1000.); } +}; +std::ostream& operator<<(std::ostream &os, const Avg::Elapsed &e) { + os << e.mm << ':' << (e.ss - 60*e.mm); + return os; +} +} // namespace + + +namespace custom { +G_API_NET(VehicleLicenseDetector, , "vehicle-license-plate-detector"); + +using Attrs = std::tuple; +G_API_NET(VehicleAttributes, , "vehicle-attributes"); +G_API_NET(LPR, , "license-plate-recognition"); + +using GVehiclesPlates = std::tuple< cv::GArray + , cv::GArray >; +G_API_OP_M(ProcessDetections, + , + "custom.security_barrier.detector.postproc") { + static std::tuple + outMeta(const cv::GMatDesc &, const cv::GMatDesc) { + // FIXME: Need to get rid of this - literally there's nothing useful + return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc()); + } +}; + +GAPI_OCV_KERNEL(OCVProcessDetections, ProcessDetections) { + static void run(const cv::Mat &in_ssd_result, + const cv::Mat &in_frame, + std::vector &out_vehicles, + std::vector &out_plates) { + const int MAX_PROPOSALS = 200; + const int OBJECT_SIZE = 7; + const cv::Size upscale = in_frame.size(); + const cv::Rect surface({0,0}, upscale); + + out_vehicles.clear(); + out_plates.clear(); + + const float *data = in_ssd_result.ptr(); + for (int i = 0; i < MAX_PROPOSALS; i++) { + const float image_id = data[i * OBJECT_SIZE + 0]; // batch id + const float label = data[i * OBJECT_SIZE + 1]; + const float confidence = data[i * OBJECT_SIZE + 2]; + const float rc_left = data[i * OBJECT_SIZE + 3]; + const float rc_top = data[i * OBJECT_SIZE + 4]; + const float rc_right = data[i * OBJECT_SIZE + 5]; + const float rc_bottom = data[i * OBJECT_SIZE + 6]; + + if (image_id < 0.f) { // indicates end of detections + break; + } + if (confidence < 0.5f) { // fixme: hard-coded snapshot + continue; + } + + cv::Rect rc; + rc.x = static_cast(rc_left * upscale.width); + rc.y = static_cast(rc_top * upscale.height); + rc.width = static_cast(rc_right * upscale.width) - rc.x; + rc.height = static_cast(rc_bottom * upscale.height) - rc.y; + + using PT = cv::Point; + using SZ = cv::Size; + switch (static_cast(label)) { + case 1: out_vehicles.push_back(rc & surface); break; + case 2: out_plates.emplace_back((rc-PT(15,15)+SZ(30,30)) & surface); break; + default: CV_Assert(false && "Unknown object class"); + } + } + } +}; +} // namespace custom + +namespace labels { +const std::string colors[] = { + "white", "gray", "yellow", "red", "green", "blue", "black" +}; +const std::string types[] = { + "car", "van", "truck", "bus" +}; +const std::vector license_text = { + "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", + "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", + "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", + "U", "V", "W", "X", "Y", "Z" +}; +namespace { +void 
DrawResults(cv::Mat &frame, + const std::vector &vehicles, + const std::vector &out_colors, + const std::vector &out_types, + const std::vector &plates, + const std::vector &out_numbers) { + CV_Assert(vehicles.size() == out_colors.size()); + CV_Assert(vehicles.size() == out_types.size()); + CV_Assert(plates.size() == out_numbers.size()); + + for (auto it = vehicles.begin(); it != vehicles.end(); ++it) { + const auto idx = std::distance(vehicles.begin(), it); + const auto &rc = *it; + + const float *colors_data = out_colors[idx].ptr(); + const float *types_data = out_types [idx].ptr(); + const auto color_id = std::max_element(colors_data, colors_data + 7) - colors_data; + const auto type_id = std::max_element(types_data, types_data + 4) - types_data; + + const int ATTRIB_OFFSET = 25; + cv::rectangle(frame, rc, {0, 255, 0}, 4); + cv::putText(frame, labels::colors[color_id], + cv::Point(rc.x + 5, rc.y + ATTRIB_OFFSET), + cv::FONT_HERSHEY_COMPLEX_SMALL, + 1, + cv::Scalar(255, 0, 0)); + cv::putText(frame, labels::types[type_id], + cv::Point(rc.x + 5, rc.y + ATTRIB_OFFSET * 2), + cv::FONT_HERSHEY_COMPLEX_SMALL, + 1, + cv::Scalar(255, 0, 0)); + } + + for (auto it = plates.begin(); it != plates.end(); ++it) { + const int MAX_LICENSE = 88; + const int LPR_OFFSET = 50; + + const auto &rc = *it; + const auto idx = std::distance(plates.begin(), it); + + std::string result; + const auto *lpr_data = out_numbers[idx].ptr(); + for (int i = 0; i < MAX_LICENSE; i++) { + if (lpr_data[i] == -1) break; + result += labels::license_text[static_cast(lpr_data[i])]; + } + + const int y_pos = std::max(0, rc.y + rc.height - LPR_OFFSET); + cv::rectangle(frame, rc, {0, 0, 255}, 4); + cv::putText(frame, result, + cv::Point(rc.x, y_pos), + cv::FONT_HERSHEY_COMPLEX_SMALL, + 1, + cv::Scalar(0, 0, 255)); + } +} + +void DrawFPS(cv::Mat &frame, std::size_t n, double fps) { + std::ostringstream out; + out << "FRAME " << n << ": " + << std::fixed << std::setprecision(2) << fps + << " FPS (AVG)"; + cv::putText(frame, out.str(), + cv::Point(0, frame.rows), + cv::FONT_HERSHEY_SIMPLEX, + 1, + cv::Scalar(0, 0, 0), + 2); +} +} // anonymous namespace +} // namespace labels + +int main(int argc, char *argv[]) +{ + cv::CommandLineParser cmd(argc, argv, keys); + cmd.about(about); + if (cmd.has("help")) { + cmd.printMessage(); + return 0; + } + const std::string input = cmd.get("input"); + const bool no_show = cmd.get("pure"); + + cv::GComputation pp([]() { + cv::GMat in; + cv::GMat detections = cv::gapi::infer(in); + cv::GArray vehicles; + cv::GArray plates; + std::tie(vehicles, plates) = custom::ProcessDetections::on(detections, in); + cv::GArray colors; + cv::GArray types; + std::tie(colors, types) = cv::gapi::infer(vehicles, in); + cv::GArray numbers = cv::gapi::infer(plates, in); + cv::GMat frame = cv::gapi::copy(in); // pass-through the input frame + return cv::GComputation(cv::GIn(in), + cv::GOut(frame, vehicles, colors, types, plates, numbers)); + }); + + // Note: it might be very useful to have dimensions loaded at this point! 
+ auto det_net = cv::gapi::ie::Params { + cmd.get("detm"), // path to topology IR + cmd.get("detw"), // path to weights + cmd.get("detd"), // device specifier + }; + + auto attr_net = cv::gapi::ie::Params { + cmd.get("vehm"), // path to topology IR + cmd.get("vehw"), // path to weights + cmd.get("vehd"), // device specifier + }.cfgOutputLayers({ "color", "type" }); + + // Fill a special LPR input (seq_ind) with a predefined value + // First element is 0.f, the rest 87 are 1.f + const std::vector lpr_seq_dims = {88,1}; + cv::Mat lpr_seq(lpr_seq_dims, CV_32F, cv::Scalar(1.f)); + lpr_seq.ptr()[0] = 0.f; + auto lpr_net = cv::gapi::ie::Params { + cmd.get("lprm"), // path to topology IR + cmd.get("lprw"), // path to weights + cmd.get("lprd"), // device specifier + }.constInput("seq_ind", lpr_seq); + + auto kernels = cv::gapi::kernels(); + auto networks = cv::gapi::networks(det_net, attr_net, lpr_net); + + Avg avg; + cv::Mat frame; + std::vector vehicles, plates; + std::vector out_colors; + std::vector out_types; + std::vector out_numbers; + std::size_t frames = 0u; + + std::cout << "Reading " << input << std::endl; + + if (cmd.get("ser")) { + std::cout << "Going serial..." << std::endl; + cv::VideoCapture cap(input); + + auto cc = pp.compile(cv::GMatDesc{CV_8U,3,cv::Size(1920,1080)}, + cv::compile_args(kernels, networks)); + + avg.start(); + while (cv::waitKey(1) < 0) { + cap >> frame; + if (frame.empty()) break; + + cc(cv::gin(frame), + cv::gout(frame, vehicles, out_colors, out_types, plates, out_numbers)); + frames++; + labels::DrawResults(frame, vehicles, out_colors, out_types, plates, out_numbers); + labels::DrawFPS(frame, frames, avg.fps(frames)); + if (!no_show) cv::imshow("Out", frame); + } + } else { + std::cout << "Going pipelined..." << std::endl; + + auto cc = pp.compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size(1920,1080)}, + cv::compile_args(kernels, networks)); + + cc.setSource(cv::gapi::wip::make_src(input)); + + avg.start(); + cc.start(); + + // Implement different execution policies depending on the display option + // for the best performance. + while (cc.running()) { + auto out_vector = cv::gout(frame, vehicles, out_colors, out_types, plates, out_numbers); + if (no_show) { + // This is purely a video processing. No need to balance with UI rendering. + // Use a blocking pull() to obtain data. Break the loop if the stream is over. + if (!cc.pull(std::move(out_vector))) + break; + } else if (!cc.try_pull(std::move(out_vector))) { + // Use a non-blocking try_pull() to obtain data. + // If there's no data, let UI refresh (and handle keypress) + if (cv::waitKey(1) >= 0) break; + else continue; + } + // At this point we have data for sure (obtained in either blocking or non-blocking way). + frames++; + labels::DrawResults(frame, vehicles, out_colors, out_types, plates, out_numbers); + labels::DrawFPS(frame, frames, avg.fps(frames)); + if (!no_show) cv::imshow("Out", frame); + } + cc.stop(); + } + std::cout << "Processed " << frames << " frames in " << avg.elapsed() << std::endl; + + return 0; +} +#else +int main() +{ + std::cerr << "This tutorial code requires G-API module " + "with Inference Engine backend to run" + << std::endl; + return 1; +} +#endif // HAVE_OPECV_GAPI
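Notes (illustrative sketches; not part of the patch):

The TraitAs hint added to ie.hpp above controls how constInput() data is wrapped
into an IE blob. Below is a minimal configuration sketch in the spirit of the LPR
"seq_ind" constant input from the security barrier sample; the network type, file
names and layer name are placeholders, not taken from a real topology.

    #include <vector>
    #include <opencv2/core.hpp>
    #include <opencv2/gapi/infer.hpp>
    #include <opencv2/gapi/infer/ie.hpp>

    // Placeholder network type - any G_API_NET-declared topology is configured the same way.
    G_API_NET(ExampleNet, <cv::GMat(cv::GMat)>, "example-network");

    cv::gapi::ie::Params<ExampleNet> makeParams() {
        // An 88x1 auxiliary input filled once at configuration time
        // (mirrors the LPR "seq_ind" trick from the sample).
        const std::vector<int> dims = {88, 1};
        cv::Mat seq(dims, CV_32F, cv::Scalar(1.f));
        seq.ptr<float>()[0] = 0.f;

        return cv::gapi::ie::Params<ExampleNet> {
            "example.xml",   // placeholder: path to topology IR
            "example.bin",   // placeholder: path to weights
            "CPU",           // device specifier
        }.constInput("seq_ind", seq, cv::gapi::ie::TraitAs::TENSOR);
        // TENSOR (the default) passes the Mat dimensions to IE as-is;
        // TraitAs::IMAGE would make the backend build an NHWC "image" blob instead.
    }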
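The stride condition fixed in toIE() above compares byte strides rather than element
counts. For reference, the same "is dense" test written as a stand-alone helper
(a sketch mirroring the patched logic, not a call into the backend):

    #include <opencv2/core.hpp>

    // True if an interleaved 2D image Mat has no padding between pixels or rows -
    // the precondition the patched toIE() requires before building an NHWC blob.
    static bool isDenseImage(const cv::Mat &mat) {
        const size_t pixsz    = CV_ELEM_SIZE1(mat.type());            // bytes per channel element
        const size_t channels = static_cast<size_t>(mat.channels());
        const size_t width    = static_cast<size_t>(mat.size().width);
        const size_t strideH  = mat.step.buf[0];                       // bytes between rows
        const size_t strideW  = mat.step.buf[1];                       // bytes between pixels
        return strideW == pixsz * channels
            && strideH == strideW * width;
    }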
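The sample comments above describe two forms of cv::gapi::infer<>: whole-frame
inference and ROI-list inference. A condensed graph-construction sketch of both,
assuming the custom::Faces, custom::AgeGender and custom::PostProc declarations
(and the headers) from the age/gender/emotions sample are visible:

    cv::GComputation makeExampleGraph() {
        cv::GMat in;

        // Whole-frame form: the input image is converted and resized
        // to the face detector's input format automatically.
        cv::GMat detections = cv::gapi::infer<custom::Faces>(in);

        // The custom kernel turns the SSD output tensor into a list of rectangles.
        cv::GArray<cv::Rect> faces = custom::PostProc::on(detections, in);

        // ROI-list form: every rectangle is cropped from `in` and inferred
        // separately; each output comes back as a GArray<> with one entry per ROI.
        cv::GArray<cv::GMat> ages, genders;
        std::tie(ages, genders) = cv::gapi::infer<custom::AgeGender>(faces, in);

        // Pass the input frame through as well (as the samples do).
        cv::GMat frame = cv::gapi::copy(in);
        return cv::GComputation(cv::GIn(in),
                                cv::GOut(frame, faces, ages, genders));
    }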