Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / tests / test_cases / detection_output_test.cpp
index df2799a..ff920a9 100644 (file)
@@ -123,6 +123,33 @@ public:
         }
     }
 
+    void init_buffer_sort(cldnn::memory input_buff)
+    {
+        auto input_data_ptr = input_buff.pointer<T>();
+
+        EXPECT_EQ((int)input_buff.count(), 128);
+
+        T* input_data = input_data_ptr.data();
+        input_data[0] = 8;
+        input_data[1] = 3;
+        input_data[16] = 0; input_data[17] = 0; input_data[18] = 0.6f; input_data[19] = 0.55f; input_data[20] = 0.55f; input_data[21] = 0.85f; input_data[22] = 0.85f;
+        input_data[23] = 0; input_data[24] = 0; input_data[25] = 0.4f; input_data[26] = 0.15f; input_data[27] = 0.55f; input_data[28] = 0.45f; input_data[29] = 0.85f;
+        input_data[30] = 0; input_data[31] = 0; input_data[32] = 0.2f; input_data[33] = 0.55f; input_data[34] = 0.15f; input_data[35] = 0.85f; input_data[36] = 0.45f;
+        input_data[37] = 0; input_data[38] = 0; input_data[39] = 0.0f; input_data[40] = 0.15f; input_data[41] = 0.15f; input_data[42] = 0.45f; input_data[43] = 0.45f;
+        input_data[44] = 0; input_data[45] = 1; input_data[46] = 1.0f; input_data[47] = 0.20f; input_data[48] = 0.20f; input_data[49] = 0.50f; input_data[50] = 0.50f;
+        input_data[51] = 0; input_data[52] = 1; input_data[53] = 0.8f; input_data[54] = 0.50f; input_data[55] = 0.20f; input_data[56] = 0.80f; input_data[57] = 0.50f;
+        input_data[58] = 0; input_data[59] = 1; input_data[60] = 0.6f; input_data[61] = 0.20f; input_data[62] = 0.50f; input_data[63] = 0.50f; input_data[64] = 0.80f;
+        input_data[65] = 0; input_data[66] = 1; input_data[67] = 0.4f; input_data[68] = 0.50f; input_data[69] = 0.50f; input_data[70] = 0.80f; input_data[71] = 0.80f;
+        input_data[72] = 1; input_data[73] = 0; input_data[74] = 1.0f; input_data[75] = 0.25f; input_data[76] = 0.25f; input_data[77] = 0.55f; input_data[78] = 0.55f;
+        input_data[79] = 1; input_data[80] = 0; input_data[81] = 0.4f; input_data[82] = 0.45f; input_data[83] = 0.45f; input_data[84] = 0.75f; input_data[85] = 0.75f;
+        input_data[86] = -1; input_data[87] = 0; input_data[88] = 0; input_data[89] = 0; input_data[90] = 0; input_data[91] = 0; input_data[92] = 0;
+        input_data[93] = -1; input_data[94] = 0; input_data[95] = 0; input_data[96] = 0; input_data[97] = 0; input_data[98] = 0; input_data[99] = 0;
+        input_data[100] = 1; input_data[101] = 1; input_data[102] = 0.6f; input_data[103] = 0.40f; input_data[104] = 0.40f; input_data[105] = 0.70f; input_data[106] = 0.70f;
+        input_data[107] = -1; input_data[108] = 0; input_data[109] = 0; input_data[110] = 0; input_data[111] = 0; input_data[112] = 0; input_data[113] = 0;
+        input_data[114] = -1; input_data[115] = 0; input_data[116] = 0; input_data[117] = 0; input_data[118] = 0; input_data[119] = 0; input_data[120] = 0;
+        input_data[121] = -1; input_data[122] = 0; input_data[123] = 0; input_data[124] = 0; input_data[125] = 0; input_data[126] = 0; input_data[127] = 0;
+    }
+
     void check_results(const memory& output, const int num, const std::string values)
     {
         assert(num < output.get_layout().size.spatial[1]);
@@ -145,526 +172,817 @@ public:
             EXPECT_TRUE(floating_point_equal(data[num * output.get_layout().size.spatial[0] + i], (T)(float)atof(items[i].c_str())));
         }
     }
-    static const int num_of_images = 2;
-    static const int num_classes = 2;
-    static const int num_priors = 4;
-    static const int img_size = 300;
-    const float nms_threshold;
-};
 
-typedef ::testing::Types<float, FLOAT16> detection_output_test_types;
-TYPED_TEST_CASE(detection_output_test, detection_output_test_types);
+    void setup_basic(bool runOnGPU)
+    {
+        const bool share_location = true;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 150;
 
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-TYPED_TEST(detection_output_test, test_setup_basic)
-{
-    const bool share_location = true;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 150;
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4} });
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
 
-    auto outputs = network.execute();
-    
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
-    
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
-}
+        auto outputs = network.execute();
 
-TYPED_TEST(detection_output_test, test_forward_share_location)
-{
-    const bool share_location = true;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 4;
-    const int background_label_id = 0;
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4} });
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+    }
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+    void setup_two_layers(bool runOnGPU)
+    {
+        const bool share_location = true;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 150;
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-    auto outputs = network.execute();
+        topology.add(detection_output("detection_output_1", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
+        topology.add(detection_output("detection_output_2", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
 
-    auto output_prim = outputs.begin()->second.get_memory();
-    
-    this->check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45");
-    this->check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45");
-    this->check_results(output_prim, 2, "0 1 0.6 0.15 0.55 0.45 0.85");
-    this->check_results(output_prim, 3, "0 1 0.4 0.55 0.55 0.85 0.85");
-    this->check_results(output_prim, 4, "1 1 0.6 0.45 0.45 0.75 0.75");
-    this->check_results(output_prim, 5, "1 1 0.0 0.25 0.25 0.55 0.55");
-    this->check_results(output_prim, 6, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 7, "-1 0 0 0 0 0 0");
-}
+        auto outputs = network.execute();
 
-TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k)
-{
-    const bool share_location = true;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 1;
-    const int background_label_id = 0;
+        EXPECT_EQ(outputs.size(), size_t(2));
+        unsigned i = 1;
+        for (auto it = outputs.begin(); it != outputs.begin(); it++)
+        {
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4} });
+            EXPECT_EQ(it->first, "detection_output_" + std::to_string(i));
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+            EXPECT_EQ(it->second.get_memory().get_layout().size.batch[0], 1);
+            EXPECT_EQ(it->second.get_memory().get_layout().size.feature[0], 1);
+            EXPECT_EQ(it->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+            EXPECT_EQ(it->second.get_memory().get_layout().size.spatial[0], 7);
+            i++;
+        }
+    }
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+    void forward_share_location(bool runOnGPU)
+    {
+        const bool share_location = true;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 4;
+        const int background_label_id = 0;
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-    auto outputs = network.execute();
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
 
-    auto output_prim = outputs.begin()->second.get_memory();
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-    this->check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45");
-    this->check_results(output_prim, 1, "1 1 0.6 0.45 0.45 0.75 0.75");
-}
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
 
-TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k)
-{
-    const bool share_location = true;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 6;
-    const int background_label_id = 0;
+        auto outputs = network.execute();
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4} });
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        auto output_prim = outputs.begin()->second.get_memory();
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+        check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45");
+        check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45");
+        check_results(output_prim, 2, "0 1 0.6 0.15 0.55 0.45 0.85");
+        check_results(output_prim, 3, "0 1 0.4 0.55 0.55 0.85 0.85");
+        check_results(output_prim, 4, "1 1 0.6 0.45 0.45 0.75 0.75");
+        check_results(output_prim, 5, "1 1 0.0 0.25 0.25 0.55 0.55");
+        check_results(output_prim, 6, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+    }
 
-    auto outputs = network.execute();
+    void forward_num_detections_greater_than_keep_top_k(bool runOnGPU)
+    {
+        const bool share_location = true;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 1;
+        const int background_label_id = 0;
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
 
-    auto output_prim = outputs.begin()->second.get_memory();
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-    this->check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45");
-    this->check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45");
-    this->check_results(output_prim, 2, "0 1 0.6 0.15 0.55 0.45 0.85");
-    this->check_results(output_prim, 3, "0 1 0.4 0.55 0.55 0.85 0.85");
-    this->check_results(output_prim, 4, "1 1 0.6 0.45 0.45 0.75 0.75");
-    this->check_results(output_prim, 5, "1 1 0.0 0.25 0.25 0.55 0.55");
-    this->check_results(output_prim, 6, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 7, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 8, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 9, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 10, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 11, "-1 0 0 0 0 0 0");
-}
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
 
-TYPED_TEST(detection_output_test, test_forward_share_location_top_k)
-{
-    const bool share_location = true;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 2;
-    const int top_k = 2;
-    const int background_label_id = 0;
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+        auto outputs = network.execute();
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
 
-    auto outputs = network.execute();
+        auto output_prim = outputs.begin()->second.get_memory();
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+        check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45");
+        check_results(output_prim, 1, "1 1 0.6 0.45 0.45 0.75 0.75");
+    }
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+    void forward_num_detections_smaller_than_keep_top_k(bool runOnGPU)
+    {
+        const bool share_location = true;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 6;
+        const int background_label_id = 0;
 
-    auto output_prim = outputs.begin()->second.get_memory();
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-    this->check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45");
-    this->check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45");
-    this->check_results(output_prim, 2, "1 1 0.6 0.45 0.45 0.75 0.75");
-    this->check_results(output_prim, 3, "-1 0 0 0 0 0 0");
-}
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
 
-TYPED_TEST(detection_output_test, test_forward_no_share_location)
-{
-    const bool share_location = false;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 10;
-    const int background_label_id = -1;
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45");
+        check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45");
+        check_results(output_prim, 2, "0 1 0.6 0.15 0.55 0.45 0.85");
+        check_results(output_prim, 3, "0 1 0.4 0.55 0.55 0.85 0.85");
+        check_results(output_prim, 4, "1 1 0.6 0.45 0.45 0.75 0.75");
+        check_results(output_prim, 5, "1 1 0.0 0.25 0.25 0.55 0.55");
+        check_results(output_prim, 6, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 8, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 9, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 10, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 11, "-1 0 0 0 0 0 0");
+    }
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+    void test_forward_share_location_top_k(bool runOnGPU)
+    {
+        const bool share_location = true;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 2;
+        const int top_k = 2;
+        const int background_label_id = 0;
 
-    auto outputs = network.execute();
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-    auto output_prim = outputs.begin()->second.get_memory();
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
 
-    this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
-    this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
-    this->check_results(output_prim, 2, "0 0 0.2 0.55 0.15 0.85 0.45");
-    this->check_results(output_prim, 3, "0 0 0.0 0.15 0.15 0.45 0.45");
-    this->check_results(output_prim, 4, "0 1 1.0 0.20 0.20 0.50 0.50");
-    this->check_results(output_prim, 5, "0 1 0.8 0.50 0.20 0.80 0.50");
-    this->check_results(output_prim, 6, "0 1 0.6 0.20 0.50 0.50 0.80");
-    this->check_results(output_prim, 7, "0 1 0.4 0.50 0.50 0.80 0.80");
-    this->check_results(output_prim, 8, "1 0 1.0 0.25 0.25 0.55 0.55");
-    this->check_results(output_prim, 9, "1 0 0.4 0.45 0.45 0.75 0.75");
-    this->check_results(output_prim, 10, "1 1 0.6 0.40 0.40 0.70 0.70");
-    this->check_results(output_prim, 11, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 12, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 13, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 14, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 15, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 16, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 17, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 18, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 19, "-1 0 0 0 0 0 0");
-}
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k)
-{
-    const bool share_location = false;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 4;
-    const int background_label_id = -1;
-    const int top_k = 2;
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+        auto outputs = network.execute();
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+        auto output_prim = outputs.begin()->second.get_memory();
 
-    auto outputs = network.execute();
+        check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45");
+        check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45");
+        check_results(output_prim, 2, "1 1 0.6 0.45 0.45 0.75 0.75");
+        check_results(output_prim, 3, "-1 0 0 0 0 0 0");
+    }
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+    void forward_no_share_location(bool runOnGPU)
+    {
+        const bool share_location = false;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 10;
+        const int background_label_id = -1;
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-    auto output_prim = outputs.begin()->second.get_memory();
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
 
-    this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
-    this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
-    this->check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50");
-    this->check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50");
-    this->check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55");
-    this->check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70");
-    this->check_results(output_prim, 6, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 7, "-1 0 0 0 0 0 0");
-}
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0)
-{
-    const bool share_location = false;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 5;
-    const int background_label_id = 0;
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
+        check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
+        check_results(output_prim, 2, "0 0 0.2 0.55 0.15 0.85 0.45");
+        check_results(output_prim, 3, "0 0 0.0 0.15 0.15 0.45 0.45");
+        check_results(output_prim, 4, "0 1 1.0 0.20 0.20 0.50 0.50");
+        check_results(output_prim, 5, "0 1 0.8 0.50 0.20 0.80 0.50");
+        check_results(output_prim, 6, "0 1 0.6 0.20 0.50 0.50 0.80");
+        check_results(output_prim, 7, "0 1 0.4 0.50 0.50 0.80 0.80");
+        check_results(output_prim, 8, "1 0 1.0 0.25 0.25 0.55 0.55");
+        check_results(output_prim, 9, "1 0 0.4 0.45 0.45 0.75 0.75");
+        check_results(output_prim, 10, "1 1 0.6 0.40 0.40 0.70 0.70");
+        check_results(output_prim, 11, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 12, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 13, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 14, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 15, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 16, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 17, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 18, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 19, "-1 0 0 0 0 0 0");
+    }
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+    void forward_no_share_location_top_k(bool runOnGPU)
+    {
+        const bool share_location = false;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 4;
+        const int background_label_id = -1;
+        const int top_k = 2;
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-    auto outputs = network.execute();
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
 
-    auto output_prim = outputs.begin()->second.get_memory();
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-    this->check_results(output_prim, 0, "0 1 1.0 0.20 0.20 0.50 0.50");
-    this->check_results(output_prim, 1, "0 1 0.8 0.50 0.20 0.80 0.50");
-    this->check_results(output_prim, 2, "0 1 0.6 0.20 0.50 0.50 0.80");
-    this->check_results(output_prim, 3, "0 1 0.4 0.50 0.50 0.80 0.80");
-    this->check_results(output_prim, 4, "1 1 0.6 0.40 0.40 0.70 0.70");
-    this->check_results(output_prim, 5, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 6, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 7, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 8, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 9, "-1 0 0 0 0 0 0");
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
+        check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
+        check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50");
+        check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50");
+        check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55");
+        check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70");
+        check_results(output_prim, 6, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+    }
+
+    void forward_no_share_location_neg_0(bool runOnGPU)
+    {
+        const bool share_location = false;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 5;
+        const int background_label_id = 0;
+
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
+
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
+
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 1 1.0 0.20 0.20 0.50 0.50");
+        check_results(output_prim, 1, "0 1 0.8 0.50 0.20 0.80 0.50");
+        check_results(output_prim, 2, "0 1 0.6 0.20 0.50 0.50 0.80");
+        check_results(output_prim, 3, "0 1 0.4 0.50 0.50 0.80 0.80");
+        check_results(output_prim, 4, "1 1 0.6 0.40 0.40 0.70 0.70");
+        check_results(output_prim, 5, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 6, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 8, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 9, "-1 0 0 0 0 0 0");
+    }
+
+    void forward_no_share_location_neg_0_top_k(bool runOnGPU)
+    {
+        const bool share_location = false;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 2;
+        const int background_label_id = 0;
+        const int top_k = 2;
+
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
+
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
+
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 1 1.0 0.20 0.20 0.50 0.50");
+        check_results(output_prim, 1, "0 1 0.8 0.50 0.20 0.80 0.50");
+        check_results(output_prim, 2, "1 1 0.6 0.40 0.40 0.70 0.70");
+        check_results(output_prim, 3, "-1 0 0 0 0 0 0");
+    }
+
+    void forward_no_share_location_top_k_input_padding(bool runOnGPU)
+    {
+        const bool share_location = false;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 4;
+        const int background_label_id = -1;
+        const int top_k = 2;
+
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        topology.add(reorder("input_location_padded", "input_location", input_location.get_layout().with_padding({ { 0, 0, 12, 3 },{ 0, 0, 5, 11 } })));
+        topology.add(reorder("input_confidence_padded", "input_confidence", input_location.get_layout().with_padding({ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
+
+        topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
+
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
+
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
+        check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
+        check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50");
+        check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50");
+        check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55");
+        check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70");
+        check_results(output_prim, 6, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+    }
+
+    void test_forward_no_share_location_top_k_faster_rcnn_case(bool runOnGPU)
+    {
+        const bool share_location = false;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 4;
+        const int background_label_id = -1;
+        const int top_k = 2;
+        const float eta = 1.0f;
+        const prior_box_code_type code_type = prior_box_code_type::corner;
+        const bool variance_encoded_in_target = true;
+        const float confidence_threshold = -std::numeric_limits<float>::max();
+        const int32_t prior_info_size = 5;
+        const int32_t prior_coordinates_offset = 1;
+        const bool prior_is_normalized = true;
+
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 1, 1, this->num_priors * prior_info_size } });
+
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location, variance_encoded_in_target,
+            prior_info_size, prior_coordinates_offset, prior_is_normalized);
+
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        topology.add(reorder("input_location_padded", "input_location", input_location.get_layout().with_padding({ { 0, 0, 12, 3 },{ 0, 0, 5, 11 } })));
+        topology.add(reorder("input_confidence_padded", "input_confidence", input_location.get_layout().with_padding({ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
+
+        topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box",
+            this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k,
+            eta, code_type, variance_encoded_in_target, confidence_threshold, prior_info_size, prior_coordinates_offset,
+            prior_is_normalized, this->img_size, this->img_size
+        ));
+
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
+
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
+        check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
+        check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50");
+        check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50");
+        check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55");
+        check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70");
+        check_results(output_prim, 6, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+    }
+
+    static const int num_of_images = 2;
+    static const int num_classes = 2;
+    static const int num_priors = 4;
+    static const int img_size = 300;
+    const float nms_threshold;
+};
+
+typedef ::testing::Types<float, FLOAT16> detection_output_test_types;
+TYPED_TEST_CASE(detection_output_test, detection_output_test_types);
+
+
+TYPED_TEST(detection_output_test, test_setup_basic)
+{
+    this->setup_basic(false);
 }
 
-TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k)
+TYPED_TEST(detection_output_test, test_setup_basic_gpu)
 {
-    const bool share_location = false;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 2;
-    const int background_label_id = 0;
-    const int top_k = 2;
+    this->setup_basic(true);
+}
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx, { this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx, { this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx, { 1, 2, 1, this->num_priors * 4 } });
+TYPED_TEST(detection_output_test, test_setup_two_layers)
+{
+    this->setup_two_layers(false);
+}
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+TYPED_TEST(detection_output_test, test_setup_two_layers_gpu)
+{
+    this->setup_two_layers(true);
+}
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+TYPED_TEST(detection_output_test, test_forward_share_location)
+{
+    this->forward_share_location(false);
+}
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+TYPED_TEST(detection_output_test, test_forward_share_location_gpu)
+{
+    this->forward_share_location(true);
+}
 
-    auto outputs = network.execute();
+TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k)
+{
+    this->forward_num_detections_greater_than_keep_top_k(false);
+}
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k_gpu)
+{
+    this->forward_num_detections_greater_than_keep_top_k(true);
+}
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k)
+{
+    this->forward_num_detections_smaller_than_keep_top_k(false);
+}
 
-    auto output_prim = outputs.begin()->second.get_memory();
+TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k_gpu)
+{
+    this->forward_num_detections_smaller_than_keep_top_k(true);
+}
 
-    this->check_results(output_prim, 0, "0 1 1.0 0.20 0.20 0.50 0.50");
-    this->check_results(output_prim, 1, "0 1 0.8 0.50 0.20 0.80 0.50");
-    this->check_results(output_prim, 2, "1 1 0.6 0.40 0.40 0.70 0.70");
-    this->check_results(output_prim, 3, "-1 0 0 0 0 0 0");
+TYPED_TEST(detection_output_test, test_forward_share_location_top_k)
+{
+    this->test_forward_share_location_top_k(false);
 }
 
-TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding)
+TYPED_TEST(detection_output_test, test_forward_share_location_top_k_gpu)
 {
-    const bool share_location = false;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 4;
-    const int background_label_id = -1;
-    const int top_k = 2;
+    this->test_forward_share_location_top_k(true);
+}
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+TYPED_TEST(detection_output_test, test_forward_no_share_location)
+{
+    this->forward_no_share_location(false);
+}
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
-    topology.add(reorder("input_location_padded", "input_location", input_location.get_layout().with_padding({ { 0, 0, 12, 3 },{ 0, 0, 5, 11 } })));
-    topology.add(reorder("input_confidence_padded", "input_confidence", input_location.get_layout().with_padding({ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
+TYPED_TEST(detection_output_test, test_forward_no_share_location_gpu)
+{
+    this->forward_no_share_location(true);
+}
 
-    topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k)
+{
+    this->forward_no_share_location_top_k(false);
+}
 
-    auto outputs = network.execute();
+TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_gpu)
+{
+    this->forward_no_share_location_top_k(true);
+}
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0)
+{
+    this->forward_no_share_location_neg_0(false);
+}
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_gpu)
+{
+    this->forward_no_share_location_neg_0(true);
+}
 
-    auto output_prim = outputs.begin()->second.get_memory();
+TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k)
+{
+    this->forward_no_share_location_neg_0_top_k(false);
+}
 
-    this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
-    this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
-    this->check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50");
-    this->check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50");
-    this->check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55");
-    this->check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70");
-    this->check_results(output_prim, 6, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k_gpu)
+{
+    this->forward_no_share_location_neg_0_top_k(true);
+}
+
+TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding)
+{
+    this->forward_no_share_location_top_k_input_padding(false);
+}
+
+TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding_gpu)
+{
+    this->forward_no_share_location_top_k_input_padding(true);
 }
 
 TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case)
 {
+    this->test_forward_no_share_location_top_k_faster_rcnn_case(false);
+}
+
+TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case_gpu)
+{
+    this->test_forward_no_share_location_top_k_faster_rcnn_case(true);
+}
+
+TYPED_TEST(detection_output_test, test_detection_output_sort_gpu)
+{
     const bool share_location = false;
     const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 4;
+    const int keep_top_k = 10;
     const int background_label_id = -1;
-    const int top_k = 2;
-    const float eta = 1.0f;
-    const prior_box_code_type code_type = prior_box_code_type::corner;
-    const bool variance_encoded_in_target = true;
-    const float confidence_threshold = -std::numeric_limits<float>::max();
-    const int32_t prior_info_size = 5;
-    const int32_t prior_coordinates_offset = 1;
-    const bool prior_is_normalized = true;
-
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 1, 1, this->num_priors * prior_info_size } });
-
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location, variance_encoded_in_target,
-                       prior_info_size, prior_coordinates_offset, prior_is_normalized);
+    const int top_k = -1;
+
+    const unsigned out_row_size = 7;
+    const unsigned score_space = ((this->num_of_images + 15) / 16) * 16;
+    int input_size = this->num_of_images * num_loc_classes * this->num_priors * out_row_size + score_space;
+
+    const auto& engine = get_test_engine();
+    cldnn::memory input_buff = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 1, 1, input_size } });
+
+    this->init_buffer_sort(input_buff);
 
     topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
-    topology.add(reorder("input_location_padded", "input_location", input_location.get_layout().with_padding({ { 0, 0, 12, 3 },{ 0, 0, 5, 11 } })));
-    topology.add(reorder("input_confidence_padded", "input_confidence", input_location.get_layout().with_padding({ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
-
-    topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box",
-                                  this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k,
-                                  eta, code_type, variance_encoded_in_target, confidence_threshold, prior_info_size, prior_coordinates_offset,
-                                  prior_is_normalized, this->img_size, this->img_size
-                    ));
+    topology.add(input_layout("input_location", input_buff.get_layout()));
+
+    topology.add(detection_output_sort("detection_output_sort", "input_location", this->num_of_images, this->num_classes, keep_top_k, share_location, top_k, background_label_id));
     network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+    network.set_input_data("input_location", input_buff);
 
     auto outputs = network.execute();
 
     EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+    EXPECT_EQ(outputs.begin()->first, "detection_output_sort");
 
     EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
     EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
@@ -675,11 +993,23 @@ TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rc
 
     this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
     this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
-    this->check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50");
-    this->check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50");
-    this->check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55");
-    this->check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70");
-    this->check_results(output_prim, 6, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 2, "0 0 0.2 0.55 0.15 0.85 0.45");
+    this->check_results(output_prim, 3, "0 0 0.0 0.15 0.15 0.45 0.45");
+    this->check_results(output_prim, 4, "0 1 1.0 0.20 0.20 0.50 0.50");
+    this->check_results(output_prim, 5, "0 1 0.8 0.50 0.20 0.80 0.50");
+    this->check_results(output_prim, 6, "0 1 0.6 0.20 0.50 0.50 0.80");
+    this->check_results(output_prim, 7, "0 1 0.4 0.50 0.50 0.80 0.80");
+    this->check_results(output_prim, 8, "1 0 1.0 0.25 0.25 0.55 0.55");
+    this->check_results(output_prim, 9, "1 0 0.4 0.45 0.45 0.75 0.75");
+    this->check_results(output_prim, 10, "1 1 0.6 0.40 0.40 0.70 0.70");
+    this->check_results(output_prim, 11, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 12, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 13, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 14, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 15, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 16, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 17, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 18, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 19, "-1 0 0 0 0 0 0");
 }