2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
18 #include <gtest/gtest.h>
19 #include "api/CPP/memory.hpp"
20 #include <api/CPP/input_layout.hpp>
21 #include "api/CPP/reorder.hpp"
22 #include "api/CPP/crop.hpp"
23 #include <api/CPP/topology.hpp>
24 #include <api/CPP/network.hpp>
25 #include <api/CPP/engine.hpp>
26 #include "test_utils/test_utils.h"
27 #include <api/CPP/data.hpp>
30 #include <gmock/gmock.h>
33 using namespace cldnn;
34 using namespace tests;
35 using namespace testing;
// Checks that a plain layout reorder yxfb(f32) -> bfyx(f32) preserves all
// 16 element values (compared against a reference `answers` array).
// NOTE(review): this file appears to be a lossy extraction — the leading
// integers on each line are residual original line numbers and several
// statements (braces, set_values payloads, array initializers) are not
// visible. Confirm against the complete source before editing code.
37 TEST(reorder_gpu_f32, basic)
39 // Input : yxfb:2x2x2x2
40 // Output : bfyx:2x2x2x2
43 // f0: b0: 1 2 b1: 0 0
44 // f0: b0: 3 4 b1: 0.5 -0.5
45 // f1: b0: 5 6 b1: 1.5 5.2
46 // f1: b0: 7 8 b1: 12 8
62 const auto& engine = get_test_engine();
64 auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } });
65 layout output_layout(data_types::f32, format::bfyx,{ 2,2,2,2 });
82 input_layout("input", input.get_layout()),
83 reorder("reorder", "input", output_layout));
85 network network(engine, topology);
86 network.set_input_data("input", input);
88 auto outputs = network.execute();
// Exactly one output primitive is expected, and it must be the reorder node.
89 EXPECT_EQ(outputs.size(), size_t(1));
90 EXPECT_EQ(outputs.begin()->first, "reorder");
92 auto output = outputs.begin()->second.get_memory();
// Compare every output element against the precomputed reference values
// (the `answers` array definition is not visible in this extraction).
108 auto output_ptr = output.pointer<float>();
109 for (int i = 0; i < 16; i++)
111 EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
// Checks reorder with a per-element "subtract" input primitive: the values
// of the first batch of the subtract memory are subtracted during reorder.
116 TEST(reorder_gpu_f32, basic_subtract) {
119 // Subtract : 1x2x2x2 (only first batch is taken into consideration)
122 // f0: b0: 1 2 b1: 0 0
123 // f0: b0: 3 4 b1: 0.5 -0.5
124 // f1: b0: 5 6 b1: 1.5 5.2
125 // f1: b0: 7 8 b1: 12 8
148 const auto& engine = get_test_engine();
150 auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } });
151 layout output_layout( data_types::f32, format::bfyx, {2,2,2,2} );
// Subtract memory is byxf with batch 1; only that single batch is used.
152 auto subtract = memory::allocate(engine, { data_types::f32, format::byxf, { 1, 2, 2, 2 } });
168 set_values(subtract, {
169 1.0f, 4.0f, 1.5f, 3.0f,
170 2.0f, 2.0f, 2.5f, 1.0f,
174 input_layout("input", input.get_layout()),
175 input_layout("subtract", subtract.get_layout()),
// The third argument names the primitive supplying values to subtract.
176 reorder("reorder", "input", output_layout, "subtract"));
178 network network(engine, topology);
179 network.set_input_data("input", input);
180 network.set_input_data("subtract", subtract);
182 auto outputs = network.execute();
183 EXPECT_EQ(outputs.size(), size_t(1));
184 EXPECT_EQ(outputs.begin()->first, "reorder");
186 auto output = outputs.begin()->second.get_memory();
// Expected values after subtraction (initializer truncated in this view).
188 float answers[16] = { 0.0f, 0.5f,
201 auto output_ptr = output.pointer<float>();
202 for (int i = 0; i < 16; i++)
204 EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
// Checks reorder with per-feature scalar subtraction: one float per feature
// channel ({0.5, 2.5}) is subtracted from all elements of that channel.
208 TEST(reorder_gpu_f32, basic_subtract_value) {
209 // Values_to_subtract : 2
214 // f0: b0: 1 2 b1: 0 0
215 // f0: b0: 3 4 b1: 0.5 -0.5
216 // f1: b0: 5 6 b1: 1.5 5.2
217 // f1: b0: 7 8 b1: 12 8
237 const auto& engine = get_test_engine();
239 auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } });
240 layout output_layout(data_types::f32, format::bfyx,{ 2,2,2,2 });
// One subtract value per feature (2 features in the input).
241 std::vector<float> subtract_val = { 0.5, 2.5 };
258 topology.add(input_layout("input", input.get_layout()), reorder("reorder", "input", output_layout, subtract_val));
260 network network(engine, topology);
261 network.set_input_data("input", input);
263 auto outputs = network.execute();
264 EXPECT_EQ(outputs.size(), size_t(1));
265 EXPECT_EQ(outputs.begin()->first, "reorder");
267 auto output = outputs.begin()->second.get_memory();
269 float answers[16] = { 0.5f, 1.5f,
282 auto output_ptr = output.pointer<float>();
283 for (int i = 0; i < 16; i++)
285 EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
// Checks reorder from FP16 input to FP32 output while subtracting an FP32
// "subtract" data primitive. Skipped when the device lacks cl_khr_fp16.
289 TEST(reorder_gpu_f16, basic_subtract_f32_output_f32) {
290 // Input : 2x2x2x2 (FP16)
291 // Output : 2x2x2x2 (FP32)
292 // Subtract : 1x2x2x2 (FP32, only first batch is taken into consideration)
295 // f0: b0: 1 2 b1: 0 0
296 // f0: b0: 3 4 b1: 0.5 -0.5
297 // f1: b0: 5 6 b1: 1.5 5.2
298 // f1: b0: 7 8 b1: 12 8
300 // Subtract (FP32 - converted internally to FP16 before subtraction):
321 const auto& engine = get_test_engine();
// FP16 support is optional on OpenCL devices — skip rather than fail.
323 if (!engine.get_info().supports_fp16)
325 std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
330 auto input = memory::allocate(engine, { data_types::f16, format::yxfb, { 2, 2, 2, 2 } });
331 layout output_layout(data_types::f32, format::bfyx,{ 2,2,2,2 });
332 auto subtract = memory::allocate(engine, { data_types::f32, format::byxf, { 1, 2, 2, 2 } });
// Raw half-precision bit patterns; the decoded values are in the comments.
335 half_t(0x3C00), half_t(0x0000), // 1.f, 0.f,
336 half_t(0x4500), half_t(0x3E00), // 5.f, 1.5f,
338 half_t(0x4000), half_t(0x0000), // 2.f, 0.f,
339 half_t(0x4600), half_t(0x4533), // 6.f, 5.2f,
341 half_t(0x4200), half_t(0x3800), // 3.f, 0.5f,
342 half_t(0x4700), half_t(0x4A00), // 7.f, 12.f,
344 half_t(0x4400), half_t(0xB800), // 4.f, -0.5f,
345 half_t(0x4800), half_t(0x4800) // 8.f, 8.f
348 set_values(subtract, {
349 1.0f, 4.0f, 1.5f, 3.0f,
350 2.0f, 2.0f, 2.5f, 1.0f,
// Here the subtract source is a constant `data` primitive, not a network input.
354 topology.add(input_layout("input", input.get_layout()));
355 topology.add(data("subtract", subtract));
356 topology.add(reorder("reorder", "input", output_layout, "subtract"));
358 network network(engine, topology);
359 network.set_input_data("input", input);
361 auto outputs = network.execute();
362 EXPECT_EQ(outputs.size(), size_t(1));
363 EXPECT_EQ(outputs.begin()->first, "reorder");
365 auto output = outputs.begin()->second.get_memory();
367 float answers[16] = { 0.0f, 0.5f,
380 auto output_ptr = output.pointer<float>();
381 for (int i = 0; i < 16; i++)
383 EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
// Checks FP16 -> FP16 reorder with per-feature scalar subtraction; both
// inputs and the expected outputs are expressed as raw half bit patterns.
387 TEST(reorder_gpu_f16, basic_subtract_value) {
388 // Values_to_subtract : 2
389 // Input : 2x2x2x2 (FP16)
390 // Output : 2x2x2x2 (FP16)
393 // f0: b0: 1 2 b1: 0 0
394 // f0: b0: 3 4 b1: 0.5 -0.5
395 // f1: b0: 5 6 b1: 1.5 5.2
396 // f1: b0: 7 8 b1: 12 8
398 // subtract values (FP32 - converted internally to FP16 before subtraction)
416 const auto& engine = get_test_engine();
// Skip when the device cannot do half-precision math.
417 if (!engine.get_info().supports_fp16)
419 std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
424 auto input = memory::allocate(engine, { data_types::f16, format::yxfb, { 2, 2, 2, 2 } });
425 layout output_layout(data_types::f16, format::bfyx,{ 2,2,2,2 });
426 std::vector<float> subtract_val = { 0.5, 2.5 };
429 half_t(0x3C00), half_t(0x0000), // 1.f, 0.f,
430 half_t(0x4500), half_t(0x3E00), // 5.f, 1.5f,
432 half_t(0x4000), half_t(0x0000), // 2.f, 0.f,
433 half_t(0x4600), half_t(0x4533), // 6.f, 5.2f,
435 half_t(0x4200), half_t(0x3800), // 3.f, 0.5f,
436 half_t(0x4700), half_t(0x4A00), // 7.f, 12.f,
438 half_t(0x4400), half_t(0xB800), // 4.f, -0.5f,
439 half_t(0x4800), half_t(0x4800) // 8.f, 8.f
443 topology.add(input_layout("input", input.get_layout()));
444 topology.add(reorder("reorder", "input", output_layout, subtract_val));
446 network network(engine, topology);
447 network.set_input_data("input", input);
449 auto outputs = network.execute();
450 EXPECT_EQ(outputs.size(), size_t(1));
451 EXPECT_EQ(outputs.begin()->first, "reorder");
453 auto output = outputs.begin()->second.get_memory();
// Expected outputs as exact half bit patterns (decoded values in comments).
455 half_t answers[16] = { half_t(0x3800), half_t(0x3E00), // 0.5f, 1.5f,
456 half_t(0x4100), half_t(0x4300), // 2.5f, 3.5f,
458 half_t(0x4100), half_t(0x4300), // 2.5f, 3.5f,
459 half_t(0x4480), half_t(0x4580), // 4.5f, 5.5f,
461 half_t(0xB800), half_t(0xB800), // -0.5f, -0.5f,
462 half_t(0x0000), half_t(0xBC00), // 0.0f, -1.0f,
464 half_t(0xBC00), half_t(0x4166), // -1.0f, 2.7f,
465 half_t(0x48C0), half_t(0x4580) // 9.5f, 5.5f
// Compare bitwise (uint16_t) so half comparison avoids float rounding rules.
468 auto output_ptr = output.pointer<half_t>();
469 for (int i = 0; i < 16; i++)
471 EXPECT_TRUE(are_equal(static_cast<uint16_t>(answers[i]), static_cast<uint16_t>(output_ptr[i])));
// Round-trips the full unambiguous FP16 value range through FP32 and back
// (f16 -> f32 -> f16) and checks bit-exact preservation, plus spot-checks
// a handful of well-known values and the special values (+/-inf, -0, NaN).
475 TEST(reorder_gpu, basic_convert_f16_f32_f16) {
476 // Converts entire unambiguous range of FP16 numbers to FP32 and back.
478 // Input : 2x2x15873x1 (FP16)
479 // Intermediate : 1x2x2x15873 (FP32) {different mem format but the same ordering because batch is 1}
480 // Output : 2x2x15673x1 (FP16)
482 // Output is expected to contain the same value as input in range of indices from 0x0000 to 0xF801.
485 const auto& engine = get_test_engine();
487 if (!engine.get_info().supports_fp16)
489 std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
// Build the exhaustive test vector: all positive then all negative
// norm/denorm bit patterns, followed by four hand-picked special values.
494 std::vector<half_t> expected_values;
495 expected_values.resize(0xF804);
496 for (int i = 0; i < 0x7C00; ++i)
497 expected_values[i] = half_t(i); // norms/denorms/zero (positive).
498 for (int i = 0x7C00; i < 0xF800; ++i)
499 expected_values[i] = half_t(i + 0x0400); // norms/denorms (negative).
500 expected_values[0x7C00] = half_t(0x0000); // NOTE: do not do final test for negative 0 (-0).
502 expected_values[0xF800] = half_t(0x7C00); // +infinity
503 expected_values[0xF801] = half_t(0xFC00); // -infinity
504 // Special values (ambiguous ones).
505 expected_values[0xF802] = half_t(0x8000); // -0
506 expected_values[0xF803] = half_t(0xFC12); // A NaN (sample: -NaN.0x12).
508 auto input = memory::allocate(engine, { data_types::f16, format::yxfb, { 1, static_cast<int32_t>(expected_values.size()) / 4, 2, 2 } });
509 layout interm_layout( data_types::f32, format::byxf, { 1, static_cast<int32_t>(expected_values.size()) / 4, 2, 2 });
510 auto output_layout = input.get_layout();
512 set_values(input, expected_values);
// Two chained reorders: up-convert to f32, then back down to f16.
515 topology.add(input_layout("input", input.get_layout()));
516 topology.add(reorder("reorder_f16_f32", "input", interm_layout));
517 topology.add(reorder("reorder_f32_f16", "reorder_f16_f32", output_layout));
// Mark both reorders as outputs so the intermediate f32 buffer is inspectable.
523 build_option::outputs({"reorder_f16_f32", "reorder_f32_f16"})
526 network.set_input_data("input", input);
528 auto outputs = network.execute();
529 EXPECT_EQ(outputs.size(), size_t(2));
530 EXPECT_TRUE(outputs.find("reorder_f16_f32") != outputs.end());
531 EXPECT_TRUE(outputs.find("reorder_f32_f16") != outputs.end());
533 auto interm = outputs.at("reorder_f16_f32").get_memory();
534 auto interm_ptr = interm.pointer<float>();
// Spot-check exact powers of two in the f32 intermediate (index == f16 bits).
537 EXPECT_TRUE(are_equal(interm_ptr[0x3400], 0.25f));
538 EXPECT_TRUE(are_equal(interm_ptr[0x3800], 0.5f));
539 EXPECT_TRUE(are_equal(interm_ptr[0x3C00], 1.0f));
540 EXPECT_TRUE(are_equal(interm_ptr[0x4000], 2.0f));
541 EXPECT_TRUE(are_equal(interm_ptr[0x4400], 4.0f));
543 EXPECT_TRUE(are_equal(interm_ptr[0x3400 + 0x7C00], -0.25f));
544 EXPECT_TRUE(are_equal(interm_ptr[0x3800 + 0x7C00], -0.5f));
545 EXPECT_TRUE(are_equal(interm_ptr[0x3C00 + 0x7C00], -1.0f));
546 EXPECT_TRUE(are_equal(interm_ptr[0x4000 + 0x7C00], -2.0f));
547 EXPECT_TRUE(are_equal(interm_ptr[0x4400 + 0x7C00], -4.0f));
549 EXPECT_TRUE(are_equal(interm_ptr[0xF800], std::numeric_limits<float>::infinity()));
550 EXPECT_TRUE(are_equal(interm_ptr[0xF801], -std::numeric_limits<float>::infinity()));
551 EXPECT_TRUE(are_equal(interm_ptr[0xF802], -0.0f));
552 EXPECT_TRUE(std::isnan(interm_ptr[0xF803]));
554 auto output = outputs.at("reorder_f32_f16").get_memory();
555 auto output_ptr = output.pointer<half_t>();
556 for (int i = 0; i < 0xF802; ++i) // NOTE: do not test for possibly ambiguous values of floating point (-0, NaNs).
558 EXPECT_TRUE(are_equal(static_cast<uint16_t>(expected_values[i]), static_cast<uint16_t>(output_ptr[i])));
// Converts f32 -> int8 -> f32 via two chained reorders and checks the values
// survive the round trip (fractional parts are rounded by the i8 conversion).
563 TEST(reorder_gpu, basic_convert_int8) {
565 const auto& engine = get_test_engine();
566 layout in_layout = { type_to_data_type<float>::value,format::byxf,{ 1,1,3,3 } };
567 layout byte_layout = { type_to_data_type<int8_t>::value, format::bfyx,{ 1,1,3,3 } };
568 std::initializer_list<float> input_f = { 1.0f, -2.5f, 3.1f, -4.0f, 5.03f, -6.99f, 7.0f, -8.0f, 9.0f };
// Expected values after f32->i8->f32 (rounded to nearest integer).
569 std::list<float> final_results = { 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f, 9.0f };
571 // Allocate memory for input image.
572 auto input_memory = memory::allocate(engine, in_layout);
573 set_values(input_memory, input_f);
575 // Create input_layout description
576 // "input" - is the primitive id inside topology
577 input_layout input("input", in_layout);
580 // 1. input layout primitive.
582 // 2. reorder primitive with id "reorder_input"
583 reorder("reorder_input",
584 // input primitive for reorder (implicitly converted to primitive_id)
586 // output layout for reorder
// Second reorder converts the int8 data back to the original f32 layout.
588 reorder("reorder2", "reorder_input", in_layout)
595 build_option::outputs({ "reorder2"})
598 network.set_input_data("input", input_memory);
600 auto outputs = network.execute();
602 auto interm = outputs.at("reorder2").get_memory();
603 auto interm_ptr = interm.pointer<float>();
604 auto output_size = outputs.at("reorder2").get_memory().count();
605 unsigned int cntr = 0;
606 for (const auto& exp : final_results)
608 EXPECT_EQ(exp, interm_ptr[cntr++]);
// Converts a uint8 RGBA byxf image to f32 bfyx, then crops away the alpha
// feature; validates both the reordered intermediate and the cropped output
// by recomputing the expected source index for every target element.
612 TEST(reorder_gpu, basic_convert_uint8rgbabyxf_to_fp32_bfyx) {
613 // Converts an ARGB(uint8) image to common clDNN input of bfyx FP32
615 // Input : 1x5x5x4 (UINT8)
616 // Intermediate : 1x4x5x5 (FP32) {different mem format and ordering}
617 // Output : 1x3x5x5 (FP32) {using crop layer to reduce feature dimention and drop A from RGBA}
619 // Output is expected to contain the same value as input
621 const int kernel_size = 5;
622 const int feature_size = 4;
623 const auto& engine = get_test_engine();
625 if (!engine.get_info().supports_fp16)
627 std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
632 std::initializer_list<uint8_t> input_i8 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
633 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36,
634 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
635 155, 154, 153, 152, 151, 150, 149, 148, 147, 146, 145, 144, 143, 142, 141, 140, 139, 138, 137, 136,
636 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240, 239, 238, 237, 236
639 layout in_layout = { type_to_data_type<uint8_t>::value,format::byxf,{ 1,4,kernel_size,kernel_size } };
640 layout output_layout = { type_to_data_type<float>::value, format::bfyx, {1,4,kernel_size,kernel_size } };
642 // Allocate memory for input image.
643 auto input_memory = memory::allocate(engine, in_layout);
644 set_values(input_memory, input_i8);
646 // Create input_layout description
647 // "input" - is the primitive id inside topology
648 input_layout input("input", in_layout);
650 // Create topology object with 2 primitives
652 // 1. input layout primitive.
654 // 2. reorder primitive with id "reorder_input"
655 reorder("reorder_input",
656 // input primitive for reorder (implicitly converted to primitive_id)
658 // output layout for reorder
// Crop keeps features [0, 3) — dropping the 4th (alpha) channel.
662 tensor crop_reference_input_tensor(spatial(kernel_size, kernel_size), batch(1), feature(4 - 1));
663 tensor crop_offset_tensor(spatial(0, 0), batch(0), feature(0));
664 padding output_padding = padding({ 0,0,0,0 }, { 0,0,0,0 }, 0);
666 // cropping primitive with id "crop1"
668 "reorder_input", // primitive id of the cropping input
669 crop_reference_input_tensor, // input tensor
670 crop_offset_tensor, // bias primitive id
679 build_option::outputs({ "reorder_input", "crop" })
682 network.set_input_data("input", input_memory);
684 auto outputs = network.execute();
685 EXPECT_EQ(outputs.size(), size_t(2));
686 EXPECT_TRUE(outputs.find("reorder_input") != outputs.end());
687 EXPECT_TRUE(outputs.find("crop") != outputs.end());
689 auto interm = outputs.at("reorder_input").get_memory();
690 auto interm_ptr = interm.pointer<float>();
691 auto interm_size = outputs.at("reorder_input").get_memory().count();
692 EXPECT_EQ(interm_size,(size_t) (1*feature_size*kernel_size*kernel_size));
// Verify the byxf -> bfyx permutation element by element: map each linear
// byxf source index to its bfyx target index and compare values.
695 EXPECT_TRUE(are_equal(interm_ptr[0], 1.0f));
696 size_t source_index = 0;
697 size_t target_index = 0;
698 std::vector<uint8_t> testinput;// This will be used to direct access elements of test input in the next test
699 for (auto it = input_i8.begin(); it < input_i8.end(); it++)
703 testinput.push_back(val); // This will be used to direct access elements of test input in the next test
704 size_t current_feature = source_index % feature_size;
705 size_t current_x = (source_index / feature_size) % kernel_size;
706 size_t current_y = (source_index / (feature_size * kernel_size));
707 target_index = current_x + current_y*kernel_size + current_feature*(kernel_size*kernel_size);
708 EXPECT_TRUE(are_equal(val, interm_ptr[target_index]));
// Verify the cropped (3-feature) output by the inverse index mapping.
712 auto output = outputs.at("crop").get_memory();
713 auto output_ptr = output.pointer<float>();
714 auto output_size = outputs.at("crop").get_memory().count();
715 EXPECT_EQ(output_size,(size_t) (1 * (feature_size-1)*kernel_size*kernel_size));
717 for (target_index = 0; target_index < output_size; target_index++)
719 float output_val = output_ptr[target_index];
720 int current_x = target_index % kernel_size;
721 int current_y = (target_index / kernel_size) % kernel_size;
722 size_t current_feature = target_index / (kernel_size*kernel_size);
724 source_index = current_x*feature_size + current_y*(kernel_size*feature_size) + current_feature;
725 EXPECT_TRUE(are_equal(output_val, testinput[source_index]));
// Checks yxfb -> bfyx reorder through an intermediate reorder that adds
// input padding ({0,0,1,2}); the second reorder must strip the padding and
// still produce the correct dense result.
730 TEST(reorder_gpu_f32, basic_yxfb_to_bfyx_input_padding)
732 // Input : yxfb:2x2x2x2
733 // Output : bfyx:2x2x2x2
749 // f0: b0: 1 2 b1: 0 0
750 // f0: b0: 3 4 b1: 0.5 -0.5
751 // f1: b0: 5 6 b1: 1.5 5.2
752 // f1: b0: 7 8 b1: 12 8
754 const auto& engine = get_test_engine();
756 auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } });
757 layout output_layout(data_types::f32, format::bfyx, { 2,2,2,2 });
774 input_layout("input", input.get_layout()),
// First reorder keeps layout but attaches output padding of 1x2 spatial.
775 reorder("reorder", "input", input.get_layout().format, input.get_layout().data_type, "", cldnn_reorder_mean_mode::mean_subtract, { { 0, 0, 1, 2 }, 0 }),
776 reorder("reorder2", "reorder", output_layout));
778 network network(engine, topology);
779 network.set_input_data("input", input);
781 auto outputs = network.execute();
782 EXPECT_EQ(outputs.size(), size_t(1));
783 EXPECT_EQ(outputs.begin()->first, "reorder2");
785 auto output = outputs.begin()->second.get_memory();
787 float answers[16] = {
800 auto output_ptr = output.pointer<float>();
801 for (int i = 0; i < 16; i++)
803 EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
// Mirror of the previous test: bfyx -> yxfb reorder with an intermediate
// padded reorder ({0,0,2,1} spatial padding) that must be handled correctly.
808 TEST(reorder_gpu_f32, basic_bfyx_to_yxfb_input_padding)
810 // Input : bfyx:2x2x2x2
811 // Output : yxfb:2x2x2x2
814 // f0: b0: 1 2 b1: 0 0
815 // f0: b0: 3 4 b1: 0.5 -0.5
816 // f1: b0: 5 6 b1: 1.5 5.2
817 // f1: b0: 7 8 b1: 12 8
833 const auto& engine = get_test_engine();
835 auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } });
836 layout output_layout(data_types::f32, format::yxfb, { 2,2,2,2 });
853 input_layout("input", input.get_layout()),
854 reorder("reorder", "input", input.get_layout().format, input.get_layout().data_type, "", cldnn_reorder_mean_mode::mean_subtract, { { 0, 0, 2, 1 }, 0 }),
855 reorder("reorder2", "reorder", output_layout));
857 network network(engine, topology);
858 network.set_input_data("input", input);
860 auto outputs = network.execute();
861 EXPECT_EQ(outputs.size(), size_t(1));
862 EXPECT_EQ(outputs.begin()->first, "reorder2");
864 auto output = outputs.begin()->second.get_memory();
866 float answers[16] = {
// `out` collects the values for debugging; the EXPECT does the checking.
879 std::vector<float> out;
880 auto output_ptr = output.pointer<float>();
881 for (int i = 0; i < 16; i++)
883 out.push_back(output_ptr[i]);
884 EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
// With optimize_data enabled, a reorder to the same layout ("r1") is
// redundant and must be removed from the executed graph, while the final
// layout-changing reorder ("r2") is kept and produces yxfb output.
889 TEST(reorder_gpu_opt, basic_remove_redundant)
893 memory in = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
895 input_layout("in", in.get_layout()),
896 reorder("r1", "in", format::bfyx, data_types::f32),
897 reorder("r2", "r1", format::yxfb, data_types::f32)
901 opts.set_option(build_option::optimize_data(true));
903 network net(eng, tpl, opts);
904 net.set_input_data("in", in);
905 auto outputs = net.execute();
906 auto executed_primitives = net.get_executed_primitives();
// r1 must have been optimized out; r2 remains and defines the output layout.
908 EXPECT_TRUE(executed_primitives.count("r1") == 0);
909 ASSERT_TRUE(outputs.count("r2") == 1);
910 EXPECT_TRUE(outputs.at("r2").get_memory().get_layout().format == format::yxfb);
// Checks that removing a redundant reorder does not lose the activation
// fused after it: leaky-ReLU(slope 0.01) of -1 scaled by 2 must give -0.02.
913 TEST(reorder_gpu_opt, remove_redundant_activation_fuse)
917 memory in = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 1, 2, 1 } });
918 set_values(in, { -1.0f, -1.0f });
919 memory scale_mem = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{1, 1, 1, 1 } });
920 set_values(scale_mem, { 2.0f });
922 input_layout("in", in.get_layout()),
// Same-layout reorder — candidate for removal by optimize_data.
923 reorder("r1", "in", format::bfyx, data_types::f32),
924 activation("relu", "r1", cldnn_activation_func::activation_relu_negative_slope, {0.01f, 0.0f}),
925 data("scale_data", scale_mem),
926 scale("output", "relu", "scale_data")
930 opts.set_option(build_option::optimize_data(true));
932 network net(eng, tpl, opts);
933 net.set_input_data("in", in);
934 auto outputs = net.execute();
935 auto out_ptr = outputs.begin()->second.get_memory().pointer<float>();
// -1 * 0.01 (leaky slope) * 2 (scale) == -0.02 for both elements.
936 EXPECT_FLOAT_EQ(out_ptr[0], -0.02f);
937 EXPECT_FLOAT_EQ(out_ptr[1], -0.02f);
// A reorder that is the network output must NOT be optimized away even when
// optimize_data is on — all primitives here are expected to execute.
940 TEST(reorder_gpu_opt, basic_do_not_remove_redundant_due_it_is_output)
944 memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 2, 2, 1 } });
945 memory weights = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
947 input_layout("in", in.get_layout()),
948 convolution("conv", "in", { "weights" }),
949 data("weights", weights),
950 reorder("r1", "conv", format::bfyx, data_types::f32) //reoder is output - do not optimize
954 opts.set_option(build_option::optimize_data(true));
956 network net(eng, tpl, opts);
957 net.set_input_data("in", in);
958 auto outputs = net.execute();
959 auto executed_primitives = net.get_executed_primitives();
961 //all pirmitives in this test needs to be executed
962 EXPECT_TRUE(executed_primitives.count("conv") == 1);
963 EXPECT_TRUE(executed_primitives.count("in") == 1);
964 EXPECT_TRUE(executed_primitives.count("r1") == 1);
965 ASSERT_TRUE(outputs.count("r1") == 1);
966 EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx);
// When the output reorder is requested via build_option::outputs (not by
// being a graph leaf), optimize_data may fold it into conv's implicit output
// reorder — querying "r1" must still return valid bfyx memory.
969 TEST(reorder_gpu_opt, basic_remove_redundant_output_due_to_implicit_reorders)
973 memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 2, 2, 1 } });
974 memory weights = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
976 input_layout("in", in.get_layout()),
977 convolution("conv", "in",{ "weights" }),
978 data("weights", weights),
979 reorder("r1", "conv", format::bfyx, data_types::f32) //optimize data should add conversion from yxfb to bfyx and 'conv' should output data in bfyx as well (IE case)
984 //we need to check if r1 will be successfully opimized and still we should be able to query for r1's output which should point to conv's output (note conv cannot be marked as output in this case)
985 opts.set_option(build_option::outputs({ "r1" }));
986 opts.set_option(build_option::optimize_data(true));
988 network net(eng, tpl, opts);
989 net.set_input_data("in", in);
990 auto outputs = net.execute();
992 EXPECT_TRUE(outputs.count("conv") == 0);
993 ASSERT_TRUE(outputs.count("r1") == 1);
994 EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx);
// Same as above but with a softmax consumer after the reorder: "r1" becomes
// redundant (conv can output bfyx directly) and must not be executed.
997 TEST(reorder_gpu_opt, basic_remove_redundant_due_to_implicit_reorders)
1001 memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 2, 2, 1 } });
1002 memory weights = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
1004 input_layout("in", in.get_layout()),
1005 convolution("conv", "in",{ "weights" }),
1006 data("weights", weights),
1007 reorder("r1", "conv", format::bfyx, data_types::f32), //optimize data should add conversion from yxfb to bfyx and 'conv' should output data in bfyx as well (IE case)
1008 softmax("output", "r1")
1012 opts.set_option(build_option::optimize_data(true));
1014 network net(eng, tpl, opts);
1015 net.set_input_data("in", in);
1016 auto outputs = net.execute();
1017 auto executed_primitives = net.get_executed_primitives();
1019 //remove redundant reorder optimization should remove r1 node
1020 EXPECT_TRUE(executed_primitives.count("r1") == 0);
1021 //all pirmitives in this test needs to be executed
1022 ASSERT_TRUE(outputs.count("output") == 1);
1023 EXPECT_TRUE(outputs.at("output").get_memory().get_layout().format == format::bfyx);
// "Non-trivial" removal: a yxfb 1x1x5x2 -> bfyx reorder of an input can be
// optimized even though formats differ (same linear order for this shape);
// the output event must differ from the input's event.
1026 TEST(reorder_gpu_opt, non_trivial_remove_redundant)
1030 memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 1, 5, 2 } });
1032 input_layout("in", in.get_layout()),
1033 reorder("r1", "in", format::bfyx, data_types::f32)
1038 opts.set_option(build_option::optimize_data(true));
1040 network net(eng, tpl, opts);
1041 net.set_input_data("in", in);
1042 auto outputs = net.execute();
1043 auto executed_primitives = net.get_executed_primitives();
1044 auto all_primitives = net.get_all_primitives();
1046 ASSERT_TRUE(executed_primitives.count("in") == 1);
1047 //ASSERT_TRUE(all_primitives.at("r1") == "_optimized_");
1048 EXPECT_TRUE(executed_primitives.at("in") != outputs.at("r1").get_event());
1049 ASSERT_TRUE(outputs.count("r1") == 1);
1050 EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx);
// Checks reorder mean mode mean_mul: i8 input is multiplied element-wise by
// the f32 "mul" data primitive during i8 -> f32 conversion.
1054 TEST(reorder_gpu_opt, mean_mul)
1058 memory in = memory::allocate(eng, { data_types::i8, format::bfyx, tensor{ 1, 3, 1, 2 } });
1059 memory mul = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{1, 3, 1, 2 } });
1061 set_values<char>(in,
1065 set_values<float>(mul,
1066 { 0.5f, 2.5f, -5.0f, 4.3f, 1.2f, -3.5f });
1069 input_layout("in", in.get_layout()),
1071 reorder("r1", "in", format::bfyx, data_types::f32,"mul", cldnn_reorder_mean_mode::mean_mul)
// Expected: input[i] * mul[i] (input initializer not visible here).
1074 float answers[] = { 0.5f, 5.0f, -15.0f, 17.2f, 6.0f, -21.0f };
1076 opts.set_option(build_option::optimize_data(true));
1077 network net(eng, tpl, opts);
1078 net.set_input_data("in", in);
1080 auto outputs = net.execute();
1081 auto output = outputs.begin()->second.get_memory();
1082 auto ptr = output.pointer<float>();
1083 float* a_ptr = answers;
1084 for (auto& val : ptr)
1085 EXPECT_FLOAT_EQ(*(a_ptr++), val);;
// Checks reorder mean mode mean_div: i8 input is divided element-wise by
// the f32 "mul" data primitive during i8 -> f32 conversion.
1090 TEST(reorder_gpu_opt, mean_div)
1094 memory in = memory::allocate(eng, { data_types::i8, format::bfyx, tensor{ 1, 3, 1, 2 } });
1095 memory mul = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 3, 1, 2 } });
1097 set_values<char>(in,
1101 set_values<float>(mul,
1102 { 0.5f, 2.0f, -3.0f, 8.0f, 1.25f, -3.0f });
1105 input_layout("in", in.get_layout()),
1107 reorder("r1", "in", format::bfyx, data_types::f32,"mul", cldnn_reorder_mean_mode::mean_div)
// Expected: input[i] / mul[i] (input initializer not visible here).
1110 float answers[] = { 2.0f, 1.0f, -1.0f, 0.5f, 4.0f, -2.0f };
1112 opts.set_option(build_option::optimize_data(true));
1113 network net(eng, tpl, opts);
1114 net.set_input_data("in", in);
1116 auto outputs = net.execute();
1117 auto output = outputs.begin()->second.get_memory();
1118 auto ptr = output.pointer<float>();
1119 float* a_ptr = answers;
1120 for (auto& val : ptr)
1121 EXPECT_FLOAT_EQ(*(a_ptr++), val);;
// Checks mean_mul with per-feature scalar values (one multiplier per
// feature channel) instead of a mean data primitive.
1126 TEST(reorder_gpu_opt, mean_mul_val)
1130 memory in = memory::allocate(eng, { data_types::i8, format::bfyx, tensor{ 1, 3, 1, 2 } });
1132 set_values<char>(in,
// One multiplier per feature (3 features, 2 spatial elements each).
1136 std::vector<float> mul_val = { 2.0f, 0.5f, 10.0f };
1138 input_layout("in", in.get_layout()),
1139 reorder("r1", "in", format::bfyx, data_types::f32, mul_val, cldnn_reorder_mean_mode::mean_mul)
1142 float answers[] = { 2.0f, 4.0f, 1.5f, 2.0f, 50.0f, 600.0f };
1144 opts.set_option(build_option::optimize_data(true));
1145 network net(eng, tpl, opts);
1146 net.set_input_data("in", in);
1148 auto outputs = net.execute();
1149 auto output = outputs.begin()->second.get_memory();
1150 auto ptr = output.pointer<float>();
1151 float* a_ptr = answers;
1152 for (auto& val : ptr)
1153 EXPECT_FLOAT_EQ(*(a_ptr++), val);;
// Checks mean_mul with f32 -> i8 output conversion: results are rounded
// and saturated to the int8 range (note the 127 in the expected values).
1157 TEST(reorder_gpu_opt, mean_mul_val_float_to_int)
1161 memory in = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 3, 1, 2 } });
1163 set_values<float>(in,
1167 std::vector<float> mul_val = { 1.4f, 0.5f, 5.0f };
1169 input_layout("in", in.get_layout()),
1170 reorder("r1", "in", format::bfyx, data_types::i8, mul_val, cldnn_reorder_mean_mode::mean_mul)
// 127 indicates saturation at the int8 maximum for the last element.
1173 char answers[] = { 0, 2, 1, 2, 25, 127 };
1175 opts.set_option(build_option::optimize_data(true));
1176 network net(eng, tpl, opts);
1177 net.set_input_data("in", in);
1179 auto outputs = net.execute();
1180 auto output = outputs.begin()->second.get_memory();
1181 auto ptr = output.pointer<char>();
1182 char* a_ptr = answers;
1183 for (auto& val : ptr)
1184 EXPECT_EQ(*(a_ptr++), val);
// Checks data type conversion f32 -> i32 through a reorder (layout kept bfyx).
1187 TEST(reorder_gpu_i32, basic)
1189 // Test for converting data types f32->i32
1190 const auto& engine = get_test_engine();
1192 auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } });
1193 layout output_layout(data_types::i32, format::bfyx, { 2,2,2,2 });
// Input includes fractional values to exercise float -> int conversion.
1196 1.f, 0.f, 5.f, 1.5f,
1197 2.f, 0.f, 6.f, 5.2f,
1198 3.f, 0.5f, 7.f, 12.f,
1199 4.f, -0.5f, 8.f, 8.f
1203 input_layout("input", input.get_layout()),
1204 reorder("reorder", "input", output_layout));
1206 network network(engine, topology);
1207 network.set_input_data("input", input);
1209 auto outputs = network.execute();
1210 EXPECT_EQ(outputs.size(), size_t(1));
1211 EXPECT_EQ(outputs.begin()->first, "reorder");
1213 auto output = outputs.begin()->second.get_memory();
// Expected integer results (initializer not visible in this extraction).
1215 int32_t answers[16] = {
1222 int32_t* a_ptr = answers;
1223 auto output_ptr = output.pointer<int32_t>();
1224 for (auto& val : output_ptr)
1225 EXPECT_EQ(*(a_ptr++), val);
// Checks data type conversion f32 -> i64 through a reorder (layout kept bfyx).
1228 TEST(reorder_gpu_i64, basic)
1230 // Test for converting data types f32->i64
1231 const auto& engine = get_test_engine();
1233 auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } });
1234 layout output_layout(data_types::i64, format::bfyx, { 2,2,2,2 });
// Same input data as the i32 test, converting to 64-bit integers instead.
1237 1.f, 0.f, 5.f, 1.5f,
1238 2.f, 0.f, 6.f, 5.2f,
1239 3.f, 0.5f, 7.f, 12.f,
1240 4.f, -0.5f, 8.f, 8.f
1244 input_layout("input", input.get_layout()),
1245 reorder("reorder", "input", output_layout));
1247 network network(engine, topology);
1248 network.set_input_data("input", input);
1250 auto outputs = network.execute();
1251 EXPECT_EQ(outputs.size(), size_t(1));
1252 EXPECT_EQ(outputs.begin()->first, "reorder");
1254 auto output = outputs.begin()->second.get_memory();
1256 int64_t answers[16] = {
1263 int64_t* a_ptr = answers;
1264 auto output_ptr = output.pointer<int64_t>();
1265 for (auto& val : output_ptr)
1266 EXPECT_EQ(*(a_ptr++), val);
1269 using namespace cldnn;
// Parameterized generic-test fixture for the reorder primitive: generates
// (test_params, reorder primitive) pairs over all supported data types and
// formats, and provides a CPU reference implementation for comparison.
1271 class reorder_test : public tests::generic_test
// Frees the statically-held parameter/primitive objects after the suite runs.
1276 static void TearDownTestCase()
1278 for (auto generic_params : all_generic_params)
1280 delete generic_params;
1282 for (auto test_param : all_test_params)
1284 auto primitive = std::get<1>(test_param);
// Builds the cross product of generic params x data types x formats; each
// combination yields a reorder with the same input tensor reinterpreted
// into the target layout. Subtraction is not exercised (empty vector).
1290 static std::vector<std::tuple<test_params*, cldnn::primitive*>> generate_specific_test_params()
1292 generic_test::generate_generic_test_params(all_generic_params);
1294 const auto data_types = test_data_types();
1296 for (const auto& test_param : all_generic_params)
1298 cldnn::tensor input_tensor = test_param->input_layouts[0].size;
1300 std::vector<cldnn::layout> output_layouts = {};
1302 for (const auto& dt : data_types)
1304 for (const auto& fmt : generic_test::test_input_formats)
1306 output_layouts.push_back({ dt, fmt, input_tensor });
1309 // TODO: check unsupported formats.
1311 //TODO: check subtract.
1312 std::vector<float> subtract = {};
1314 for (const auto& output_layout : output_layouts)
1316 //TODO: check input + output padding.
1317 all_test_params.push_back(std::make_tuple(test_param, new reorder("reorder", "input0", output_layout, subtract)));
1322 return all_test_params;
// Restricts the generic harness to the four 4D formats this test handles.
1325 virtual bool is_format_supported(cldnn::format format)
1327 return ( (format == cldnn_format_type::cldnn_format_yxfb) ||
1328 (format == cldnn_format_type::cldnn_format_byxf) ||
1329 (format == cldnn_format_type::cldnn_format_bfyx) ||
1330 (format == cldnn_format_type::cldnn_format_fyxb)
// Reference implementation: element-wise copy with type conversion only.
// Layout/order correctness is verified separately by generic_test.
1334 template<typename InputType, typename OutputType>
1335 memory generate_reference_typed(const std::vector<cldnn::memory>& inputs)
1337 const cldnn::reorder* reorder = (cldnn::reorder*)layer_params;
1338 primitive_id mean = reorder->mean;
1339 std::vector<float> subtract_per_feature = reorder->subtract_per_feature;
// Subtraction is intentionally not covered by this reference (see TODO above).
1341 assert(subtract_per_feature.size() == 0);
1343 auto output = memory::allocate(engine, cldnn::layout(*reorder->output_data_type, inputs[0].get_layout().format, inputs[0].get_layout().size));
1345 cldnn::pointer<InputType> input_mem = inputs[0].pointer<InputType>();
1346 cldnn::pointer<OutputType> output_mem = output.pointer<OutputType>();
1348 for (size_t i = 0; i < inputs[0].get_layout().get_linear_size(); i++)
1350 // Write the output in the same order as the input with type conversion as needed.
1351 // The correct order will be checked in generic_test::compare_buffers.
1352 output_mem[i] = (OutputType)input_mem[i];
// Dispatches to the correct typed reference by (input, output) data type;
// only f32 and f16 combinations are supported here.
1358 virtual memory generate_reference(const std::vector<cldnn::memory>& inputs)
1360 if (generic_params->data_type == data_types::f32)
1362 if (*layer_params->output_data_type == data_types::f32)
1364 return generate_reference_typed<float, float>(inputs);
1368 return generate_reference_typed<float, FLOAT16>(inputs);
1373 if (*layer_params->output_data_type == data_types::f32)
1375 return generate_reference_typed<FLOAT16, float>(inputs);
1379 return generate_reference_typed<FLOAT16, FLOAT16>(inputs);
// Owned statically so TearDownTestCase can release them (raw pointers kept
// for compatibility with the generic_test harness).
1386 static std::vector<tests::test_params*> all_generic_params;
1387 static std::vector<std::tuple<test_params*, cldnn::primitive*>> all_test_params;
1391 std::vector<tests::test_params*> reorder_test::all_generic_params = {};
1392 std::vector<std::tuple<test_params*, cldnn::primitive*>> reorder_test::all_test_params = {};
// Instantiates the parameterized suite over all generated combinations.
// The DISABLED_ prefix keeps it out of default runs; enable explicitly
// with --gtest_also_run_disabled_tests.
1394 TEST_P(reorder_test, REORDER)
1399 INSTANTIATE_TEST_CASE_P(DISABLED_REORDER,
1401 ::testing::ValuesIn(reorder_test::generate_specific_test_params()),
1402 tests::generic_test::custom_param_name_functor());