inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 ///////////////////////////////////////////////////////////////////////////////////////////////////
  18 #include <gtest/gtest.h>
  19 #include "api/CPP/memory.hpp"
  20 #include <api/CPP/input_layout.hpp>
  21 #include "api/CPP/reorder.hpp"
  22 #include "api/CPP/crop.hpp"
  23 #include <api/CPP/topology.hpp>
  24 #include <api/CPP/network.hpp>
  25 #include <api/CPP/engine.hpp>
  26 #include "test_utils/test_utils.h"
  27 #include <api/CPP/data.hpp>
  28
  29 #include <cmath>
  30 #include <gmock/gmock.h>
  31 #include <limits>
  32
  33 using namespace cldnn;
  34 using namespace tests;
  35 using namespace testing;
  36
  37 TEST(reorder_gpu_f32, basic)
  38 {
  39     //  Input               : yxfb:2x2x2x2
  40     //  Output              : bfyx:2x2x2x2
  41     //
  42     //  Input:
  43     //  f0: b0:  1    2  b1:   0    0
  44     //  f0: b0:  3    4  b1:   0.5 -0.5
  45     //  f1: b0:  5    6  b1:   1.5  5.2
  46     //  f1: b0:  7    8  b1:   12   8
  47     //
  48     //  Output:
  49     //  b0 f0:  1    2
  50     //  b0 f0:  3    4
  51     //
  52     //  b0 f1:  5    6
  53     //  b0 f1:  7    8
  54     //
  55     //  b1 f0:  0    0
  56     //  b1 f0: 0.5 -0.5
  57     //
  58     //  b1 f1: 1.5  5.2
  59     //  b1 f1: 12    8
  60     //
  61
  62     const auto& engine = get_test_engine();
  63
  64     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } });
  65     layout output_layout(data_types::f32, format::bfyx,{ 2,2,2,2 });
  66
  67     set_values(input, {
  68         1.f, 0.f,
  69         5.f, 1.5f,
  70
  71         2.f, 0.f,
  72         6.f, 5.2f,
  73
  74         3.f, 0.5f,
  75         7.f, 12.f,
  76
  77         4.f, -0.5f,
  78         8.f, 8.f
  79     });
  80
  81     topology topology(
  82         input_layout("input", input.get_layout()),
  83         reorder("reorder", "input", output_layout));
  84
  85     network network(engine, topology);
  86     network.set_input_data("input", input);
  87
  88     auto outputs = network.execute();
  89     EXPECT_EQ(outputs.size(), size_t(1));
  90     EXPECT_EQ(outputs.begin()->first, "reorder");
  91
  92     auto output = outputs.begin()->second.get_memory();
  93
  94     float answers[16] = {
  95         1.0f,  2.0f,
  96         3.0f,  4.0f,
  97
  98         5.0f,  6.0f,
  99         7.0f,  8.0f,
 100
 101         0.0f,  0.0f,
 102         0.5f, -0.5f,
 103
 104         1.5f,  5.2f,
 105         12.0f, 8.0f
 106     };
 107
 108     auto output_ptr = output.pointer<float>();
 109     for (int i = 0; i < 16; i++)
 110     {
 111         EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
 112     }
 113
 114 }
 115
 116 TEST(reorder_gpu_f32, basic_subtract) {
 117     //  Input               : 2x2x2x2
 118     //  Output              : 2x2x2x2
 119     //  Subtract            : 1x2x2x2 (only first batch is taken into consideration)
 120     //
 121     //  Input:
 122     //  f0: b0:  1    2  b1:   0    0
 123     //  f0: b0:  3    4  b1:   0.5 -0.5
 124     //  f1: b0:  5    6  b1:   1.5  5.2
 125     //  f1: b0:  7    8  b1:   12   8
 126     //
 127     //  Subtract:
 128     //  f0: b0:  1    1.5
 129     //  f0: b0:  2    2.5
 130     //  f1: b0:  4    3
 131     //  f1: b0:  2    1
 132     //
 133     //
 134     //  Output:
 135     //  b0 f0:  0    0.5
 136     //  b0 f0:  1    1.5
 137     //
 138     //  b0 f1:  1    3
 139     //  b0 f1:  5    7
 140     //
 141     //  b1 f0: -1   -1.5
 142     //  b1 f0: -1.5 -3
 143     //
 144     //  b1 f1: -2.5  2.2
 145     //  b1 f1: 10    7
 146     //
 147
 148     const auto& engine = get_test_engine();
 149
 150     auto input = memory::allocate(engine, { data_types::f32,  format::yxfb, { 2, 2, 2, 2 } });
 151     layout output_layout( data_types::f32, format::bfyx, {2,2,2,2} );
 152     auto subtract = memory::allocate(engine, { data_types::f32, format::byxf, { 1, 2, 2, 2 } });
 153
 154     set_values(input, {
 155         1.f, 0.f,
 156         5.f, 1.5f,
 157
 158         2.f, 0.f,
 159         6.f, 5.2f,
 160
 161         3.f, 0.5f,
 162         7.f, 12.f,
 163
 164         4.f, -0.5f,
 165         8.f, 8.f
 166     });
 167
 168     set_values(subtract, {
 169         1.0f,  4.0f,      1.5f,  3.0f,
 170         2.0f,  2.0f,      2.5f,  1.0f,
 171     });
 172
 173     topology topology(
 174         input_layout("input", input.get_layout()),
 175         input_layout("subtract", subtract.get_layout()),
 176         reorder("reorder", "input", output_layout, "subtract"));
 177
 178     network network(engine, topology);
 179     network.set_input_data("input", input);
 180     network.set_input_data("subtract", subtract);
 181
 182     auto outputs = network.execute();
 183     EXPECT_EQ(outputs.size(), size_t(1));
 184     EXPECT_EQ(outputs.begin()->first, "reorder");
 185
 186     auto output = outputs.begin()->second.get_memory();
 187
 188     float answers[16] = { 0.0f,  0.5f,
 189                           1.0f,  1.5f,
 190
 191                           1.0f,  3.0f,
 192                           5.0f,  7.0f,
 193
 194                          -1.0f, -1.5f,
 195                          -1.5f, -3.0f,
 196
 197                          -2.5f,  2.2f,
 198                          10.0f,  7.0f
 199     };
 200
 201     auto output_ptr = output.pointer<float>();
 202     for (int i = 0; i < 16; i++)
 203     {
 204         EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
 205     }
 206 }
 207
 208 TEST(reorder_gpu_f32, basic_subtract_value) {
 209     //  Values_to_subtract  : 2
 210     //  Input               : 2x2x2x2
 211     //  Output              : 2x2x2x2
 212     //
 213     //  Input:
 214     //  f0: b0:  1    2  b1:   0    0
 215     //  f0: b0:  3    4  b1:   0.5 -0.5
 216     //  f1: b0:  5    6  b1:   1.5  5.2
 217     //  f1: b0:  7    8  b1:   12   8
 218     //
 219     //  subtract values
 220     //  f0: 0.5
 221     //  f1: 2.5
 222     //
 223     //  Output:
 224     //  b0 f0:  0.5  1.5
 225     //  b0 f0:  2.5  3.5
 226     //
 227     //  b0 f1:  2.5  3.5
 228     //  b0 f1:  4.5  5.5
 229     //
 230     //  b1 f0: -0.5 -0.5
 231     //  b1 f0:  0.0 -1.0
 232     //
 233     //  b1 f1: -1.0  2.7
 234     //  b1 f1:  9.5  5.5
 235     //
 236
 237     const auto& engine = get_test_engine();
 238
 239     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } });
 240     layout output_layout(data_types::f32, format::bfyx,{ 2,2,2,2 });
 241     std::vector<float> subtract_val = { 0.5, 2.5 };
 242
 243     set_values(input, {
 244         1.f, 0.f,
 245         5.f, 1.5f,
 246
 247         2.f, 0.f,
 248         6.f, 5.2f,
 249
 250         3.f, 0.5f,
 251         7.f, 12.f,
 252
 253         4.f, -0.5f,
 254         8.f, 8.f
 255     });
 256
 257     topology topology;
 258     topology.add(input_layout("input", input.get_layout()), reorder("reorder", "input", output_layout, subtract_val));
 259
 260     network network(engine, topology);
 261     network.set_input_data("input", input);
 262
 263     auto outputs = network.execute();
 264     EXPECT_EQ(outputs.size(), size_t(1));
 265     EXPECT_EQ(outputs.begin()->first, "reorder");
 266
 267     auto output = outputs.begin()->second.get_memory();
 268
 269     float answers[16] = { 0.5f, 1.5f,
 270                           2.5f, 3.5f,
 271
 272                           2.5f, 3.5f,
 273                           4.5f, 5.5f,
 274
 275                          -0.5f, -0.5f,
 276                           0.0f, -1.0f,
 277
 278                          -1.0f,  2.7f,
 279                           9.5f,  5.5f
 280     };
 281
 282     auto output_ptr = output.pointer<float>();
 283     for (int i = 0; i < 16; i++)
 284     {
 285         EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
 286     }
 287 }
 288
 289 TEST(reorder_gpu_f16, basic_subtract_f32_output_f32) {
 290     //  Input               : 2x2x2x2 (FP16)
 291     //  Output              : 2x2x2x2 (FP32)
 292     //  Subtract            : 1x2x2x2 (FP32, only first batch is taken into consideration)
 293     //
 294     //  Input:
 295     //  f0: b0:  1    2  b1:   0    0
 296     //  f0: b0:  3    4  b1:   0.5 -0.5
 297     //  f1: b0:  5    6  b1:   1.5  5.2
 298     //  f1: b0:  7    8  b1:   12   8
 299     //
 300     //  Subtract (FP32 - converted internally to FP16 before subtraction):
 301     //  f0: b0:  1    1.5
 302     //  f0: b0:  2    2.5
 303     //  f1: b0:  4    3
 304     //  f1: b0:  2    1
 305     //
 306     //
 307     //  Output:
 308     //  b0 f0:  0    0.5
 309     //  b0 f0:  1    1.5
 310     //
 311     //  b0 f1:  1    3
 312     //  b0 f1:  5    7
 313     //
 314     //  b1 f0: -1   -1.5
 315     //  b1 f0: -1.5 -3
 316     //
 317     //  b1 f1: -2.5  2.2
 318     //  b1 f1: 10    7
 319     //
 320
 321     const auto& engine = get_test_engine();
 322
 323     if (!engine.get_info().supports_fp16)
 324     {
 325         std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
 326         EXPECT_EQ(1, 1);
 327         return;
 328     }
 329
 330     auto input = memory::allocate(engine, { data_types::f16, format::yxfb, { 2, 2, 2, 2 } });
 331     layout output_layout(data_types::f32, format::bfyx,{ 2,2,2,2 });
 332     auto subtract = memory::allocate(engine, { data_types::f32, format::byxf, { 1, 2, 2, 2 } });
 333
 334     set_values(input, {
 335         half_t(0x3C00), half_t(0x0000), // 1.f, 0.f,
 336         half_t(0x4500), half_t(0x3E00), // 5.f, 1.5f,
 337
 338         half_t(0x4000), half_t(0x0000), // 2.f, 0.f,
 339         half_t(0x4600), half_t(0x4533), // 6.f, 5.2f,
 340
 341         half_t(0x4200), half_t(0x3800), // 3.f, 0.5f,
 342         half_t(0x4700), half_t(0x4A00), // 7.f, 12.f,
 343
 344         half_t(0x4400), half_t(0xB800), // 4.f, -0.5f,
 345         half_t(0x4800), half_t(0x4800)  // 8.f, 8.f
 346     });
 347
 348     set_values(subtract, {
 349         1.0f,  4.0f,      1.5f,  3.0f,
 350         2.0f,  2.0f,      2.5f,  1.0f,
 351     });
 352
 353     topology topology;
 354     topology.add(input_layout("input", input.get_layout()));
 355     topology.add(data("subtract", subtract));
 356     topology.add(reorder("reorder", "input", output_layout, "subtract"));
 357
 358     network network(engine, topology);
 359     network.set_input_data("input", input);
 360
 361     auto outputs = network.execute();
 362     EXPECT_EQ(outputs.size(), size_t(1));
 363     EXPECT_EQ(outputs.begin()->first, "reorder");
 364
 365     auto output = outputs.begin()->second.get_memory();
 366
 367     float answers[16] = { 0.0f,  0.5f,
 368                           1.0f,  1.5f,
 369
 370                           1.0f,  3.0f,
 371                           5.0f,  7.0f,
 372
 373                          -1.0f, -1.5f,
 374                          -1.5f, -3.0f,
 375
 376                          -2.5f,  2.2f,
 377                          10.0f,  7.0f
 378     };
 379
 380     auto output_ptr = output.pointer<float>();
 381     for (int i = 0; i < 16; i++)
 382     {
 383         EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
 384     }
 385 }
 386
 387 TEST(reorder_gpu_f16, basic_subtract_value) {
 388     //  Values_to_subtract  : 2
 389     //  Input               : 2x2x2x2 (FP16)
 390     //  Output              : 2x2x2x2 (FP16)
 391     //
 392     //  Input:
 393     //  f0: b0:  1    2  b1:   0    0
 394     //  f0: b0:  3    4  b1:   0.5 -0.5
 395     //  f1: b0:  5    6  b1:   1.5  5.2
 396     //  f1: b0:  7    8  b1:   12   8
 397     //
 398     //  subtract values (FP32 - converted internally to FP16 before subtraction)
 399     //  f0: 0.5
 400     //  f1: 2.5
 401     //
 402     //  Output:
 403     //  b0 f0:  0.5  1.5
 404     //  b0 f0:  2.5  3.5
 405     //
 406     //  b0 f1:  2.5  3.5
 407     //  b0 f1:  4.5  5.5
 408     //
 409     //  b1 f0: -0.5 -0.5
 410     //  b1 f0:  0.0 -1.0
 411     //
 412     //  b1 f1: -1.0  2.7
 413     //  b1 f1:  9.5  5.5
 414     //
 415
 416     const auto& engine = get_test_engine();
 417     if (!engine.get_info().supports_fp16)
 418     {
 419         std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
 420         EXPECT_EQ(1, 1);
 421         return;
 422     }
 423
 424     auto input = memory::allocate(engine, { data_types::f16, format::yxfb, { 2, 2, 2, 2 } });
 425     layout output_layout(data_types::f16, format::bfyx,{ 2,2,2,2 });
 426     std::vector<float> subtract_val = { 0.5, 2.5 };
 427
 428     set_values(input, {
 429         half_t(0x3C00), half_t(0x0000), // 1.f, 0.f,
 430         half_t(0x4500), half_t(0x3E00), // 5.f, 1.5f,
 431
 432         half_t(0x4000), half_t(0x0000), // 2.f, 0.f,
 433         half_t(0x4600), half_t(0x4533), // 6.f, 5.2f,
 434
 435         half_t(0x4200), half_t(0x3800), // 3.f, 0.5f,
 436         half_t(0x4700), half_t(0x4A00), // 7.f, 12.f,
 437
 438         half_t(0x4400), half_t(0xB800), // 4.f, -0.5f,
 439         half_t(0x4800), half_t(0x4800)  // 8.f, 8.f
 440     });
 441
 442     topology topology;
 443     topology.add(input_layout("input", input.get_layout()));
 444     topology.add(reorder("reorder", "input", output_layout, subtract_val));
 445
 446     network network(engine, topology);
 447     network.set_input_data("input", input);
 448
 449     auto outputs = network.execute();
 450     EXPECT_EQ(outputs.size(), size_t(1));
 451     EXPECT_EQ(outputs.begin()->first, "reorder");
 452
 453     auto output = outputs.begin()->second.get_memory();
 454
 455     half_t answers[16] = { half_t(0x3800), half_t(0x3E00), //  0.5f, 1.5f,
 456                            half_t(0x4100), half_t(0x4300), //  2.5f, 3.5f,
 457
 458                            half_t(0x4100), half_t(0x4300), //  2.5f, 3.5f,
 459                            half_t(0x4480), half_t(0x4580), //  4.5f, 5.5f,
 460
 461                            half_t(0xB800), half_t(0xB800), // -0.5f, -0.5f,
 462                            half_t(0x0000), half_t(0xBC00), //  0.0f, -1.0f,
 463
 464                            half_t(0xBC00), half_t(0x4166), // -1.0f,  2.7f,
 465                            half_t(0x48C0), half_t(0x4580)  //  9.5f,  5.5f
 466     };
 467
 468     auto output_ptr = output.pointer<half_t>();
 469     for (int i = 0; i < 16; i++)
 470     {
 471         EXPECT_TRUE(are_equal(static_cast<uint16_t>(answers[i]), static_cast<uint16_t>(output_ptr[i])));
 472     }
 473 }
 474
 475 TEST(reorder_gpu, basic_convert_f16_f32_f16) {
 476     //  Converts entire unambiguous range of FP16 numbers to FP32 and back.
 477     //
 478     //  Input               : 2x2x15873x1 (FP16)
 479     //  Intermediate        : 1x2x2x15873 (FP32) {different mem format but the same ordering because batch is 1}
 480     //  Output              : 2x2x15673x1 (FP16)
 481     //
 482     //  Output is expected to contain the same value as input in range of indices from 0x0000 to 0xF801.
 483     //
 484
 485     const auto& engine = get_test_engine();
 486
 487     if (!engine.get_info().supports_fp16)
 488     {
 489         std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
 490         EXPECT_EQ(1, 1);
 491         return;
 492     }
 493
 494     std::vector<half_t> expected_values;
 495     expected_values.resize(0xF804);
 496     for (int i = 0; i < 0x7C00; ++i)
 497         expected_values[i] = half_t(i);          // norms/denorms/zero (positive).
 498     for (int i = 0x7C00; i < 0xF800; ++i)
 499         expected_values[i] = half_t(i + 0x0400); // norms/denorms (negative).
 500     expected_values[0x7C00] = half_t(0x0000);    // NOTE: do not do final test for negative 0 (-0).
 501     // Special values.
 502     expected_values[0xF800] = half_t(0x7C00);    // +infinity
 503     expected_values[0xF801] = half_t(0xFC00);    // -infinity
 504     // Special values (ambiguous ones).
 505     expected_values[0xF802] = half_t(0x8000);    // -0
 506     expected_values[0xF803] = half_t(0xFC12);    // A NaN (sample: -NaN.0x12).
 507
 508     auto input = memory::allocate(engine, { data_types::f16, format::yxfb, { 1, static_cast<int32_t>(expected_values.size()) / 4, 2, 2 } });
 509     layout interm_layout( data_types::f32, format::byxf, { 1, static_cast<int32_t>(expected_values.size()) / 4, 2, 2 });
 510     auto output_layout = input.get_layout();
 511
 512     set_values(input, expected_values);
 513
 514     topology topology;
 515     topology.add(input_layout("input", input.get_layout()));
 516     topology.add(reorder("reorder_f16_f32", "input", interm_layout));
 517     topology.add(reorder("reorder_f32_f16", "reorder_f16_f32", output_layout));
 518
 519     network network(
 520                                 engine,
 521                                 topology,
 522                                 {
 523                                     build_option::outputs({"reorder_f16_f32", "reorder_f32_f16"})
 524                                 });
 525
 526     network.set_input_data("input", input);
 527
 528     auto outputs = network.execute();
 529     EXPECT_EQ(outputs.size(), size_t(2));
 530     EXPECT_TRUE(outputs.find("reorder_f16_f32") != outputs.end());
 531     EXPECT_TRUE(outputs.find("reorder_f32_f16") != outputs.end());
 532
 533     auto interm = outputs.at("reorder_f16_f32").get_memory();
 534     auto interm_ptr = interm.pointer<float>();
 535
 536     // Sample positive.
 537     EXPECT_TRUE(are_equal(interm_ptr[0x3400], 0.25f));
 538     EXPECT_TRUE(are_equal(interm_ptr[0x3800], 0.5f));
 539     EXPECT_TRUE(are_equal(interm_ptr[0x3C00], 1.0f));
 540     EXPECT_TRUE(are_equal(interm_ptr[0x4000], 2.0f));
 541     EXPECT_TRUE(are_equal(interm_ptr[0x4400], 4.0f));
 542     // Sample negative.
 543     EXPECT_TRUE(are_equal(interm_ptr[0x3400 + 0x7C00], -0.25f));
 544     EXPECT_TRUE(are_equal(interm_ptr[0x3800 + 0x7C00], -0.5f));
 545     EXPECT_TRUE(are_equal(interm_ptr[0x3C00 + 0x7C00], -1.0f));
 546     EXPECT_TRUE(are_equal(interm_ptr[0x4000 + 0x7C00], -2.0f));
 547     EXPECT_TRUE(are_equal(interm_ptr[0x4400 + 0x7C00], -4.0f));
 548     // Special values.
 549     EXPECT_TRUE(are_equal(interm_ptr[0xF800], std::numeric_limits<float>::infinity()));
 550     EXPECT_TRUE(are_equal(interm_ptr[0xF801], -std::numeric_limits<float>::infinity()));
 551     EXPECT_TRUE(are_equal(interm_ptr[0xF802], -0.0f));
 552     EXPECT_TRUE(std::isnan(interm_ptr[0xF803]));
 553
 554     auto output = outputs.at("reorder_f32_f16").get_memory();
 555     auto output_ptr = output.pointer<half_t>();
 556     for (int i = 0; i < 0xF802; ++i) // NOTE: do not test for possibly ambiguous values of floating point (-0, NaNs).
 557     {
 558         EXPECT_TRUE(are_equal(static_cast<uint16_t>(expected_values[i]), static_cast<uint16_t>(output_ptr[i])));
 559     }
 560 }
 561
 562
 563 TEST(reorder_gpu, basic_convert_int8) {
 564
 565     const auto& engine = get_test_engine();
 566     layout in_layout = { type_to_data_type<float>::value,format::byxf,{ 1,1,3,3 } };
 567     layout byte_layout = { type_to_data_type<int8_t>::value, format::bfyx,{ 1,1,3,3 } };
 568     std::initializer_list<float> input_f = { 1.0f, -2.5f, 3.1f, -4.0f, 5.03f, -6.99f, 7.0f, -8.0f, 9.0f };
 569     std::list<float> final_results = { 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f, 9.0f };
 570
 571     // Allocate memory for input image.
 572     auto input_memory = memory::allocate(engine, in_layout);
 573     set_values(input_memory, input_f);
 574
 575     // Create input_layout description
 576     // "input" - is the primitive id inside topology
 577     input_layout input("input", in_layout);
 578
 579     topology topology(
 580         // 1. input layout primitive.
 581         input,
 582         // 2. reorder primitive with id "reorder_input"
 583         reorder("reorder_input",
 584             // input primitive for reorder (implicitly converted to primitive_id)
 585             input,
 586             // output layout for reorder
 587             byte_layout),
 588         reorder("reorder2", "reorder_input", in_layout)
 589     );
 590
 591     network network(
 592         engine,
 593         topology,
 594         {
 595             build_option::outputs({ "reorder2"})
 596         });
 597
 598     network.set_input_data("input", input_memory);
 599
 600     auto outputs = network.execute();
 601
 602     auto interm = outputs.at("reorder2").get_memory();
 603     auto interm_ptr = interm.pointer<float>();
 604     auto output_size = outputs.at("reorder2").get_memory().count();
 605     unsigned int cntr = 0;
 606     for (const auto& exp : final_results)
 607     {
 608         EXPECT_EQ(exp, interm_ptr[cntr++]);
 609     }
 610 }
 611
 612 TEST(reorder_gpu, basic_convert_uint8rgbabyxf_to_fp32_bfyx) {
 613         //  Converts an ARGB(uint8) image to common clDNN input of bfyx FP32
 614         //
 615         //  Input               : 1x5x5x4 (UINT8)
 616         //  Intermediate        : 1x4x5x5 (FP32) {different mem format and ordering}
 617         //  Output              : 1x3x5x5 (FP32) {using crop layer to reduce feature dimention and drop A from RGBA}
 618         //
 619         //  Output is expected to contain the same value as input
 620         //
 621         const int kernel_size = 5;
 622         const int feature_size = 4;
 623         const auto& engine = get_test_engine();
 624
 625         if (!engine.get_info().supports_fp16)
 626         {
 627                 std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
 628                 EXPECT_EQ(1, 1);
 629                 return;
 630         }
 631
 632         std::initializer_list<uint8_t> input_i8 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
 633                 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36,
 634                 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
 635                 155, 154, 153, 152, 151, 150, 149, 148, 147, 146, 145, 144, 143, 142, 141, 140, 139, 138, 137, 136,
 636                 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240, 239, 238, 237, 236
 637         };
 638
 639         layout in_layout = { type_to_data_type<uint8_t>::value,format::byxf,{ 1,4,kernel_size,kernel_size } };
 640         layout output_layout = { type_to_data_type<float>::value, format::bfyx, {1,4,kernel_size,kernel_size } };
 641
 642         // Allocate memory for input image.
 643         auto input_memory = memory::allocate(engine, in_layout);
 644         set_values(input_memory, input_i8);
 645
 646     // Create input_layout description
 647         // "input" - is the primitive id inside topology
 648         input_layout input("input", in_layout);
 649
 650         // Create topology object with 2 primitives
 651         topology topology(
 652                 // 1. input layout primitive.
 653                 input,
 654                 // 2. reorder primitive with id "reorder_input"
 655                 reorder("reorder_input",
 656                         // input primitive for reorder (implicitly converted to primitive_id)
 657                         input,
 658                         // output layout for reorder
 659                         output_layout)
 660         );
 661
 662         tensor crop_reference_input_tensor(spatial(kernel_size, kernel_size), batch(1), feature(4 - 1));
 663         tensor crop_offset_tensor(spatial(0, 0), batch(0), feature(0));
 664         padding output_padding = padding({ 0,0,0,0 }, { 0,0,0,0 }, 0);
 665         topology.add(
 666                 // cropping primitive with id "crop1"
 667                 crop("crop",
 668                         "reorder_input",    // primitive id of the cropping input
 669                         crop_reference_input_tensor,  // input tensor
 670                         crop_offset_tensor,    // bias primitive id
 671                         output_padding
 672                 )
 673         );
 674
 675         network network(
 676                 engine,
 677                 topology,
 678                 {
 679                         build_option::outputs({ "reorder_input", "crop" })
 680                 });
 681
 682     network.set_input_data("input", input_memory);
 683
 684     auto outputs = network.execute();
 685     EXPECT_EQ(outputs.size(), size_t(2));
 686     EXPECT_TRUE(outputs.find("reorder_input") != outputs.end());
 687     EXPECT_TRUE(outputs.find("crop") != outputs.end());
 688
 689     auto interm = outputs.at("reorder_input").get_memory();
 690     auto interm_ptr = interm.pointer<float>();
 691     auto interm_size = outputs.at("reorder_input").get_memory().count();
 692     EXPECT_EQ(interm_size,(size_t) (1*feature_size*kernel_size*kernel_size));
 693
 694     // Sample positive.
 695     EXPECT_TRUE(are_equal(interm_ptr[0], 1.0f));
 696     size_t source_index = 0;
 697     size_t target_index = 0;
 698     std::vector<uint8_t> testinput;// This will be used to direct access elements of test input in the next test
 699     for (auto it = input_i8.begin(); it < input_i8.end(); it++)
 700     {
 701
 702         uint8_t val = *it;
 703         testinput.push_back(val); // This will be used to direct access elements of test input in the next test
 704         size_t current_feature = source_index % feature_size;
 705         size_t current_x = (source_index / feature_size) % kernel_size;
 706         size_t current_y = (source_index / (feature_size * kernel_size));
 707         target_index = current_x + current_y*kernel_size + current_feature*(kernel_size*kernel_size);
 708         EXPECT_TRUE(are_equal(val, interm_ptr[target_index]));
 709         source_index++;
 710     }
 711
 712     auto output = outputs.at("crop").get_memory();
 713     auto output_ptr = output.pointer<float>();
 714     auto output_size = outputs.at("crop").get_memory().count();
 715     EXPECT_EQ(output_size,(size_t) (1 * (feature_size-1)*kernel_size*kernel_size));
 716
 717     for (target_index = 0; target_index < output_size; target_index++)
 718     {
 719         float output_val = output_ptr[target_index];
 720         int current_x = target_index % kernel_size;
 721         int current_y = (target_index / kernel_size) % kernel_size;
 722         size_t current_feature = target_index / (kernel_size*kernel_size);
 723
 724         source_index = current_x*feature_size + current_y*(kernel_size*feature_size) + current_feature;
 725         EXPECT_TRUE(are_equal(output_val, testinput[source_index]));
 726     }
 727
 728 }
 729
 730 TEST(reorder_gpu_f32, basic_yxfb_to_bfyx_input_padding)
 731 {
 732     //  Input               : yxfb:2x2x2x2
 733     //  Output              : bfyx:2x2x2x2
 734     //
 735     //  Input:
 736     //  b0 f0:  1    2
 737     //  b0 f0:  3    4
 738     //
 739     //  b0 f1:  5    6
 740     //  b0 f1:  7    8
 741     //
 742     //  b1 f0:  0    0
 743     //  b1 f0: 0.5 -0.5
 744     //
 745     //  b1 f1: 1.5  5.2
 746     //  b1 f1: 12    8
 747     //
 748     //  Output:
 749     //  f0: b0:  1    2  b1:   0    0
 750     //  f0: b0:  3    4  b1:   0.5 -0.5
 751     //  f1: b0:  5    6  b1:   1.5  5.2
 752     //  f1: b0:  7    8  b1:   12   8
 753
 754     const auto& engine = get_test_engine();
 755
 756     auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } });
 757     layout output_layout(data_types::f32, format::bfyx, { 2,2,2,2 });
 758
 759     set_values(input, {
 760         1.f, 0.f,
 761         5.f, 1.5f,
 762
 763         2.f, 0.f,
 764         6.f, 5.2f,
 765
 766         3.f, 0.5f,
 767         7.f, 12.f,
 768
 769         4.f, -0.5f,
 770         8.f, 8.f
 771     });
 772
 773     topology topology(
 774         input_layout("input", input.get_layout()),
 775         reorder("reorder", "input", input.get_layout().format, input.get_layout().data_type, "", cldnn_reorder_mean_mode::mean_subtract, { { 0, 0, 1, 2 }, 0 }),
 776         reorder("reorder2", "reorder", output_layout));
 777
 778     network network(engine, topology);
 779     network.set_input_data("input", input);
 780
 781     auto outputs = network.execute();
 782     EXPECT_EQ(outputs.size(), size_t(1));
 783     EXPECT_EQ(outputs.begin()->first, "reorder2");
 784
 785     auto output = outputs.begin()->second.get_memory();
 786
 787     float answers[16] = {
 788         1.0f,  2.0f,
 789         3.0f,  4.0f,
 790
 791         5.0f,  6.0f,
 792         7.0f,  8.0f,
 793
 794         0.0f,  0.0f,
 795         0.5f, -0.5f,
 796
 797         1.5f,  5.2f,
 798         12.0f, 8.0f
 799     };
 800     auto output_ptr = output.pointer<float>();
 801     for (int i = 0; i < 16; i++)
 802     {
 803         EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
 804     }
 805
 806 }
 807
 808 TEST(reorder_gpu_f32, basic_bfyx_to_yxfb_input_padding)
 809 {
 810     //  Input               : bfyx:2x2x2x2
 811     //  Output              : yxfb:2x2x2x2
 812     //
 813     //  Input:
 814     //  f0: b0:  1    2  b1:   0    0
 815     //  f0: b0:  3    4  b1:   0.5 -0.5
 816     //  f1: b0:  5    6  b1:   1.5  5.2
 817     //  f1: b0:  7    8  b1:   12   8
 818     //
 819     //  Output:
 820     //  b0 f0:  1    2
 821     //  b0 f0:  3    4
 822     //
 823     //  b0 f1:  5    6
 824     //  b0 f1:  7    8
 825     //
 826     //  b1 f0:  0    0
 827     //  b1 f0: 0.5 -0.5
 828     //
 829     //  b1 f1: 1.5  5.2
 830     //  b1 f1: 12    8
 831     //
 832
 833     const auto& engine = get_test_engine();
 834
 835     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } });
 836     layout output_layout(data_types::f32, format::yxfb, { 2,2,2,2 });
 837
 838     set_values(input, {
 839         1.0f,  2.0f,
 840         3.0f,  4.0f,
 841
 842         5.0f,  6.0f,
 843         7.0f,  8.0f,
 844
 845         0.0f,  0.0f,
 846         0.5f, -0.5f,
 847
 848         1.5f,  5.2f,
 849         12.0f, 8.0f
 850     });
 851
 852     topology topology(
 853         input_layout("input", input.get_layout()),
 854         reorder("reorder", "input", input.get_layout().format, input.get_layout().data_type, "", cldnn_reorder_mean_mode::mean_subtract, { { 0, 0, 2, 1 }, 0 }),
 855         reorder("reorder2", "reorder", output_layout));
 856
 857     network network(engine, topology);
 858     network.set_input_data("input", input);
 859
 860     auto outputs = network.execute();
 861     EXPECT_EQ(outputs.size(), size_t(1));
 862     EXPECT_EQ(outputs.begin()->first, "reorder2");
 863
 864     auto output = outputs.begin()->second.get_memory();
 865
 866     float answers[16] = {
 867         1.f, 0.f,
 868         5.f, 1.5f,
 869
 870         2.f, 0.f,
 871         6.f, 5.2f,
 872
 873         3.f, 0.5f,
 874         7.f, 12.f,
 875
 876         4.f, -0.5f,
 877         8.f, 8.f
 878     };
 879     std::vector<float> out;
 880     auto output_ptr = output.pointer<float>();
 881     for (int i = 0; i < 16; i++)
 882     {
 883         out.push_back(output_ptr[i]);
 884         EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
 885     }
 886
 887 }
 888
 889 TEST(reorder_gpu_opt, basic_remove_redundant)
 890 {
 891     engine eng;
 892
 893     memory in = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
 894     topology tpl{
 895         input_layout("in", in.get_layout()),
 896         reorder("r1", "in", format::bfyx, data_types::f32),
 897         reorder("r2", "r1", format::yxfb, data_types::f32)
 898     };
 899
 900     build_options opts;
 901     opts.set_option(build_option::optimize_data(true));
 902
 903     network net(eng, tpl, opts);
 904     net.set_input_data("in", in);
 905     auto outputs = net.execute();
 906     auto executed_primitives = net.get_executed_primitives();
 907
 908     EXPECT_TRUE(executed_primitives.count("r1") == 0);
 909     ASSERT_TRUE(outputs.count("r2") == 1);
 910     EXPECT_TRUE(outputs.at("r2").get_memory().get_layout().format == format::yxfb);
 911 }
 912
 913 TEST(reorder_gpu_opt, remove_redundant_activation_fuse)
 914 {
 915     engine eng;
 916
 917     memory in = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 1, 2, 1 } });
 918     set_values(in, { -1.0f, -1.0f });
 919     memory scale_mem = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{1, 1, 1, 1 } });
 920     set_values(scale_mem, { 2.0f });
 921     topology tpl{
 922         input_layout("in", in.get_layout()),
 923         reorder("r1", "in", format::bfyx, data_types::f32),
 924         activation("relu", "r1", cldnn_activation_func::activation_relu_negative_slope, {0.01f, 0.0f}),
 925         data("scale_data", scale_mem),
 926         scale("output", "relu", "scale_data")
 927     };
 928
 929     build_options opts;
 930     opts.set_option(build_option::optimize_data(true));
 931
 932     network net(eng, tpl, opts);
 933     net.set_input_data("in", in);
 934     auto outputs = net.execute();
 935     auto out_ptr = outputs.begin()->second.get_memory().pointer<float>();
 936     EXPECT_FLOAT_EQ(out_ptr[0], -0.02f);
 937     EXPECT_FLOAT_EQ(out_ptr[1], -0.02f);
 938 }
 939
 940 TEST(reorder_gpu_opt, basic_do_not_remove_redundant_due_it_is_output)
 941 {
 942     engine eng;
 943
 944     memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 2, 2, 1 } });
 945     memory weights = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
 946     topology tpl{
 947         input_layout("in", in.get_layout()),
 948         convolution("conv", "in", { "weights" }),
 949         data("weights", weights),
 950         reorder("r1", "conv", format::bfyx, data_types::f32) //reoder is output - do not optimize
 951     };
 952
 953     build_options opts;
 954     opts.set_option(build_option::optimize_data(true));
 955
 956     network net(eng, tpl, opts);
 957     net.set_input_data("in", in);
 958     auto outputs = net.execute();
 959     auto executed_primitives = net.get_executed_primitives();
 960
 961     //all pirmitives in this test needs to be executed
 962     EXPECT_TRUE(executed_primitives.count("conv") == 1);
 963     EXPECT_TRUE(executed_primitives.count("in") == 1);
 964     EXPECT_TRUE(executed_primitives.count("r1") == 1);
 965     ASSERT_TRUE(outputs.count("r1") == 1);
 966     EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx);
 967 }
 968
 969 TEST(reorder_gpu_opt, basic_remove_redundant_output_due_to_implicit_reorders)
 970 {
 971     engine eng;
 972
 973     memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 2, 2, 1 } });
 974     memory weights = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
 975     topology tpl{
 976         input_layout("in", in.get_layout()),
 977         convolution("conv", "in",{ "weights" }),
 978         data("weights", weights),
 979         reorder("r1", "conv", format::bfyx, data_types::f32) //optimize data should add conversion from yxfb to bfyx and 'conv' should output data in bfyx as well (IE case)
 980     };
 981
 982     build_options opts;
 983
 984     //we need to check if r1 will be successfully opimized and still we should be able to query for r1's output which should point to conv's output (note conv cannot be marked as output in this case)
 985     opts.set_option(build_option::outputs({ "r1" }));
 986     opts.set_option(build_option::optimize_data(true));
 987
 988     network net(eng, tpl, opts);
 989     net.set_input_data("in", in);
 990     auto outputs = net.execute();
 991
 992     EXPECT_TRUE(outputs.count("conv") == 0);
 993     ASSERT_TRUE(outputs.count("r1") == 1);
 994     EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx);
 995 }
 996
 997 TEST(reorder_gpu_opt, basic_remove_redundant_due_to_implicit_reorders)
 998 {
 999     engine eng;
1000
1001     memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 2, 2, 1 } });
1002     memory weights = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
1003     topology tpl{
1004         input_layout("in", in.get_layout()),
1005         convolution("conv", "in",{ "weights" }),
1006         data("weights", weights),
1007         reorder("r1", "conv", format::bfyx, data_types::f32), //optimize data should add conversion from yxfb to bfyx and 'conv' should output data in bfyx as well (IE case)
1008         softmax("output", "r1")
1009     };
1010
1011     build_options opts;
1012     opts.set_option(build_option::optimize_data(true));
1013
1014     network net(eng, tpl, opts);
1015     net.set_input_data("in", in);
1016     auto outputs = net.execute();
1017     auto executed_primitives = net.get_executed_primitives();
1018
1019     //remove redundant reorder optimization should remove r1 node
1020     EXPECT_TRUE(executed_primitives.count("r1") == 0);
1021     //all pirmitives in this test needs to be executed
1022     ASSERT_TRUE(outputs.count("output") == 1);
1023     EXPECT_TRUE(outputs.at("output").get_memory().get_layout().format == format::bfyx);
1024 }
1025
1026 TEST(reorder_gpu_opt, non_trivial_remove_redundant)
1027 {
1028     engine eng;
1029
1030     memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 1, 5, 2 } });
1031     topology tpl{
1032         input_layout("in", in.get_layout()),
1033         reorder("r1", "in", format::bfyx, data_types::f32)
1034     };
1035
1036     build_options opts;
1037
1038     opts.set_option(build_option::optimize_data(true));
1039
1040     network net(eng, tpl, opts);
1041     net.set_input_data("in", in);
1042     auto outputs = net.execute();
1043     auto executed_primitives = net.get_executed_primitives();
1044     auto all_primitives = net.get_all_primitives();
1045
1046     ASSERT_TRUE(executed_primitives.count("in") == 1);
1047     //ASSERT_TRUE(all_primitives.at("r1") == "_optimized_");
1048     EXPECT_TRUE(executed_primitives.at("in") != outputs.at("r1").get_event());
1049     ASSERT_TRUE(outputs.count("r1") == 1);
1050     EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx);
1051 }
1052
1053
1054 TEST(reorder_gpu_opt, mean_mul)
1055 {
1056     engine eng;
1057
1058     memory in  = memory::allocate(eng, { data_types::i8, format::bfyx, tensor{ 1, 3, 1, 2 } });
1059     memory mul = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{1, 3, 1, 2 } });
1060
1061     set_values<char>(in,
1062     { 1, 2,
1063       3, 4,
1064       5, 6 });
1065     set_values<float>(mul,
1066     { 0.5f, 2.5f, -5.0f, 4.3f, 1.2f, -3.5f });
1067
1068     topology tpl{
1069         input_layout("in", in.get_layout()),
1070         data("mul",mul),
1071         reorder("r1", "in", format::bfyx, data_types::f32,"mul", cldnn_reorder_mean_mode::mean_mul)
1072     };
1073
1074     float answers[] = { 0.5f, 5.0f, -15.0f, 17.2f, 6.0f, -21.0f };
1075     build_options opts;
1076     opts.set_option(build_option::optimize_data(true));
1077     network net(eng, tpl, opts);
1078     net.set_input_data("in", in);
1079
1080     auto outputs = net.execute();
1081     auto output = outputs.begin()->second.get_memory();
1082     auto ptr = output.pointer<float>();
1083     float* a_ptr = answers;
1084     for (auto& val : ptr)
1085         EXPECT_FLOAT_EQ(*(a_ptr++), val);;
1086
1087 }
1088
1089
1090 TEST(reorder_gpu_opt, mean_div)
1091 {
1092     engine eng;
1093
1094     memory in = memory::allocate(eng, { data_types::i8, format::bfyx, tensor{ 1, 3, 1, 2 } });
1095     memory mul = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 3, 1, 2 } });
1096
1097     set_values<char>(in,
1098     { 1, 2,
1099       3, 4,
1100       5, 6 });
1101     set_values<float>(mul,
1102     { 0.5f, 2.0f, -3.0f, 8.0f, 1.25f, -3.0f });
1103
1104     topology tpl{
1105         input_layout("in", in.get_layout()),
1106         data("mul",mul),
1107         reorder("r1", "in", format::bfyx, data_types::f32,"mul", cldnn_reorder_mean_mode::mean_div)
1108     };
1109
1110     float answers[] = { 2.0f, 1.0f, -1.0f, 0.5f, 4.0f, -2.0f };
1111     build_options opts;
1112     opts.set_option(build_option::optimize_data(true));
1113     network net(eng, tpl, opts);
1114     net.set_input_data("in", in);
1115
1116     auto outputs = net.execute();
1117     auto output = outputs.begin()->second.get_memory();
1118     auto ptr = output.pointer<float>();
1119     float* a_ptr = answers;
1120     for (auto& val : ptr)
1121         EXPECT_FLOAT_EQ(*(a_ptr++), val);;
1122
1123 }
1124
1125
1126 TEST(reorder_gpu_opt, mean_mul_val)
1127 {
1128     engine eng;
1129
1130     memory in = memory::allocate(eng, { data_types::i8, format::bfyx, tensor{ 1, 3, 1, 2 } });
1131
1132     set_values<char>(in,
1133     { 1, 2,
1134       3, 4,
1135       5, 60 });
1136     std::vector<float> mul_val = { 2.0f, 0.5f, 10.0f };
1137     topology tpl{
1138         input_layout("in", in.get_layout()),
1139         reorder("r1", "in", format::bfyx, data_types::f32, mul_val, cldnn_reorder_mean_mode::mean_mul)
1140     };
1141
1142     float answers[] = { 2.0f, 4.0f, 1.5f, 2.0f, 50.0f, 600.0f };
1143     build_options opts;
1144     opts.set_option(build_option::optimize_data(true));
1145     network net(eng, tpl, opts);
1146     net.set_input_data("in", in);
1147
1148     auto outputs = net.execute();
1149     auto output = outputs.begin()->second.get_memory();
1150     auto ptr = output.pointer<float>();
1151     float* a_ptr = answers;
1152     for (auto& val : ptr)
1153         EXPECT_FLOAT_EQ(*(a_ptr++), val);;
1154 }
1155
1156
1157 TEST(reorder_gpu_opt, mean_mul_val_float_to_int)
1158 {
1159     engine eng;
1160
1161     memory in = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 3, 1, 2 } });
1162
1163     set_values<float>(in,
1164     { 0.6f, 1.5f,
1165       3.0f, 4.2f,
1166       5.0f, 60.0f });
1167     std::vector<float> mul_val = { 1.4f, 0.5f, 5.0f };
1168     topology tpl{
1169         input_layout("in", in.get_layout()),
1170         reorder("r1", "in", format::bfyx, data_types::i8, mul_val, cldnn_reorder_mean_mode::mean_mul)
1171     };
1172
1173     char answers[] = { 0, 2, 1, 2, 25, 127 };
1174     build_options opts;
1175     opts.set_option(build_option::optimize_data(true));
1176     network net(eng, tpl, opts);
1177     net.set_input_data("in", in);
1178
1179     auto outputs = net.execute();
1180     auto output = outputs.begin()->second.get_memory();
1181     auto ptr = output.pointer<char>();
1182     char* a_ptr = answers;
1183     for (auto& val : ptr)
1184         EXPECT_EQ(*(a_ptr++), val);
1185 }
1186
1187 TEST(reorder_gpu_i32, basic)
1188 {
1189     //  Test for converting data types f32->i32
1190     const auto& engine = get_test_engine();
1191
1192     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } });
1193     layout output_layout(data_types::i32, format::bfyx, { 2,2,2,2 });
1194
1195     set_values(input, {
1196         1.f, 0.f, 5.f, 1.5f,
1197         2.f, 0.f, 6.f, 5.2f,
1198         3.f, 0.5f, 7.f, 12.f,
1199         4.f, -0.5f, 8.f, 8.f
1200     });
1201
1202     topology topology(
1203         input_layout("input", input.get_layout()),
1204         reorder("reorder", "input", output_layout));
1205
1206     network network(engine, topology);
1207     network.set_input_data("input", input);
1208
1209     auto outputs = network.execute();
1210     EXPECT_EQ(outputs.size(), size_t(1));
1211     EXPECT_EQ(outputs.begin()->first, "reorder");
1212
1213     auto output = outputs.begin()->second.get_memory();
1214
1215     int32_t answers[16] = {
1216         1, 0, 5, 1,
1217         2, 0, 6, 5,
1218         3, 0, 7, 12,
1219         4, 0, 8, 8
1220     };
1221
1222     int32_t* a_ptr = answers;
1223     auto output_ptr = output.pointer<int32_t>();
1224     for (auto& val : output_ptr)
1225         EXPECT_EQ(*(a_ptr++), val);
1226 }
1227
1228 TEST(reorder_gpu_i64, basic)
1229 {
1230     //  Test for converting data types f32->i64
1231     const auto& engine = get_test_engine();
1232
1233     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } });
1234     layout output_layout(data_types::i64, format::bfyx, { 2,2,2,2 });
1235
1236     set_values(input, {
1237         1.f, 0.f, 5.f, 1.5f,
1238         2.f, 0.f, 6.f, 5.2f,
1239         3.f, 0.5f, 7.f, 12.f,
1240         4.f, -0.5f, 8.f, 8.f
1241     });
1242
1243     topology topology(
1244         input_layout("input", input.get_layout()),
1245         reorder("reorder", "input", output_layout));
1246
1247     network network(engine, topology);
1248     network.set_input_data("input", input);
1249
1250     auto outputs = network.execute();
1251     EXPECT_EQ(outputs.size(), size_t(1));
1252     EXPECT_EQ(outputs.begin()->first, "reorder");
1253
1254     auto output = outputs.begin()->second.get_memory();
1255
1256     int64_t answers[16] = {
1257         1, 0, 5, 1,
1258         2, 0, 6, 5,
1259         3, 0, 7, 12,
1260         4, 0, 8, 8
1261     };
1262
1263     int64_t* a_ptr = answers;
1264     auto output_ptr = output.pointer<int64_t>();
1265     for (auto& val : output_ptr)
1266         EXPECT_EQ(*(a_ptr++), val);
1267 }
1268
1269 using namespace cldnn;
1270
1271 class reorder_test : public tests::generic_test
1272 {
1273
1274 public:
1275
1276     static void TearDownTestCase()
1277     {
1278         for (auto generic_params : all_generic_params)
1279         {
1280             delete generic_params;
1281         }
1282         for (auto test_param : all_test_params)
1283         {
1284             auto primitive = std::get<1>(test_param);
1285             delete primitive;
1286         }
1287     }
1288
1289
1290     static std::vector<std::tuple<test_params*, cldnn::primitive*>> generate_specific_test_params()
1291     {
1292         generic_test::generate_generic_test_params(all_generic_params);
1293
1294         const auto data_types = test_data_types();
1295
1296         for (const auto& test_param : all_generic_params)
1297         {
1298             cldnn::tensor input_tensor = test_param->input_layouts[0].size;
1299
1300             std::vector<cldnn::layout> output_layouts = {};
1301
1302             for (const auto& dt : data_types)
1303             {
1304                 for (const auto& fmt : generic_test::test_input_formats)
1305                 {
1306                     output_layouts.push_back({ dt, fmt, input_tensor });
1307                 }
1308             }
1309             // TODO: check unsupported formats.
1310
1311             //TODO: check subtract.
1312             std::vector<float> subtract = {};
1313
1314             for (const auto& output_layout : output_layouts)
1315             {
1316                 //TODO: check input + output padding.
1317                 all_test_params.push_back(std::make_tuple(test_param, new reorder("reorder", "input0", output_layout, subtract)));
1318
1319             }
1320         }
1321
1322         return all_test_params;
1323     }
1324
1325     virtual bool is_format_supported(cldnn::format format)
1326     {
1327         return (    (format == cldnn_format_type::cldnn_format_yxfb) ||
1328                     (format == cldnn_format_type::cldnn_format_byxf) ||
1329                     (format == cldnn_format_type::cldnn_format_bfyx) ||
1330                     (format == cldnn_format_type::cldnn_format_fyxb)
1331                 );
1332     }
1333
1334     template<typename InputType, typename OutputType>
1335     memory generate_reference_typed(const std::vector<cldnn::memory>& inputs)
1336     {
1337         const cldnn::reorder* reorder = (cldnn::reorder*)layer_params;
1338         primitive_id mean = reorder->mean;
1339         std::vector<float> subtract_per_feature = reorder->subtract_per_feature;
1340         assert(mean == "");
1341         assert(subtract_per_feature.size() == 0);
1342
1343         auto output = memory::allocate(engine, cldnn::layout(*reorder->output_data_type, inputs[0].get_layout().format, inputs[0].get_layout().size));
1344
1345         cldnn::pointer<InputType> input_mem = inputs[0].pointer<InputType>();
1346         cldnn::pointer<OutputType> output_mem = output.pointer<OutputType>();
1347
1348         for (size_t i = 0; i < inputs[0].get_layout().get_linear_size(); i++)
1349         {
1350             // Write the output in the same order as the input with type conversion as needed.
1351             // The correct order will be checked in generic_test::compare_buffers.
1352             output_mem[i] = (OutputType)input_mem[i];
1353         }
1354
1355         return output;
1356     }
1357
1358     virtual memory generate_reference(const std::vector<cldnn::memory>& inputs)
1359     {
1360         if (generic_params->data_type == data_types::f32)
1361         {
1362             if (*layer_params->output_data_type == data_types::f32)
1363             {
1364                 return generate_reference_typed<float, float>(inputs);
1365             }
1366             else
1367             {
1368                 return generate_reference_typed<float, FLOAT16>(inputs);
1369             }
1370         }
1371         else
1372         {
1373             if (*layer_params->output_data_type == data_types::f32)
1374             {
1375                 return generate_reference_typed<FLOAT16, float>(inputs);
1376             }
1377             else
1378             {
1379                 return generate_reference_typed<FLOAT16, FLOAT16>(inputs);
1380             }
1381         }
1382     }
1383
1384 private:
1385
1386     static std::vector<tests::test_params*> all_generic_params;
1387     static std::vector<std::tuple<test_params*, cldnn::primitive*>> all_test_params;
1388
1389 };
1390
1391 std::vector<tests::test_params*> reorder_test::all_generic_params = {};
1392 std::vector<std::tuple<test_params*, cldnn::primitive*>> reorder_test::all_test_params = {};
1393
1394 TEST_P(reorder_test, REORDER)
1395 {
1396     run_single_test();
1397 }
1398
1399 INSTANTIATE_TEST_CASE_P(DISABLED_REORDER,
1400                         reorder_test,
1401                         ::testing::ValuesIn(reorder_test::generate_specific_test_params()),
1402                         tests::generic_test::custom_param_name_functor());