inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp

   1 /*
   2 // Copyright (c) 2019 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 ///////////////////////////////////////////////////////////////////////////////////////////////////
  18 #include <gtest/gtest.h>
  19 #include "api/CPP/memory.hpp"
  20 #include <api/CPP/input_layout.hpp>
  21 #include "api/CPP/fully_connected.hpp"
  22 #include <api/CPP/topology.hpp>
  23 #include <api/CPP/tensor.hpp>
  24 #include <api/CPP/network.hpp>
  25 #include <api/CPP/engine.hpp>
  26 #include "test_utils/test_utils.h"
  27 #include <api/CPP/data.hpp>
  28 #include "instrumentation.h"
  29
  30 #include <cmath>
  31
  32 namespace cldnn
  33 {
  34     template<> struct type_to_data_type<FLOAT16> { static const data_types value = data_types::f16; };
  35 }
  36
  37 using namespace cldnn;
  38 using namespace tests;
  39
  40 cldnn::format::type layout_4d(cldnn::format f) {
  41     switch (f.value) {
  42     case cldnn::format::bfyx:
  43         return cldnn::format::bfyx;
  44     case cldnn::format::yxfb:
  45         return cldnn::format::yxfb;
  46     default:
  47         return f.value;
  48     }
  49 }
  50
  51 template <typename T>
  52 VVVVF<T> fully_connected_reference(VVVVF<T> &input, VVVVF<T> &weights, VF<T> &bias, bool relu = false, T slope = 0.0f) {
  53     size_t input_f = input[0].size();
  54     size_t input_y = input[0][0].size();
  55     size_t input_x = input[0][0][0].size();
  56     size_t output_b = input.size();        // input is assumed to be bfyx
  57     size_t output_f = weights.size();    // weights is assumed to be bfyx
  58     VVVVF<T> output(output_b, VVVF<T>(1, VVF<T>(1, VF<T>(output_f))));
  59     float res;
  60     for (size_t b = 0; b < output_b; ++b) {
  61         for (size_t n = 0; n < output_f; ++n) {
  62             res = bias[n];
  63             for (size_t f = 0; f < input_f; ++f) {
  64                 for (size_t y = 0; y < input_y; ++y) {
  65                     for (size_t x = 0; x < input_x; ++x) {
  66                         res += (float)input[b][f][y][x] * (float)weights[n][f][y][x];
  67                     }
  68                 }
  69             }
  70             if (relu && res < (float)0)
  71                 res *= (float)slope;
  72             output[b][0][0][n] = (T)res;
  73         }
  74     }
  75     return output;
  76 }
  77
  78 template <typename T>
  79 void generic_fully_connected_test(cldnn::format test_input_fmt, cldnn::format test_weights_fmt, int input_b, int f, int y, int x, int output_f, bool relu, T slope = 0) {
  80     int min_random = -2, max_random = 2;
  81     VVVVF<T> input_rnd = generate_random_4d<T>(input_b, f, y, x, min_random, max_random);
  82     VVVVF<T> weights_rnd = generate_random_4d<T>(output_f, f, y, x, min_random, max_random);
  83     VF<T> bias_rnd_vec = generate_random_1d<T>(output_f, min_random, max_random);
  84     VF<T> input_rnd_vec = flatten_4d<T>(test_input_fmt, input_rnd);
  85     VF<T> weights_rnd_vec = flatten_4d<T>(test_weights_fmt, weights_rnd);
  86
  87     const auto& engine = get_test_engine();
  88     tensor input_tensor(input_b, f, x, y);
  89     tensor weights_tensor(output_f, f, x, y);
  90     auto input = memory::allocate(engine, { type_to_data_type<T>::value, test_input_fmt, input_tensor });
  91     auto weights = memory::allocate(engine, { type_to_data_type<T>::value, test_weights_fmt, weights_tensor });
  92     auto bias = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1,1,output_f,1 } });
  93     set_values(input, input_rnd_vec);
  94     set_values(weights, weights_rnd_vec);
  95     set_values(bias, bias_rnd_vec);
  96
  97     topology topology(
  98         input_layout("input", input.get_layout()),
  99         data("weights", weights),
 100         data("bias", bias),
 101         fully_connected("fully_connected", "input", "weights", "bias", relu, slope)
 102     );
 103
 104     network network(engine, topology);
 105     network.set_input_data("input", input);
 106
 107     auto outputs = network.execute();
 108     EXPECT_EQ(outputs.size(), size_t(1));
 109     EXPECT_EQ(outputs.begin()->first, "fully_connected");
 110
 111     auto output_memory = outputs.at("fully_connected").get_memory();
 112     auto output_layout = output_memory.get_layout();
 113     auto output_ptr = output_memory.pointer<T>();
 114
 115     //EXPECT_EQ(output_layout.format.value, test_input_fmt);
 116     tensor output_tensor = output_layout.size;
 117     int b_size = output_tensor.batch[0];
 118     int x_size = output_tensor.feature[0];
 119     EXPECT_EQ(b_size, input_b);
 120     EXPECT_EQ(x_size, output_f);
 121     unsigned num_of_operations = f * x * y * 2;
 122     float ulp = (1.0f / 1024.0f) * num_of_operations;
 123     bool test_is_correct = true;
 124     VVVVF<T> output_cpu = fully_connected_reference<T>(input_rnd, weights_rnd, bias_rnd_vec, relu, slope);
 125     VF<T> output_cpu_vec = flatten_4d<T>(layout_4d(output_layout.format), output_cpu);
 126     for (size_t i = 0; i < output_cpu_vec.size(); ++i) {
 127         if (std::abs(float(output_cpu_vec[i]) - float(output_ptr[i])) > ulp) {
 128             EXPECT_FLOAT_EQ(output_cpu_vec[i], output_ptr[i]); // to print the problematic values
 129             test_is_correct = false;
 130             break;
 131         }
 132     }
 133
 134     EXPECT_EQ(test_is_correct, true) << std::endl
 135         << "failing test parameters:" << std::endl
 136         << "test_input_fmt = " << format::traits(test_input_fmt).order << std::endl
 137         << "test_weights_fmt = " << format::traits(test_weights_fmt).order << std::endl
 138         << "input_b = " << input_b << std::endl
 139         << "f = " << f << std::endl
 140         << "y = " << y << std::endl
 141         << "x = " << x << std::endl
 142         << "output_f = " << output_f << std::endl
 143         << "relu = " << relu << std::endl
 144         << "slope = " << (float)slope << std::endl
 145         << "type = " << (sizeof(T) == 2 ? "float16" : "float32") << std::endl;
 146 }
 147
 148 TEST(DISABLED_fully_connected_gpu, generic_random_short) {
 149     VF<cldnn::format> test_input_fmts = { cldnn::format::bfyx, cldnn::format::yxfb };
 150     VF<cldnn::format> test_weights_fmts = { cldnn::format::yxfb };
 151     VF<bool> relu = { true, false };
 152     std::vector<int> batches = { 1, 2, 4, 8, 16 };
 153     std::vector<int> features = { 1, 2 };
 154     std::vector<std::pair<int, int>> input_sizes = { {28, 28}, {64, 64}, {100, 100}, {227, 227}, {1000, 1}, {1, 4096} };
 155     VF<int> outputs_x = { 5, 16 };
 156
 157     const auto& engine = get_test_engine();
 158     bool f16_supported = !!engine.get_info().supports_fp16;
 159     if (!f16_supported) {
 160         std::cout << "[ SKIPPED  ] float16 combinations are skipped (cl_khr_fp16 is not supported)." << std::endl;
 161     }
 162
 163     for (cldnn::format test_input_fmt : test_input_fmts) {
 164         for (cldnn::format test_weights_fmt : test_weights_fmts) {
 165             for (const auto& b : batches) {
 166                 for(const auto& f : features) {
 167                     for (const auto& sizes : input_sizes) {
 168                         for (int output_f : outputs_x) {
 169                             for (bool relu_activated : relu) {
 170                                     generic_fully_connected_test<float>(test_input_fmt, test_weights_fmt, b, f, sizes.second, sizes.first, output_f, relu_activated);
 171                                     if (!f16_supported) continue;
 172                                     generic_fully_connected_test<FLOAT16>(test_input_fmt, test_weights_fmt, b, f, sizes.second, sizes.first, output_f, relu_activated);
 173                             }
 174                         }
 175                     }
 176                 }
 177             }
 178         }
 179     }
 180 }
 181
 182 TEST(fully_connected_gpu, no_biases) {
 183     //  Input  : 3x1
 184     //  Output : 4x1
 185     //  Weights: 4x3
 186     //
 187     //  Input:
 188     //  -0.5     2    0.5
 189     //
 190     //  Weights:
 191     //   1.5     1    0.5
 192     //  -1       0    0.5
 193     //   0.5    -0.5 -2
 194     //  -0.5     1    1.5
 195     //
 196     //
 197     //  Biases:
 198     //   no biases
 199     //
 200     //  Output:
 201     //   2.5    2.75    0.75   7
 202
 203     const int32_t input_x = 3, input_b = 1,  // size of whole input buffer
 204         weight_b = 4, weight_x = 3;  // size of whole weights buffer
 205
 206     const auto& engine = get_test_engine();
 207
 208     auto input_prim = memory::allocate(engine, { data_types::f32,format::yxfb,{ input_b, 1, input_x, 1} });
 209     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } });
 210
 211     set_values(input_prim, { -0.5f, 2.0f, 0.5f });
 212     set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
 213
 214     auto input = input_layout("input", input_prim.get_layout());
 215     auto w_data = data("weights", weights_prim);
 216     auto fc = fully_connected("full_con_prim", "input", "weights");
 217     topology topology;
 218     topology.add(input);
 219     topology.add(w_data);
 220     topology.add(fc);
 221
 222     network network(engine, topology);
 223     network.set_input_data("input", input_prim);
 224
 225     auto outputs = network.execute();
 226     EXPECT_EQ(outputs.size(), size_t(1));
 227     EXPECT_EQ(outputs.begin()->first, "full_con_prim");
 228
 229     auto output_prim = outputs.begin()->second.get_memory();
 230
 231     auto output_ptr = output_prim.pointer<float>();
 232
 233     EXPECT_EQ(1.5f, output_ptr[0]);
 234     EXPECT_EQ(0.75f, output_ptr[1]);
 235     EXPECT_EQ(-2.25f, output_ptr[2]);
 236     EXPECT_EQ(3.0f, output_ptr[3]);
 237 }
 238
 239
 240 TEST(fully_connected_gpu, no_biases_int8) {
 241     //  Input  : 3x1
 242     //  Output : 4x1
 243     //  Weights: 4x3
 244     //
 245     //  Input:
 246     //  8.0f, 2.0f, -4.0f
 247     //
 248     //  Weights:
 249     //   2.0f    1.0f  0.0f
 250     //  -3.0f   -2.0f  1.0f
 251     //   0.0f   -2.0f -4.0f
 252     //  -5.0f   10.0f  8.0f
 253     //
 254     //
 255     //  Biases:
 256     //   no biases
 257     //
 258     //  Output:
 259     //  18    -32    12   -52
 260
 261     const int32_t input_x = 3, input_b = 1,  // size of whole input buffer
 262         weight_b = 4, weight_x = 3;  // size of whole weights buffer
 263
 264     const auto& engine = get_test_engine();
 265
 266     auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ input_b, 1, input_x, 1 } });
 267     auto weights_prim = memory::allocate(engine, { data_types::i8,format::bfyx,{ weight_b, 1, weight_x, 1 } });
 268
 269     set_values(input_prim, { 8.4f, 2.3f, -4.99f });
 270     set_values<char>(weights_prim, { 2, 1, 0, -3, -2, 1, 0, -2, -4, -5, 10, 8 });
 271
 272     auto input = input_layout("input", input_prim.get_layout());
 273     auto w_data = data("weights", weights_prim);
 274     auto ri = reorder("reorder_to_int", "input", { data_types::i8,format::bfyx,{ input_b, 1, input_x, 1 } });
 275     auto fc = fully_connected("full_con_prim", "reorder_to_int", "weights");
 276     auto rf = reorder("reorder_to_float", "full_con_prim", { data_types::f32,format::bfyx,{ input_b, 1, 4, 1 } });
 277     topology topology;
 278     topology.add(input);
 279     topology.add(w_data);
 280     topology.add(fc);
 281     topology.add(ri);
 282     topology.add(rf);
 283     network network(engine, topology);
 284     network.set_input_data("input", input_prim);
 285
 286     auto outputs = network.execute();
 287     EXPECT_EQ(outputs.size(), size_t(1));
 288     EXPECT_EQ(outputs.begin()->first, "reorder_to_float");
 289
 290     auto output_prim = outputs.begin()->second.get_memory();
 291
 292     auto output_ptr = output_prim.pointer<float>();
 293
 294     EXPECT_EQ(18.0f, output_ptr[0]);
 295     EXPECT_EQ(-32.0f, output_ptr[1]);
 296     EXPECT_EQ(12.0f, output_ptr[2]);
 297     EXPECT_EQ(-52.0f, output_ptr[3]);
 298 }
 299
 300
 301 TEST(fully_connected_gpu, xb_f32_batch_1) {
 302     //  Input  : 3x1
 303     //  Output : 4x1
 304     //  Weights: 4x3
 305     //
 306     //  Input:
 307     //  -0.5     2    0.5
 308     //
 309     //  Weights:
 310     //   1.5     1    0.5
 311     //  -1       0    0.5
 312     //   0.5    -0.5 -2
 313     //  -0.5     1    1.5
 314     //
 315     //
 316     //  Biases:
 317     //   1.0, 2.0, 3.0, 4.0
 318     //
 319     //  Output:
 320     //   2.5    2.75    0.75   7
 321
 322     const int32_t output_f = 4,  // size of whole output buffer
 323         input_x = 3, input_b = 1,  // size of whole input buffer
 324         weight_b = 4, weight_x = 3;  // size of whole weights buffer
 325
 326     const auto& engine = get_test_engine();
 327
 328     auto input_prim = memory::allocate( engine, { data_types::f32, format::yxfb, { input_b, 1, input_x, 1 } });
 329     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } });
 330     auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx, { 1,1,output_f, 1} });
 331
 332     set_values(input_prim, { -0.5f, 2.0f, 0.5f });
 333     set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
 334     set_values(bias_prim, { 1.0f, 2.0f, 3.0f, 4.0f });
 335
 336     topology topology(
 337         input_layout("input", input_prim.get_layout()),
 338         data("weights", weights_prim),
 339         data("bias", bias_prim),
 340         fully_connected("full_con_prim", "input", "weights", "bias")
 341     );
 342
 343     network network(engine, topology);
 344     network.set_input_data("input", input_prim);
 345
 346     auto outputs = network.execute();
 347     EXPECT_EQ(outputs.size(), size_t(1));
 348     EXPECT_EQ(outputs.begin()->first, "full_con_prim");
 349
 350     auto output_prim = outputs.begin()->second.get_memory();
 351
 352     auto output_ptr = output_prim.pointer<float>();
 353
 354     EXPECT_EQ(2.5f, output_ptr[0]);
 355     EXPECT_EQ(2.75f, output_ptr[1]);
 356     EXPECT_EQ(0.75f, output_ptr[2]);
 357     EXPECT_EQ(7.0f, output_ptr[3]);
 358 }
 359
 360 TEST(fully_connected_gpu, xb_f32_batch_2) {
 361     //  Input  : 3x2
 362     //  Output : 4x2
 363     //  Weights: 4x3
 364     //
 365     //  Input:
 366     //  -0.5     2    0.5
 367     //   1       1.5  0
 368     //
 369     //  Weights:
 370     //   1.5     1    0.5
 371     //  -1       0    0.5
 372     //   0.5    -0.5 -2
 373     //  -0.5     1    1.5
 374     //
 375     //  Biases:
 376     //   1.0, 2.0, 3.0, 4.0
 377     //
 378     //  Output:
 379     //   2.5    2.75     0.75   7
 380     //   4      1        2.75   5
 381
 382     const int32_t output_f = 4,  // size of whole output buffer
 383         input_x = 3, input_b = 2,  // size of whole input buffer
 384         weight_b = 4, weight_x = 3;  // size of whole weights buffer
 385
 386     const auto& engine = get_test_engine();
 387
 388     auto input_prim = memory::allocate(engine, { data_types::f32,format::yxfb,{ input_b,1,input_x, 1 } });
 389     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } });
 390     auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } });
 391
 392     set_values(input_prim, { -0.5f, 1.0f, 2.0f, 1.5f, 0.5f, 0.0f });
 393     set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
 394     set_values(bias_prim, { 1.0f, 2.0f, 3.0f, 4.0f });
 395
 396     topology topology(
 397         input_layout("input", input_prim.get_layout()),
 398         data("weights", weights_prim),
 399         data("bias", bias_prim),
 400         fully_connected("full_con_prim", "input", "weights", "bias")
 401     );
 402
 403     network network(engine, topology);
 404     network.set_input_data("input", input_prim);
 405
 406     auto outputs = network.execute();
 407     EXPECT_EQ(outputs.size(), size_t(1));
 408     EXPECT_EQ(outputs.begin()->first, "full_con_prim");
 409
 410     auto output_prim = outputs.begin()->second.get_memory();
 411
 412     auto output_ptr = output_prim.pointer<float>();
 413
 414     EXPECT_EQ(2.50f, output_ptr[0]);
 415     EXPECT_EQ(4.00f, output_ptr[1]);
 416     EXPECT_EQ(2.75f, output_ptr[2]);
 417     EXPECT_EQ(1.00f, output_ptr[3]);
 418     EXPECT_EQ(0.75f, output_ptr[4]);
 419     EXPECT_EQ(2.75f, output_ptr[5]);
 420     EXPECT_EQ(7.00f, output_ptr[6]);
 421     EXPECT_EQ(5.00f, output_ptr[7]);
 422 }
 423
 424 TEST(fully_connected_gpu, x_f32) {
 425     //  Input  : 3x1
 426     //  Output : 4x1
 427     //  Weights: 4x3
 428     //
 429     //  Input:
 430     //  -0.5     2    0.5
 431     //
 432     //  Weights:
 433     //   1.5     1    0.5
 434     //  -1       0    0.5
 435     //   0.5    -0.5 -2
 436     //  -0.5     1    1.5
 437     //
 438     //  Biases:
 439     //   1.0, 2.0, 3.0, 4.0
 440     //  Output:
 441     //   2.5    2.75    0.75   7
 442
 443     const int32_t output_f = 4,                 // size of whole output buffer
 444         input_x = 3,                 // size of whole input buffer
 445         weight_b = 4, weight_x = 3;  // size of whole weights buffer
 446
 447     const auto& engine = get_test_engine();
 448
 449     auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx, { 1,1,input_x,1 } });
 450     //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_f } },{ 1 } } });
 451     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } });
 452     auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } });
 453
 454     set_values(input_prim, { -0.5f, 2.0f, 0.5f });
 455     set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
 456     set_values(bias_prim, { 1.0f, 2.0f, 3.0f, 4.0f });
 457
 458     topology topology(
 459         input_layout("input", input_prim.get_layout()),
 460         data("weights", weights_prim),
 461         data("bias", bias_prim),
 462         fully_connected("full_con_prim", "input", "weights", "bias")
 463     );
 464
 465     network network(engine, topology);
 466     network.set_input_data("input", input_prim);
 467
 468     auto outputs = network.execute();
 469     EXPECT_EQ(outputs.size(), size_t(1));
 470     EXPECT_EQ(outputs.begin()->first, "full_con_prim");
 471
 472     auto output_prim = outputs.begin()->second.get_memory();
 473
 474     auto output_ptr = output_prim.pointer<float>();
 475
 476     EXPECT_EQ(2.50f, output_ptr[0]);
 477     EXPECT_EQ(2.75f, output_ptr[1]);
 478     EXPECT_EQ(0.75f, output_ptr[2]);
 479     EXPECT_EQ(7.00f, output_ptr[3]);
 480 }
 481
 482
 483 TEST(fully_connected_gpu, yxfn_f32) {
 484     //  Input  : 1x2x1x2 - 1 batch 2 feature maps of size 2x1
 485     //  Output : 2x1 - 2 batches 1 neuron each
 486     //  Weights: 2x2x1x2 - 2 neurons with weights of 2 feature maps of size 2x1
 487     //
 488     //  Input:
 489     //   1  -2      f0: b0
 490     //   3  -4      f1: b0
 491
 492     //  Weights:
 493     //   1  -1      n0: fm0
 494     //   2   0      n0: fm1
 495     //   3   4      n1: fm0
 496     //   0.5 5      n1: fm1
 497     //
 498     //  Biases:
 499     //   1.0 -5
 500     //
 501     //  Output:
 502     //   10  -28.5
 503
 504     const auto& engine = get_test_engine();
 505
 506     auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 2, 1 } });
 507     //auto output_prim = memory::allocate({ memory::format::xb_f32,{ 2 ,{ { 1 } }, 1 } });
 508     auto weights_prim = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 1 } });
 509     auto bias_prim = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 1 } });
 510
 511     set_values(input_prim, { 1.f, 3.f, -2.f, -4.f });
 512     set_values(weights_prim, { 1.f, -1.f, 2.0f, 0.f, 3.0f, 4.0f, 0.5f, 5.0f });
 513     set_values(bias_prim, { 1.0f, -5.0f });
 514
 515     topology topology(
 516         input_layout("input", input_prim.get_layout()),
 517         data("weights", weights_prim),
 518         data("bias", bias_prim),
 519         fully_connected("full_con_prim", "input", "weights", "bias")
 520     );
 521
 522     network network(engine, topology);
 523     network.set_input_data("input", input_prim);
 524
 525     auto outputs = network.execute();
 526     EXPECT_EQ(outputs.size(), size_t(1));
 527     EXPECT_EQ(outputs.begin()->first, "full_con_prim");
 528
 529     auto output_prim = outputs.begin()->second.get_memory();
 530
 531     auto output_ptr = output_prim.pointer<float>();
 532
 533     EXPECT_EQ(10, output_ptr[0]);
 534     EXPECT_EQ(-28.5, output_ptr[1]);
 535 }
 536
 537 TEST(fully_connected_gpu, xb_f32_batch_1_relu) {
 538     //  Input  : 3x1
 539     //  Output : 4x1
 540     //  Weights: 4x3
 541     //
 542     //  Input:
 543     //  -0.5     2    0.5
 544     //
 545     //  Weights:
 546     //   1.5     1    0.5
 547     //  -1       0    0.5
 548     //   0.5    -0.5 -2
 549     //  -0.5     1    1.5
 550     //
 551     //
 552     //  Biases:
 553     //   1.0,  -2.0,  3.0,  -4.0
 554     //
 555     //  Output:
 556     //   2.5   0      0.75  0
 557
 558     const int32_t output_f = 4,  // size of whole output buffer
 559         input_x = 3, input_b = 1,  // size of whole input buffer
 560         weight_b = 4, weight_x = 3;  // size of whole weights buffer
 561
 562     const auto& engine = get_test_engine();
 563
 564     auto input_prim = memory::allocate(engine, { data_types::f32,format::yxfb,{ input_b, 1, input_x, 1 } });
 565     //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_f } },{ 1 } } });
 566     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } });
 567     auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f, 1 } });
 568
 569     set_values(input_prim, { -0.5f, 2.0f, 0.5f });
 570     set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
 571     set_values(bias_prim, { 1.0f, -2.0f, 3.0f, -4.0f });
 572
 573     topology topology(
 574         input_layout("input", input_prim.get_layout()),
 575         data("weights", weights_prim),
 576         data("bias", bias_prim),
 577         fully_connected("full_con_prim", "input", "weights", "bias", true, 0)
 578     );
 579
 580     network network(engine, topology);
 581     network.set_input_data("input", input_prim);
 582
 583     auto outputs = network.execute();
 584     EXPECT_EQ(outputs.size(), size_t(1));
 585     EXPECT_EQ(outputs.begin()->first, "full_con_prim");
 586
 587     auto output_prim = outputs.begin()->second.get_memory();
 588
 589     auto output_ptr = output_prim.pointer<float>();
 590
 591     EXPECT_EQ(2.50f, output_ptr[0]);
 592     EXPECT_EQ(0.00f, output_ptr[1]);
 593     EXPECT_EQ(0.75f, output_ptr[2]);
 594     EXPECT_EQ(0.00f, output_ptr[3]);
 595 }
 596
 597 TEST(fully_connected_gpu, xb_f32_batch_2_relu) {
 598     //  Input  : 3x2
 599     //  Output : 4x2
 600     //  Weights: 4x3
 601     //
 602     //  Input:
 603     //  -0.5     2    0.5
 604     //   1       1.5  0
 605     //
 606     //  Weights:
 607     //   1.5     1    0.5
 608     //  -1       0    0.5
 609     //   0.5    -0.5 -2
 610     //  -0.5     1    1.5
 611     //
 612     //  Biases:
 613     //   1.0, -2.0, 3.0, -4.0
 614     //
 615     //  Output:
 616     //   2.5    0   0.75   0
 617     //   4      0   2.75   0
 618
 619     const int32_t output_f = 4,  // size of whole output buffer
 620         input_x = 3, input_b = 2,  // size of whole input buffer
 621         weight_b = 4, weight_x = 3;  // size of whole weights buffer
 622
 623     const auto& engine = get_test_engine();
 624
 625     auto input_prim = memory::allocate(engine, { data_types::f32,format::yxfb,{ input_b, 1, input_x, 1 } });
 626     //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_f } },{ 1 } } });
 627     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } });
 628     auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } });
 629
 630     set_values(input_prim, { -0.5f, 1.0f, 2.0f, 1.5f, 0.5f, 0.0f });
 631     set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
 632     set_values(bias_prim, { 1.0f, -2.0f, 3.0f, -4.0f });
 633
 634     topology topology(
 635         input_layout("input", input_prim.get_layout()),
 636         data("weights", weights_prim),
 637         data("bias", bias_prim),
 638         fully_connected("full_con_prim", "input", "weights", "bias", true, 0)
 639     );
 640
 641     network network(engine, topology);
 642     network.set_input_data("input", input_prim);
 643
 644     auto outputs = network.execute();
 645     EXPECT_EQ(outputs.size(), size_t(1));
 646     EXPECT_EQ(outputs.begin()->first, "full_con_prim");
 647
 648     auto output_prim = outputs.begin()->second.get_memory();
 649
 650     auto output_ptr = output_prim.pointer<float>();
 651
 652     EXPECT_EQ(2.50f, output_ptr[0]);
 653     EXPECT_EQ(4.00f, output_ptr[1]);
 654     EXPECT_EQ(0.00f, output_ptr[2]);
 655     EXPECT_EQ(0.00f, output_ptr[3]);
 656     EXPECT_EQ(0.75f, output_ptr[4]);
 657     EXPECT_EQ(2.75f, output_ptr[5]);
 658     EXPECT_EQ(0.00f, output_ptr[6]);
 659     EXPECT_EQ(0.00f, output_ptr[7]);
 660 }
 661
 662 TEST(fully_connected_gpu, x_f32_relu) {
 663     //  Input  : 3x1
 664     //  Output : 4x1
 665     //  Weights: 4x3
 666     //
 667     //  Input:
 668     //  -0.5     2    0.5
 669     //
 670     //  Weights:
 671     //   1.5     1    0.5
 672     //  -1       0    0.5
 673     //   0.5    -0.5 -2
 674     //  -0.5     1    1.5
 675     //
 676     //  Biases:
 677     //   1.0, -2.0, 3.0, -4.0
 678     //  Output:
 679     //   2.5   0    0.75  0
 680
 681     const int32_t output_f = 4,                 // size of whole output buffer
 682         input_x = 3,                 // size of whole input buffer
 683         weight_b = 4, weight_x = 3;  // size of whole weights buffer
 684
 685     const auto& engine = get_test_engine();
 686
 687     auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,input_x,1 } });
 688     //auto output_prim = memory::allocate({ memory::format::x_f32,{ 1       ,{ { output_f } }, 1 } });
 689     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } });
 690     auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } });
 691
 692     set_values(input_prim, { -0.5f, 2.0f, 0.5f });
 693     set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
 694     set_values(bias_prim, { 1.0f, -2.0f, 3.0f, -4.0f });
 695
 696     topology topology(
 697         input_layout("input", input_prim.get_layout()),
 698         data("weights", weights_prim),
 699         data("bias", bias_prim),
 700         fully_connected("full_con_prim", "input", "weights", "bias", true, 0)
 701     );
 702
 703     network network(engine, topology);
 704     network.set_input_data("input", input_prim);
 705
 706     auto outputs = network.execute();
 707     EXPECT_EQ(outputs.size(), size_t(1));
 708     EXPECT_EQ(outputs.begin()->first, "full_con_prim");
 709
 710     auto output_prim = outputs.begin()->second.get_memory();
 711
 712     auto output_ptr = output_prim.pointer<float>();
 713
 714     EXPECT_EQ(2.50f, output_ptr[0]);
 715     EXPECT_EQ(0.00f, output_ptr[1]);
 716     EXPECT_EQ(0.75f, output_ptr[2]);
 717     EXPECT_EQ(0.00f, output_ptr[3]);
 718 }
 719
 720 TEST(fully_connected_gpu, x_f32_relu_with_negative_slope) {
 721     //  Input  : 3x1
 722     //  Output : 4x1
 723     //  Weights: 4x3
 724     //  Negative Slope: 0.1
 725     //
 726     //  Input:
 727     //  -0.5     2    0.5
 728     //
 729     //  Weights:
 730     //   1.5     1    0.5
 731     //  -1       0    0.5
 732     //   0.5    -0.5 -2
 733     //  -0.5     1    1.5
 734     //
 735     //  Biases:
 736     //   1.0, -2.0, 3.0, -4.0
 737     //  Output:
 738     //   2.5   -0.125    0.75  -0.1
 739
 740     const int32_t output_f = 4,                 // size of whole output buffer
 741         input_x = 3,                 // size of whole input buffer
 742         weight_b = 4, weight_x = 3;  // size of whole weights buffer
 743
 744     const auto& engine = get_test_engine();
 745
 746     auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,input_x,1 } });
 747     //auto output_prim = memory::allocate({ memory::format::x_f32,{ 1       ,{ { output_f } }, 1 } });
 748     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } });
 749     auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } });
 750
 751     set_values(input_prim, { -0.5f, 2.0f, 0.5f });
 752     set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
 753     set_values(bias_prim, { 1.0f, -2.0f, 3.0f, -4.0f });
 754
 755     topology topology(
 756         input_layout("input", input_prim.get_layout()),
 757         data("weights", weights_prim),
 758         data("bias", bias_prim),
 759         fully_connected("full_con_prim", "input", "weights", "bias", true, 0.1f)
 760     );
 761
 762     network network(engine, topology);
 763     network.set_input_data("input", input_prim);
 764
 765     auto outputs = network.execute();
 766     EXPECT_EQ(outputs.size(), size_t(1));
 767     EXPECT_EQ(outputs.begin()->first, "full_con_prim");
 768
 769     auto output_prim = outputs.begin()->second.get_memory();
 770
 771     auto output_ptr = output_prim.pointer<float>();
 772
 773     EXPECT_EQ(2.50f, output_ptr[0]);
 774     EXPECT_EQ(-0.125f, output_ptr[1]);
 775     EXPECT_EQ(0.75f, output_ptr[2]);
 776     EXPECT_EQ(-0.1f, output_ptr[3]);
 777 }
 778
 779 TEST(fully_connected_gpu, b_fs_yx_fsv4)
 780 {
 781     const auto& engine = get_test_engine();
 782
 783     const int in_B = 2;
 784     const int in_F = 2048;
 785     const int in_Y = 1;
 786     const int in_X = 1;
 787
 788     const int W_B = 1000;
 789     const int W_F = in_F;
 790     const int W_Y = in_Y;
 791     const int W_X = in_X;
 792
 793     // Input data
 794     std::vector<char> Data(in_F * in_B); // in_X=in_Y=1
 795     int i = 0;
 796     std::generate(Data.begin(), Data.end(), [i]() mutable { return i++ % 9; });
 797     auto input = memory::allocate(engine, {data_types::i8, format::bfyx, {in_B, in_F, in_X, in_Y}});
 798     set_values(input, std::move(Data));
 799
 800     // Create a topology
 801     topology topology(input_layout("input", input.get_layout()));
 802
 803     // Reorder
 804     topology.add(reorder("reorder_in",
 805                          "input",
 806                          layout(data_types::i8, format::b_fs_yx_fsv4, {in_B, in_F, in_X, in_Y})));
 807
 808     // Weights
 809     std::vector<char> Weights(W_B * W_F);
 810     i = 0;
 811     std::generate(Weights.begin(), Weights.end(), [W_F, i]() mutable {
 812         return i % 2 ? -(i++) / W_F - 1 : (i++) / W_F + 1;
 813     });
 814     auto weights_gold =
 815         memory::allocate(engine, {data_types::i8, format::bfyx, {W_B, W_F, W_X, W_Y}});
 816     auto weights_imad =
 817         memory::allocate(engine, {data_types::i8, format::bfyx, {W_B, W_F, W_X, W_Y}});
 818     set_values(weights_gold, Weights);
 819     set_values(weights_imad, std::move(Weights));
 820     topology.add(data("weights_gold", weights_gold), data("weights_imad", weights_imad));
 821
 822     // Bias, Callibraiton, Quantization
 823     std::vector<float> vB(in_F), vC(in_F), vQ(in_F);
 824     float x = 0.1f;
 825     std::generate(vB.begin(), vB.end(), [x]() mutable {
 826         x += 0.01f;
 827         if (x >= 0.9f)
 828             x = 0.1f;
 829         return x;
 830     });
 831     x = 0.2f;
 832     std::generate(vC.begin(), vC.end(), [x]() mutable {
 833         x += 0.01f;
 834         if (x >= 0.9f)
 835             x = 0.2f;
 836         return x;
 837     });
 838     x = 0.3f;
 839     std::generate(vQ.begin(), vQ.end(), [x]() mutable {
 840         x += 0.01f;
 841         if (x >= 0.9f)
 842             x = 0.3f;
 843         return x;
 844     });
 845     auto bias_gold = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}});
 846     auto bias_imad = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}});
 847     auto callib_gold = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}});
 848     auto callib_imad = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}});
 849     auto quant_gold = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}});
 850     auto quant_imad = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}});
 851     set_values(bias_gold, vB);
 852     set_values(bias_imad, std::move(vB));
 853     set_values(callib_gold, vC);
 854     set_values(callib_imad, std::move(vC));
 855     set_values(quant_gold, vQ);
 856     set_values(quant_imad, std::move(vQ));
 857     topology.add(data("bias_gold", bias_gold),
 858                  data("callib_gold", callib_gold),
 859                  data("quant_gold", quant_gold));
 860     topology.add(data("bias_imad", bias_imad),
 861                  data("callib_imad", callib_imad),
 862                  data("quant_imad", quant_imad));
 863
 864     // Fully connected
 865     fully_connected fullc_gold(
 866         "fullc_gold", "input", "weights_gold", {"bias_gold"}, {"quant_gold"}, {"callib_gold"}, 1.0f);
 867     fully_connected fullc_imad(
 868         "fullc_imad", "reorder_in", "weights_imad", {"bias_imad"}, {"quant_imad"}, {"callib_imad"}, 1.0f);
 869     topology.add(fullc_gold, fullc_imad);
 870
 871     // Output reorder
 872     auto reorder_gold =
 873         reorder("reorder_gold", fullc_gold, layout(data_types::i8, format::bfyx, {in_B, W_B, 1, 1}));
 874     auto reorder_imad =
 875         reorder("reorder_imad", fullc_imad, layout(data_types::i8, format::bfyx, {in_B, W_B, 1, 1}));
 876     topology.add(reorder_gold, reorder_imad);
 877
 878     // Network build
 879     build_options build_opt;
 880     build_opt.set_option(build_option::optimize_data(true));
 881     network network(engine, topology, build_opt);
 882
 883     // Network execuiton
 884     network.set_input_data("input", input);
 885     auto outputs = network.execute();
 886
 887     auto out_gold = outputs.find("reorder_gold");
 888     auto out_test = outputs.find("reorder_imad");
 889
 890     ASSERT_NE(out_gold, outputs.end());
 891     ASSERT_NE(out_test, outputs.end());
 892     auto gold_ptr = out_gold->second.get_memory().pointer<char>();
 893     auto test_ptr = out_test->second.get_memory().pointer<char>();
 894
 895     ASSERT_EQ(gold_ptr.size(), test_ptr.size());
 896     for (size_t i = 0; i < gold_ptr.size(); i++)
 897     {
 898         ASSERT_EQ(gold_ptr[i], test_ptr[i]);
 899     }
 900 }