inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 ///////////////////////////////////////////////////////////////////////////////////////////////////
  18
  19 #include <gtest/gtest.h>
  20 #include "api/CPP/memory.hpp"
  21 #include <api/CPP/input_layout.hpp>
  22 #include "api/CPP/convolution.hpp"
  23 #include <api/CPP/topology.hpp>
  24 #include <api/CPP/network.hpp>
  25 #include <api/CPP/engine.hpp>
  26 #include "test_utils/test_utils.h"
  27 #include "test_utils/float16.h"
  28 #include <api/CPP/data.hpp>
  29 #include <algorithm>
  30 #include <cmath>
  31 #include <iostream>
  32 #include <iomanip>
  33 #include <thread>
  34 #include <fstream>
  35 #include <api/CPP/reorder.hpp>
  36
  37 using namespace cldnn;
  38 using namespace tests;
  39
  40
  41 namespace cldnn
  42 {
  43     template<> struct type_to_data_type<FLOAT16> { static const data_types value = data_types::f16; };
  44 }
  45
  46
  47
  48 template<typename T>
  49 T kahan_summation(std::vector<T> &input) {
  50     T sum = 0;
  51     T c = 0;
  52     for (T x : input) {
  53         T y = x - c;
  54         T t = sum + y;
  55         c = (t - sum) - y;
  56         sum = t;
  57     }
  58     return sum;
  59 }
  60
  61 template<typename T>
  62 VVF<T> reference_convolve(VVVF<T> &input, VVVF<T> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
  63         int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
  64         int output_padding_x = 0, size_t f_begin = 0)
  65 {
  66     size_t kernel_extent_y = dilation_y * (filter[0].size() - 1) + 1;
  67     size_t kernel_extent_x = dilation_x * (filter[0][0].size() - 1) + 1;
  68     size_t output_y = 1 + (input[0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
  69     size_t output_x = 1 + (input[0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
  70     VVF<T> output(output_y, VF<T>(output_x, bias));
  71     for (size_t f = 0; f < filter.size(); ++f) {
  72         for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
  73             for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
  74                 VF<T> values;
  75                 values.reserve(filter[0].size() * filter[0][0].size());
  76                 for (size_t yf = 0; yf < filter[0].size(); ++yf) {
  77                     int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
  78                     if (yi < 0 || (int)input[0].size() <= yi) continue;
  79                     for (size_t xf = 0; xf < filter[0][0].size(); ++xf) {
  80                         int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
  81                         if (xi < 0 || (int)input[0][0].size() <= xi) continue;
  82                         values.push_back(input[f_begin + f][yi][xi] * filter[f][yf][xf]);
  83                     }
  84                 }
  85                 output[y + output_padding_y][x + output_padding_x] += kahan_summation<T>(values);
  86             }
  87         }
  88     }
  89     return output;
  90 }
  91
  92 void dump_buffer(memory const& mem, std::string const& name)
  93 {
  94     std::ofstream out(name);
  95     auto size = mem.get_layout().get_buffer_size();
  96     auto ptr = mem.pointer<const float>();
  97     auto pitches = mem.get_layout().get_pitches();
  98     out << "Data size: " << mem.get_layout().size << "\n";
  99     out << "Lower padding: " << mem.get_layout().data_padding.lower_size() << "\n";
 100     out << "Upper padding: " << mem.get_layout().data_padding.upper_size() << "\n";
 101     out << "\n";
 102
 103     for (int b = 0; b < size.batch[0]; ++b)
 104     {
 105         out << " ================ BATCH " << b << " =================\n\n";
 106         for (int f = 0; f < size.feature[0]; ++f)
 107         {
 108             out << "feature " << f << ":\n";
 109             for (int y = 0; y < size.spatial[1]; ++y)
 110             {
 111                 for (int x = 0; x < size.spatial[0]; ++x)
 112                 {
 113                     size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + y * pitches.spatial[1] + x * pitches.spatial[0];
 114                     out << ptr[idx] << " ";
 115                 }
 116                 out << "\n";
 117             }
 118
 119             out << "\n";
 120         }
 121
 122         out << "\n";
 123     }
 124 }
 125
 126
 127 TEST(convolution_f32_fw_gpu, basic_convolution_no_bias) {
 128     //  Filter : 2x3
 129     //  Stride : 2x1
 130     //  Input  : 4x5
 131     //  Output : 2x3
 132     //
 133     //  Input:
 134     //  1  2  3  4  5
 135     //  2  2  3  4  6
 136     //  3  3  3  5  1
 137     //  1  1  1  1  1
 138     //
 139     //  Filter:
 140     //  1  2  1
 141     //  2  1  2
 142     //
 143     //  Output:
 144     // 21  28  39
 145     // 18  20  20
 146
 147     engine engine;
 148
 149     auto input = memory::allocate(engine, { data_types::f32,format::yxfb,{ 1, 1, 5, 4 } });
 150     auto weights = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1, 1, 3, 2 } });
 151
 152     set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 4.0f, 6.0f, 3.0f, 3.0f, 3.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f });
 153     set_values(weights, { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f });
 154     VVF<float> output_vec = {
 155         { 20.0f, 27.0f, 38.0f },
 156         { 17.0f, 19.0f, 19.0f } };
 157
 158     topology topology(
 159         input_layout("input", input.get_layout()),
 160         data("weights", weights),
 161         convolution("conv", "input", { "weights" }, { 1,1,1,2 }));
 162
 163     network network(engine, topology);
 164     network.set_input_data("input", input);
 165
 166     auto outputs = network.execute();
 167     EXPECT_EQ(outputs.size(), size_t(1));
 168     EXPECT_EQ(outputs.begin()->first, "conv");
 169
 170     auto output_memory = outputs.at("conv").get_memory();
 171     auto output_layout = output_memory.get_layout();
 172     auto output_ptr = output_memory.pointer<float>();
 173
 174     int y_size = output_layout.size.spatial[1];
 175     int x_size = output_layout.size.spatial[0];
 176     int f_size = output_layout.size.feature[0];
 177     int b_size = output_layout.size.batch[0];
 178     EXPECT_EQ(output_layout.format, format::yxfb);
 179     EXPECT_EQ(y_size, 2);
 180     EXPECT_EQ(x_size, 3);
 181     EXPECT_EQ(f_size, 1);
 182     EXPECT_EQ(b_size, 1);
 183     for (int y = 0; y < y_size; ++y) {
 184         for (int x = 0; x < x_size; ++x) {
 185             EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
 186         }
 187     }
 188
 189     //VVF temp_vec(y_size, VF(x_size, 0.0f));
 190     //for (int y = 0; y < y_size; ++y) {
 191     //    for (int x = 0; x < x_size; ++x) {
 192     //        temp_vec[y][x] = output_ptr[y * x_size + x];
 193     //    }
 194     //}
 195     //print_2d(temp_vec);
 196 }
 197
 198
 199 TEST(convolution_f32_fw_gpu, basic_convolution_int8_no_bias) {
 200     //  Filter : 2x3
 201     //  Stride : 2x1
 202     //  Input  : 4x5
 203     //  Output : 2x3
 204     //
 205     //  Input:
 206     //  1  2  3  4  5
 207     //  2  2  3  4  6
 208     //  3  3  3  5  1
 209     //  1  1  1  1  1
 210     //
 211     //  Filter:
 212     //  1  2  1
 213     //  2  1  2
 214     //
 215     //  Output:
 216     // 21  28  39
 217     // 18  20  20
 218
 219     engine engine;
 220
 221     auto input = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1, 1, 5, 4 } });
 222     auto weights = memory::allocate(engine, { data_types::i8,format::bfyx,{ 1, 1, 3, 2 } });
 223
 224     set_values(input, { 1.1f, 2.4f, 3.5f, 4.5f, 5.8f, 2.9f, 2.3f, 3.5f, 4.4f, 6.6f, 3.8f, 3.9f, 3.4f, 5.1f, 1.4f, 1.8f, 1.1f, 1.2f, 1.2f, 1.9f });
 225     set_values<char>(weights, { 1, 2, 1, 2, 1, 2 });
 226     VVF<float> output_vec = {
 227         { 20.0f, 27.0f, 38.0f },
 228         { 17.0f, 19.0f, 19.0f } };
 229
 230     topology topology(
 231         input_layout("input", input.get_layout()),
 232         reorder("to_int","input", { data_types::i8,format::bfyx,{ 1, 1, 5, 4 } }),
 233         data("weights", weights),
 234         convolution("conv", "to_int", { "weights" }, { 1,1,1,2 }),
 235         reorder("output", "conv", { data_types::f32,format::bfyx,{ 1, 1, 3, 2 } }));
 236
 237     network network(engine, topology);
 238     network.set_input_data("input", input);
 239
 240     auto outputs = network.execute();
 241     EXPECT_EQ(outputs.size(), size_t(1));
 242     EXPECT_EQ(outputs.begin()->first, "output");
 243
 244     auto output_memory = outputs.at("output").get_memory();
 245     auto output_layout = output_memory.get_layout();
 246     auto output_ptr = output_memory.pointer<float>();
 247
 248     int y_size = output_layout.size.spatial[1];
 249     int x_size = output_layout.size.spatial[0];
 250     int f_size = output_layout.size.feature[0];
 251     int b_size = output_layout.size.batch[0];
 252     EXPECT_EQ(output_layout.format, format::bfyx);
 253     EXPECT_EQ(y_size, 2);
 254     EXPECT_EQ(x_size, 3);
 255     EXPECT_EQ(f_size, 1);
 256     EXPECT_EQ(b_size, 1);
 257     for (int y = 0; y < y_size; ++y) {
 258         for (int x = 0; x < x_size; ++x) {
 259             EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
 260         }
 261     }
 262 }
 263
 264
 265 TEST(convolution_f32_fw_gpu, basic_convolution) {
 266     //  Filter : 2x3
 267     //  Stride : 2x1
 268     //  Input  : 4x5
 269     //  Output : 2x3
 270     //
 271     //  Input:
 272     //  1  2  3  4  5
 273     //  2  2  3  4  6
 274     //  3  3  3  5  1
 275     //  1  1  1  1  1
 276     //
 277     //  Filter:
 278     //  1  2  1
 279     //  2  1  2
 280     //
 281     //  Output:
 282     // 21  28  39
 283     // 18  20  20
 284     //
 285     //  Bias:
 286     //  1
 287
 288     engine engine;
 289
 290     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 5, 4 } });
 291     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 2 } });
 292     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
 293
 294     set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 4.0f, 6.0f, 3.0f, 3.0f, 3.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f });
 295     set_values(weights, { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f });
 296     set_values(biases, { 1.0f });
 297     VVF<float> output_vec = {
 298         { 21.0f, 28.0f, 39.0f },
 299         { 18.0f, 20.0f, 20.0f } };
 300
 301     topology topology(
 302         input_layout("input", input.get_layout()),
 303         data("weights", weights),
 304         data("biases", biases),
 305         convolution( "conv", "input", { "weights" }, { "biases" }, { 0,0,1,2 }));
 306
 307     network network(engine, topology);
 308     network.set_input_data("input", input);
 309
 310     auto outputs = network.execute();
 311     EXPECT_EQ(outputs.size(), size_t(1));
 312     EXPECT_EQ(outputs.begin()->first, "conv");
 313
 314     auto output_memory = outputs.at("conv").get_memory();
 315     auto output_layout = output_memory.get_layout();
 316     auto output_ptr = output_memory.pointer<float>();
 317
 318     int y_size = output_layout.size.spatial[1];
 319     int x_size = output_layout.size.spatial[0];
 320     int f_size = output_layout.size.feature[0];
 321     int b_size = output_layout.size.batch[0];
 322     EXPECT_EQ(output_layout.format, format::yxfb);
 323     EXPECT_EQ(y_size, 2);
 324     EXPECT_EQ(x_size, 3);
 325     EXPECT_EQ(f_size, 1);
 326     EXPECT_EQ(b_size, 1);
 327     for (int y = 0; y < y_size; ++y) {
 328         for (int x = 0; x < x_size; ++x) {
 329             EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
 330         }
 331     }
 332 }
 333
 334 TEST(convolution_f32_fw_gpu, basic_convolution_bfyx_weights_as_input_layout) {
 335     //Same params as convolution_f32_fw_gpu, basic_convolution but with bfyx optimized data and weights set as input_layout
 336     engine engine;
 337     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,
 338     { 1, 1, 5, 4 }
 339     });
 340     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,
 341     { 1, 1, 3, 2 }
 342     });
 343     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,
 344     { 1, 1, 1, 1 }
 345     });
 346     set_values(input,
 347     { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 4.0f, 6.0f, 3.0f, 3.0f, 3.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }
 348     );
 349     set_values(weights,
 350     { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f }
 351     );
 352     set_values(biases,
 353     { 1.0f }
 354     );
 355     VVF<float> output_vec = {
 356         { 21.0f, 28.0f, 39.0f }
 357         ,
 358         { 18.0f, 20.0f, 20.0f }
 359     };
 360     topology topology(
 361         input_layout("input", input.get_layout()),
 362         input_layout("weights", weights.get_layout()),
 363         input_layout("biases", biases.get_layout()),
 364         convolution("conv", "input",
 365         { "weights" }
 366             ,
 367             { "biases" }
 368             ,
 369             { 0,0,1,2 }
 370     ));
 371     cldnn::build_options options;
 372     options.set_option(cldnn::build_option::optimize_data(true));
 373     network network(engine, topology, options);
 374     network.set_input_data("input", input);
 375     network.set_input_data("weights", weights);
 376     network.set_input_data("biases", biases);
 377     auto outputs = network.execute();
 378     EXPECT_EQ(outputs.size(), size_t(1));
 379     EXPECT_EQ(outputs.begin()->first, "conv");
 380
 381     auto output_memory = outputs.at("conv").get_memory();
 382     auto output_layout = output_memory.get_layout();
 383     auto output_ptr = output_memory.pointer<float>();
 384
 385     int y_size = output_layout.size.spatial[1];
 386     int x_size = output_layout.size.spatial[0];
 387     int f_size = output_layout.size.feature[0];
 388     int b_size = output_layout.size.batch[0];
 389     EXPECT_EQ(output_layout.format, format::bfyx);
 390     EXPECT_EQ(y_size, 2);
 391     EXPECT_EQ(x_size, 3);
 392     EXPECT_EQ(f_size, 1);
 393     EXPECT_EQ(b_size, 1);
 394     for (int y = 0; y < y_size; ++y) {
 395         for (int x = 0; x < x_size; ++x) {
 396             EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
 397         }
 398     }
 399 }
 400
 401 TEST(convolution_f32_fw_gpu, basic_convolution_input_padding) {
 402     //  Filter : 2x2
 403     //  Stride : 1x1
 404     //  Input  : 3x4
 405     //  Input padding : 2x1
 406     //  Output : 6x5
 407     //  Padding: Zero
 408     //
 409     //  Input:
 410     //  z  z  z  z  z  z
 411     //  z  z  z  z  z  z
 412     //  z  1  2  3  4  z
 413     //  z  2  2  3  4  z
 414     //  z  3  3  3  5  z
 415     //  z  z  z  z  z  z
 416     //  z  z  z  z  z  z
 417     //
 418     //  Filter:
 419     //  1  1
 420     //  1  1
 421     //
 422     //  Output:
 423     //  1  1  1  1  1
 424     //  2  4  6  8  5
 425     //  4  8 11 15  9
 426     //  6 11 12 16 10
 427     //  4  7  7  9  6
 428     //  1  1  1  1  1
 429     //
 430     //  Bias:
 431     //  1
 432
 433     engine engine;
 434
 435     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 3 } });
 436     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
 437     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
 438
 439     set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f });
 440     set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f });
 441     set_values(biases, { 1.0f });
 442     VVF<float> output_vec = {
 443         { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
 444         { 2.0f, 4.0f, 6.0f, 8.0f, 5.0f },
 445         { 4.0f, 8.0f, 11.0f, 15.0f, 9.0f },
 446         { 6.0f, 11.0f, 12.0f, 16.0f, 10.0f },
 447         { 4.0f, 7.0f, 7.0f, 9.0f, 6.0f },
 448         { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } };
 449
 450     topology topology(
 451         input_layout("input", input.get_layout()),
 452         data("weights", weights),
 453         data("biases", biases),
 454         convolution(
 455             "conv",
 456             "input",
 457             { "weights" },
 458             { "biases" },
 459             { 1,1,1,1 },
 460             { 0,0,-1,-2 },
 461             { 1, 1, 1, 1 },
 462             false,
 463             0,
 464             padding{ { 0,0,0,0 }, 0 })
 465     );
 466
 467     network network(engine, topology);
 468     network.set_input_data("input", input);
 469
 470     auto outputs = network.execute();
 471     EXPECT_EQ(outputs.size(), size_t(1));
 472     EXPECT_EQ(outputs.begin()->first, "conv");
 473
 474     auto output_memory = outputs.at("conv").get_memory();
 475     auto output_layout = output_memory.get_layout();
 476     auto output_ptr = output_memory.pointer<float>();
 477
 478     int y_size = output_layout.size.spatial[1];
 479     int x_size = output_layout.size.spatial[0];
 480     int f_size = output_layout.size.feature[0];
 481     int b_size = output_layout.size.batch[0];
 482     EXPECT_EQ(output_layout.format, format::yxfb);
 483     EXPECT_EQ(y_size, 6);
 484     EXPECT_EQ(x_size, 5);
 485     EXPECT_EQ(f_size, 1);
 486     EXPECT_EQ(b_size, 1);
 487
 488     for (int y = 0; y < y_size; ++y) {
 489         for (int x = 0; x < x_size; ++x) {
 490             EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
 491         }
 492     }
 493
 494     //VVF temp_vec(y_size, VF(x_size, 0.0f));
 495     //for (int y = 0; y < y_size; ++y) {
 496     //    for (int x = 0; x < x_size; ++x) {
 497     //        temp_vec[y][x] = output_ptr[y * x_size + x];
 498     //    }
 499     //}
 500     //print_2d(temp_vec);
 501 }
 502
 503 TEST(convolution_f32_fw_gpu, basic_convolution_input_and_output_padding) {
 504     //  Filter : 2x2
 505     //  Stride : 1x1
 506     //  Input  : 3x4
 507     //  Input padding : 2x1
 508     //  Output : 8x9
 509     //  Padding: Zero
 510     //
 511     //  Input:
 512     //  z  z  z  z  z  z
 513     //  z  z  z  z  z  z
 514     //  z  1  2  3  4  z
 515     //  z  2  2  3  4  z
 516     //  z  3  3  3  5  z
 517     //  z  z  z  z  z  z
 518     //  z  z  z  z  z  z
 519     //
 520     //  Filter:
 521     //  1  1
 522     //  1  1
 523     //
 524     //  Output:
 525     //  1  1  1  1  1  1  1  1  1
 526     //  1  1  1  1  1  1  1  1  1
 527     //  1  1  2  4  6  8  5  1  1
 528     //  1  1  4  8 11 15  9  1  1
 529     //  1  1  6 11 12 16 10  1  1
 530     //  1  1  4  7  7  9  6  1  1
 531     //  1  1  1  1  1  1  1  1  1
 532     //  1  1  1  1  1  1  1  1  1
 533     //
 534     //  Bias:
 535     //  1
 536
 537     engine engine;
 538
 539     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 3 } });
 540     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
 541     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
 542
 543     set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f });
 544     set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f });
 545     set_values(biases, { 1.0f });
 546     VVF<float> output_vec = {
 547         { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
 548         { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
 549         { 1.0f, 1.0f, 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f, 1.0f },
 550         { 1.0f, 1.0f, 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f, 1.0f },
 551         { 1.0f, 1.0f, 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f, 1.0f },
 552         { 1.0f, 1.0f, 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f, 1.0f },
 553         { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
 554         { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } };
 555
 556     const int x_pad = 2;
 557     const int y_pad = 1;
 558     topology topology(
 559         input_layout("input", input.get_layout()),
 560         data("weights", weights),
 561         data("biases", biases),
 562         convolution(
 563             "conv",
 564             "input",
 565             { "weights" },
 566             { "biases" },
 567             { 1,1,1,1 },
 568             { 0,0,-1,-2 },
 569             { 1, 1, 1, 1 },
 570             false,
 571             0,
 572             padding{ { 0,0,-x_pad,-y_pad }, 0 })
 573     );
 574
 575     network network(engine, topology);
 576     network.set_input_data("input", input);
 577
 578     auto outputs = network.execute();
 579     EXPECT_EQ(outputs.size(), size_t(1));
 580     EXPECT_EQ(outputs.begin()->first, "conv");
 581
 582     auto output_memory = outputs.at("conv").get_memory();
 583     auto output_layout = output_memory.get_layout();
 584     auto output_size = output_layout.get_buffer_size();
 585     auto output_ptr = output_memory.pointer<float>();
 586
 587     int y_size = output_size.spatial[1];
 588     int x_size = output_size.spatial[0];
 589     int f_size = output_size.feature[0];
 590     int b_size = output_size.batch[0];
 591     EXPECT_EQ(output_layout.format, format::yxfb);
 592     EXPECT_EQ(y_size, 8);
 593     EXPECT_EQ(x_size, 9);
 594     EXPECT_EQ(f_size, 1);
 595     EXPECT_EQ(b_size, 1);
 596
 597     for (int y = y_pad; y < y_size - y_pad; ++y)
 598     {
 599         for (int x = x_pad; x < x_size - x_pad; ++x)
 600         {
 601             EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
 602         }
 603     }
 604
 605     //VVF temp_vec(y_size, VF(x_size, 0.0f));
 606     //for (int y = 0; y < y_size; ++y) {
 607     //    for (int x = 0; x < x_size; ++x) {
 608     //        temp_vec[y][x] = output_ptr[y * x_size + x];
 609     //    }
 610     //}
 611     //print_2d(temp_vec);
 612 }
 613
 614 TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x1x1_nopad_random) {
 615     //  Filter : 2x2
 616     //  Stride : 2x2
 617     //  Input  : 4x4
 618     //  Output : 2x2
 619     //
 620     //  Input:
 621     //  rnd  rnd  rnd  rnd
 622     //  rnd  rnd  rnd  rnd
 623     //  rnd  rnd  rnd  rnd
 624     //  rnd  rnd  rnd  rnd
 625     //
 626     //  Filter
 627     //  rnd  rnd
 628     //  rnd  rnd
 629     //
 630     //  Bias
 631     //  rnd
 632     //
 633     //  Output:
 634     //  rnd  rnd
 635     //  rnd  rnd
 636
 637     size_t batch = 1, input_f = 1, input_y = 4, input_x = 4;
 638
 639     VVVVF<float> input_rnd = generate_random_4d<float>(batch, input_f, input_y, input_x, -10, 10);
 640     VF<float> input_rnd_vec = flatten_4d<float>(format::yxfb, input_rnd);
 641     VVVVF<float> filter_rnd = generate_random_4d<float>(1, 1, 2, 2, -10, 10);
 642     VF<float> filter_rnd_vec = flatten_4d<float>(format::bfyx, filter_rnd);
 643     VF<float> bias_rnd = generate_random_1d<float>(1, -10, 10);
 644     VVVVF<float> output_rnd(batch, VVVF<float>(filter_rnd.size()));
 645     for (size_t b = 0; b < output_rnd.size(); ++b) {
 646         for (size_t of = 0; of < filter_rnd.size(); ++of) {
 647             output_rnd[b][of] = reference_convolve<float>(input_rnd[b], filter_rnd[of], 2, 2, bias_rnd[of]);
 648         }
 649     }
 650     VF<float> output_rnd_vec = flatten_4d<float>(format::yxfb, output_rnd);
 651
 652     engine engine;
 653
 654     auto input = memory::allocate(engine, { data_types::f32,  format::yxfb, { 1, 1, 4, 4 } });
 655     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 2, 2 }, 1 } });
 656     auto weights = memory::allocate(engine, { data_types::f32,  format::bfyx, { 1, 1, 2, 2 } });
 657     auto biases = memory::allocate(engine, { data_types::f32,  format::bfyx, { 1, 1, 1, 1 } });
 658
 659     set_values(input, input_rnd_vec);
 660     set_values(weights, filter_rnd_vec);
 661     set_values(biases, bias_rnd);
 662
 663     topology topology(
 664         input_layout("input", input.get_layout()),
 665         data("weights", weights),
 666         data("biases", biases),
 667         convolution("conv", "input", {"weights"}, {"biases"}, {1,1,2,2})
 668     );
 669
 670     network network(engine, topology);
 671     network.set_input_data("input", input);
 672
 673     auto outputs = network.execute();
 674     EXPECT_EQ(outputs.size(), size_t(1));
 675     EXPECT_EQ(outputs.begin()->first, "conv");
 676
 677     auto output_prim = outputs.begin()->second.get_memory();
 678
 679     auto output_ptr = output_prim.pointer<float>();
 680
 681     for (size_t i = 0; i < output_rnd.size(); ++i) {
 682         float x = float_round(output_rnd_vec[i]), y = float_round(output_ptr[i]);
 683         EXPECT_FLOAT_EQ(x, y) << "random seed = " << random_seed << std::endl;
 684     }
 685 }
 686
 687 TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in2x2x1x2_nopad_random) {
 688     //  Filter : 2x2
 689     //  Stride : 2x2
 690     //  Input  : 2x2x1x2
 691     //  Output : 1x1x1x2
 692     //
 693     //  Input:
 694     //  rnd  rnd    rnd  rnd
 695     //  rnd  rnd    rnd  rnd
 696     //
 697     //  Filter:
 698     //  rnd  rnd
 699     //  rnd  rnd
 700     //
 701     //  Bias:
 702     //  rnd
 703     //
 704     //  Output:
 705     //  rnd  rnd
 706
 707     size_t batch = 2, input_f = 1, input_y = 2, input_x = 2;
 708
 709     VVVVF<float> input_rnd = generate_random_4d<float>(batch, input_f, input_y, input_x, -10, 10);
 710     VF<float> input_rnd_vec = flatten_4d<float>(format::yxfb, input_rnd);
 711     VVVVF<float> filter_rnd = generate_random_4d<float>(1, 1, 2, 2, -10, 10);
 712     VF<float> filter_rnd_vec = flatten_4d<float>(format::bfyx, filter_rnd);
 713     VF<float> bias_rnd = generate_random_1d<float>(1, -10, 10);
 714     VVVVF<float> output_rnd(batch, VVVF<float>(filter_rnd.size()));
 715     for (size_t b = 0; b < output_rnd.size(); ++b) {
 716         for (size_t of = 0; of < filter_rnd.size(); ++of) {
 717             output_rnd[b][of] = reference_convolve<float>(input_rnd[b], filter_rnd[of], 2, 2, bias_rnd[of]);
 718         }
 719     }
 720     VF<float> output_rnd_vec = flatten_4d<float>(format::yxfb, output_rnd);
 721
 722     engine engine;
 723
 724     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 1, 2, 2 } });
 725     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 2,{ 1, 1 }, 1 } });
 726     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
 727     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
 728
 729     set_values(input, input_rnd_vec);
 730     set_values(weights, filter_rnd_vec);
 731     set_values(biases, bias_rnd);
 732
 733     topology topology(
 734         input_layout("input", input.get_layout()),
 735         data("weights", weights),
 736         data("biases", biases),
 737         convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 })
 738     );
 739
 740     network network(engine, topology);
 741     network.set_input_data("input", input);
 742
 743     auto outputs = network.execute();
 744     EXPECT_EQ(outputs.size(), size_t(1));
 745     EXPECT_EQ(outputs.begin()->first, "conv");
 746
 747     auto output_prim = outputs.begin()->second.get_memory();
 748
 749     auto output_ptr = output_prim.pointer<float>();
 750
 751     for (size_t i = 0; i < output_rnd.size(); ++i) {
 752         float x = float_round(output_rnd_vec[i]), y = float_round(output_ptr[i]);
 753         EXPECT_FLOAT_EQ(x, y) << "random seed = " << random_seed << std::endl;
 754     }
 755 }
 756
 757 TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x1x1_nopad) {
 758     //  Filter : 2x2
 759     //  Stride : 2x2
 760     //  Input  : 4x4
 761     //  Output : 2x2
 762     //
 763     //  Input:
 764     //  -0.5   1     0.5  2
 765     //   1.5  -0.5   0   -1
 766     //   0.5   0.5  -1    1
 767     //   0.5   2     1.5 -0.5
 768     //
 769     //  Filter
 770     //  -2   0.5
 771     //   3.5 1.5
 772     //
 773     //  Bias
 774     //  2
 775     //
 776     //  Output:
 777     //  8  0.5
 778     //  6  9
 779
 780     engine engine;
 781
 782     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } });
 783     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 2, 2 }, 1 } });
 784     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
 785     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
 786
 787     set_values(input, { -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f });
 788     set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f });
 789     set_values(biases, { 2.0f });
 790
 791     topology topology(
 792         input_layout("input", input.get_layout()),
 793         data("weights", weights),
 794         data("biases", biases),
 795         convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 })
 796     );
 797
 798     network network(engine, topology);
 799     network.set_input_data("input", input);
 800
 801     auto outputs = network.execute();
 802     EXPECT_EQ(outputs.size(), size_t(1));
 803     EXPECT_EQ(outputs.begin()->first, "conv");
 804
 805     auto output_prim = outputs.begin()->second.get_memory();
 806
 807     auto output_ptr = output_prim.pointer<float>();
 808
 809     EXPECT_FLOAT_EQ(8.0f, output_ptr[0]);
 810     EXPECT_FLOAT_EQ(0.5f, output_ptr[1]);
 811     EXPECT_FLOAT_EQ(6.0f, output_ptr[2]);
 812     EXPECT_FLOAT_EQ(9.0f, output_ptr[3]);
 813 }
 814
 815 TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in2x2x1x2_nopad) {
 816     //  Filter : 2x2
 817     //  Stride : 2x2
 818     //  Input  : 2x2x1x2
 819     //  Output : 1x1x1x2
 820     //
 821     //  Input:
 822     //  0.5   1.5    2.3 -0.4
 823     //  2.0  -4.0    1.0  3.0
 824     //
 825     //  Filter:
 826     //  -1.2  1.5
 827     //   0.5 -0.5
 828     //
 829     //  Bias:
 830     //  -1
 831     //
 832     //  Output:
 833     //  3.65 -5.36
 834     engine engine;
 835
 836     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 1, 2, 2 } });
 837     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 2,{ 1, 1 }, 1 } });
 838     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
 839     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
 840
 841     set_values(input, { 0.5f, 2.3f, 1.5f, -0.4f, 2.0f, 1.0f, -4.0f, 3.0f });
 842     set_values(weights, { -1.2f, 1.5f, 0.5f, -0.5f });
 843     set_values(biases, { -1.0f });
 844
 845     topology topology(
 846         input_layout("input", input.get_layout()),
 847         data("weights", weights),
 848         data("biases", biases),
 849         convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 } )
 850     );
 851
 852     network network(engine, topology);
 853     network.set_input_data("input", input);
 854
 855     auto outputs = network.execute();
 856     EXPECT_EQ(outputs.size(), size_t(1));
 857     EXPECT_EQ(outputs.begin()->first, "conv");
 858
 859     auto output_prim = outputs.begin()->second.get_memory();
 860
 861     auto output_ptr = output_prim.pointer<float>();
 862
 863     EXPECT_FLOAT_EQ(3.65f, output_ptr[0]);
 864     EXPECT_FLOAT_EQ(-5.36f, output_ptr[1]);
 865 }
 866
 867 TEST(convolution_f32_fw_gpu, basic_ofm_wsiz2x1x2x1_in1x2x1_nopad) {
 868     //  Filter : 1x2x1x2x1
 869     //  Input  : 1x1x2x1
 870     //  Output : 1x2x1x1
 871     //
 872     //  Input:
 873     //  1.0    2.0
 874     //
 875     // Filter:
 876     //   1.0    2.0  ofm=0
 877     //  -1.0   -2.0  ofm=1
 878     //
 879     //  Bias:
 880     //  0.1 -0.2
 881     //
 882     //  Output:
 883     //   5.1  f=0
 884     //  -5.2  f=1
 885
 886     engine engine;
 887
 888     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 1, 2 } });
 889     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 1, 1 }, 2 } });
 890     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 2 } });
 891     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 1 } });
 892
 893     set_values(input, { 1.0f, 2.0f });
 894     set_values(weights, { 1.0f, 2.0f, -1.0f, -2.0f });
 895     set_values(biases, { 0.1f, -0.2f });
 896
 897     topology topology(
 898         input_layout("input", input.get_layout()),
 899         data("weights", weights),
 900         data("biases", biases),
 901         convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,5,5 })
 902     );
 903
 904     network network(engine, topology);
 905     network.set_input_data("input", input);
 906
 907     auto outputs = network.execute();
 908     EXPECT_EQ(outputs.size(), size_t(1));
 909     EXPECT_EQ(outputs.begin()->first, "conv");
 910
 911     auto output_prim = outputs.begin()->second.get_memory();
 912
 913     auto output_ptr = output_prim.pointer<float>();
 914
 915     EXPECT_FLOAT_EQ(5.1f, output_ptr[0]);
 916     EXPECT_FLOAT_EQ(-5.2f, output_ptr[1]);
 917 }
 918
 919 TEST(convolution_f32_fw_gpu, basic_ofm_wsiz3x2x2x1_in2x2x1_nopad) {
 920     //  Filter : 1x3x2x2x1
 921     //  Input  : 1x2x2x1
 922     //  Output : 1x3x1x1
 923     //
 924     //  Input:
 925     //  1.0    2.0  f=0
 926     //  3.0    4.0  f=1
 927     //
 928     // Filter:
 929     //   1.0    2.0  ifm=0  ofm=0
 930     //   3.0    4.0  ifm=1
 931     //
 932     //   5.0    6.0  ifm=0  ofm=1
 933     //   7.0    8.0  ifm=1
 934     //
 935     //   9.0   10.0  ifm=0  ofm=2
 936     //  11.0   12.0  ifm=1
 937     //  Bias:
 938     //   -5     -6     -7
 939     //
 940     //  Output:
 941     //   25.0  f=0
 942     //   64,0  f=1
 943     //  103.0  f=2
 944
 945     engine engine;
 946
 947     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 1, 2 } });
 948     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 1, 1 }, 3 } });
 949     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 2 } });
 950     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 1 } });
 951
 952     set_values(input, { 1.0f, 3.0f, 2.0f, 4.0f });
 953     set_values(weights, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f });
 954     set_values(biases, { -5.0f, -6.0f, -7.0f });
 955
 956     topology topology(
 957         input_layout("input", input.get_layout()),
 958         data("weights", weights),
 959         data("biases", biases),
 960         convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,5,5 })
 961     );
 962
 963     network network(engine, topology);
 964     network.set_input_data("input", input);
 965
 966     auto outputs = network.execute();
 967     EXPECT_EQ(outputs.size(), size_t(1));
 968     EXPECT_EQ(outputs.begin()->first, "conv");
 969
 970     auto output_prim = outputs.begin()->second.get_memory();
 971
 972     auto output_ptr = output_prim.pointer<float>();
 973
 974     EXPECT_FLOAT_EQ(25.0f, output_ptr[0]);
 975     EXPECT_FLOAT_EQ(64.0f, output_ptr[1]);
 976     EXPECT_FLOAT_EQ(103.0f, output_ptr[2]);
 977 }
 978
 979 TEST(convolution_f32_fw_gpu, basic_wsiz2x2x1x3_wstr2x2_in2x2x1x1_nopad) {
 980     //  Filter : 2x2x1x3
 981     //  Stride : 2x2
 982     //  Input  : 2x2x1x1
 983     //  Output : 1x1x3x1
 984     //
 985     //  Input:
 986     //  -2.3 -0.1
 987     //   3.1  1.9
 988     //
 989     //  Filter:
 990     //  -1.1  1.5       0.1  0.2        2.0  -1.0
 991     //   0.5 -0.5       0.4  0.7        2.5  -1.5
 992     //
 993     //  Bias:
 994     //  0.1 -0.2 0.3
 995     //
 996     //  Output:
 997     //   0.7
 998     //   2.12
 999     //   3.08
1000
1001     engine engine;
1002
1003     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } });
1004     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 1, 1 }, 3 } });
1005     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 1, 2, 2 } });
1006     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 1 } });
1007
1008     set_values(input, { -2.3f, -0.1f, 3.1f, 1.9f });
1009     set_values(weights, { -1.1f, 1.5f, 0.5f, -0.5f, 0.1f, 0.2f, 0.4f, 0.7f, 2.0f, -1.0f, 2.5f, -1.5f });
1010     set_values(biases, { 0.1f, -0.2f, 0.3f });
1011
1012     topology topology(
1013         input_layout("input", input.get_layout()),
1014         data("weights", weights),
1015         data("biases", biases),
1016         convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 })
1017     );
1018
1019     network network(engine, topology);
1020     network.set_input_data("input", input);
1021
1022     auto outputs = network.execute();
1023     EXPECT_EQ(outputs.size(), size_t(1));
1024     EXPECT_EQ(outputs.begin()->first, "conv");
1025
1026     auto output_prim = outputs.begin()->second.get_memory();
1027
1028     auto output_ptr = output_prim.pointer<float>();
1029
1030     EXPECT_TRUE(are_equal(3.08f, output_ptr[0]));
1031     EXPECT_TRUE(are_equal(2.12f, output_ptr[1]));
1032     EXPECT_TRUE(are_equal(0.7f,  output_ptr[2]));
1033 }
1034
1035 TEST(convolution_f32_fw_gpu, wsiz3x3_wstr2x2_in2x2x1x1_zeropad) {
1036     //  Filter  : 3x3
1037     //  Stride  : 2x2
1038     //  Input   : 2x2
1039     //  Output  : 1x1
1040     //  Padding : zero
1041     //
1042     //  Input:
1043     //  -0.5   1.0   padd
1044     //   0.5   2.0   padd
1045     //  padd  padd   padd
1046     //
1047     //  Filter
1048     //  -2    0.5  3.5
1049     //   1.5  4   -5
1050     //   0.5  1.5 -1.5
1051     //
1052     //  Bias
1053     //  2
1054     //
1055     //  Output:
1056     //  12.25
1057     engine engine;
1058
1059     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } });
1060     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 1 } });
1061     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 3 } });
1062     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
1063
1064     set_values(input, { -0.5f, 1.0f, 0.5f, 2.0f });
1065     set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f, 4.0f, -5.0f, 0.5f, 1.5f, -1.5f });
1066     set_values(biases, { 2.0f });
1067
1068     topology topology(
1069         input_layout("input", input.get_layout()),
1070         data("weights", weights),
1071         data("biases", biases),
1072         convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 })
1073     );
1074
1075     network network(engine, topology);
1076     network.set_input_data("input", input);
1077
1078     auto outputs = network.execute();
1079     EXPECT_EQ(outputs.size(), size_t(1));
1080     EXPECT_EQ(outputs.begin()->first, "conv");
1081
1082     auto output_prim = outputs.begin()->second.get_memory();
1083
1084     auto output_ptr = output_prim.pointer<float>();
1085
1086     EXPECT_FLOAT_EQ(12.25f, output_ptr[0]);
1087 }
1088
1089 TEST(convolution_f32_fw_gpu, offsets_wsiz3x3_wstr2x2_in2x2x1x1_zeropad) {
1090     //   Filter       : 3x3
1091     //   Stride       : 2x2
1092     //   Input        : 2x2
1093     //   Input offset : -1x-1
1094     //   Output       : 2x2
1095     //   Output offset: 1x1
1096     //   Padding      : zero
1097     //
1098     //   Input:
1099     //   padd padd  padd
1100     //   padd -0.5   1
1101     //   padd  0.5   2.0
1102     //
1103     //   Filter
1104     //   -2    0.5  3.5
1105     //    1.5  4   -5
1106     //    0.5  1.5 -1.5
1107     //
1108     //   Bias
1109     //   2
1110     //
1111     //   Output:
1112     //   rnd   rnd
1113     //   rnd   2.0
1114     engine engine;
1115
1116     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } });
1117     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 2, 2 }, 1 } });
1118     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 3 } });
1119     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
1120
1121     set_values(input, { -0.5f, 1.0f, 0.5f, 2.0f });
1122     set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f, 4.0f, -5.0f, 0.5f, 1.5f, -1.5f });
1123     set_values(biases, { 2.0f });
1124
1125     topology topology(
1126         input_layout("input", input.get_layout()),
1127         data("weights", weights),
1128         data("biases", biases),
1129         convolution(
1130             "conv",
1131             "input",
1132             { "weights" },
1133             { "biases" },
1134             { 1,1,2,2 },
1135             { 0,0,-1,-1 },
1136             { 1, 1, 1, 1 },
1137             false,
1138             0,
1139             padding{ { 0,0,1,1 }, 0 })
1140     );
1141
1142     network network(engine, topology);
1143     network.set_input_data("input", input);
1144
1145     auto outputs = network.execute();
1146     EXPECT_EQ(outputs.size(), size_t(1));
1147     EXPECT_EQ(outputs.begin()->first, "conv");
1148
1149     auto output_prim = outputs.begin()->second.get_memory();
1150
1151     auto output_ptr = output_prim.pointer<float>();
1152
1153     EXPECT_FLOAT_EQ(-7.25f, output_ptr[4]);
1154 }
1155
1156 TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_split2) {
1157     //  Filter : 2x2
1158     //  Stride : 2x2
1159     //  Input  : 4x4x2
1160     //  Output : 2x2x2
1161     //
1162     //  Input:
1163     //  f0: -0.5   1     0.5  2
1164     //       1.5  -0.5   0   -1
1165     //       0.5   0.5  -1    1
1166     //       0.5   2     1.5 -0.5
1167     //
1168     //  f1:  0.5   1.5   2.3 -0.4
1169     //       2.0  -4.0   1.0  3.0
1170     //       0.5   1.5   2.3 -0.4
1171     //       2.0  -4.0   1.0  3.0
1172     //
1173     //  Filter1:
1174     //  -2   0.5
1175     //   3.5 1.5
1176     //
1177     //  Bias1:
1178     //  2
1179     //
1180     //  Filter2:
1181     //  -1.2  1.5
1182     //   0.5 -0.5
1183     //
1184     //  Bias2:
1185     //  -1
1186
1187     //  Output:
1188     //   8  3.65 0.5 -5.36
1189     //   6  3.65 9   -5.36
1190
1191     engine engine;
1192
1193     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 4, 4 } });
1194     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 2, 2 }, 2 } });
1195     auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
1196     auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
1197     auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
1198     auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
1199
1200     set_values(input, {
1201         -0.5f,  0.5f,  1.0f,  1.5f,  0.5f,  2.3f,  2.0f, -0.4f,
1202         1.5f,  2.0f, -0.5f, -4.0f,  0.0f,  1.0f, -1.0f,  3.0f,
1203         0.5f,  0.5f,  0.5f,  1.5f, -1.0f,  2.3f,  1.0f, -0.4f,
1204         0.5f,  2.0f,  2.0f, -4.0f,  1.5f,  1.0f, -0.5f,  3.0f
1205     });
1206     set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f });
1207     set_values(biases1, { 2.0f });
1208     set_values(weights2, { -1.2f, 1.5f, 0.5f, -0.5f });
1209     set_values(biases2, { -1.0f });
1210
1211     topology topology(
1212         input_layout("input", input.get_layout()),
1213         data("weights1", weights1),
1214         data("biases1", biases1),
1215         data("weights2", weights2),
1216         data("biases2", biases2),
1217         convolution(
1218             "conv",
1219             "input",
1220             { "weights1", "weights2" },
1221             { "biases1", "biases2" },
1222             { 0,0,2,2 },
1223             { 0,0,0,0 },
1224             { 1,1,1,1 })
1225     );
1226
1227     network network(engine, topology);
1228     network.set_input_data("input", input);
1229
1230     auto outputs = network.execute();
1231     EXPECT_EQ(outputs.size(), size_t(1));
1232     EXPECT_EQ(outputs.begin()->first, "conv");
1233
1234     auto output_prim = outputs.begin()->second.get_memory();
1235
1236     auto output_ptr = output_prim.pointer<float>();
1237
1238     EXPECT_FLOAT_EQ(8.0f,   get_value<float>(output_ptr, 0));
1239     EXPECT_FLOAT_EQ(3.65f,  get_value<float>(output_ptr, 1));
1240     EXPECT_FLOAT_EQ(0.5f,   get_value<float>(output_ptr, 2));
1241     EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 3));
1242     EXPECT_FLOAT_EQ(6.0f,   get_value<float>(output_ptr, 4));
1243     EXPECT_FLOAT_EQ(3.65f,  get_value<float>(output_ptr, 5));
1244     EXPECT_FLOAT_EQ(9.0f,   get_value<float>(output_ptr, 6));
1245     EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 7));
1246 }
1247
1248 TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2) {
1249     //  2x Filter : 2x2
1250     //  Stride : 2x2
1251     //  Input  : 2x4x4x2
1252     //  Output : 2x2x2x2
1253     //
1254     //  Input:
1255     //  f0b0: -0.5   1     0.5  2
1256     //         1.5  -0.5   0   -1
1257     //         0.5   0.5  -1    1
1258     //         0.5   2     1.5 -0.5
1259     //
1260     //  f0b1: -0.5   1     0.5  2
1261     //         1.5  -0.5   0   -1
1262     //         0.5   0.5  -1    1
1263     //         0.5   2     1.5 -0.5
1264     //
1265     //  f1b0:  0.5   1.5   2.3 -0.4
1266     //         2.0  -4.0   1.0  3.0
1267     //         0.5   1.5   2.3 -0.4
1268     //         2.0  -4.0   1.0  3.0
1269     //
1270     //  f1b1:  0.5   1.5   2.3 -0.4
1271     //         2.0  -4.0   1.0  3.0
1272     //         0.5   1.5   2.3 -0.4
1273     //         2.0  -4.0   1.0  3.0
1274     //
1275     //
1276     //  Filter1:
1277     //  -2   0.5
1278     //   3.5 1.5
1279     //
1280     //  Bias1:
1281     //  2
1282     //
1283     //  Filter2:
1284     //  -1.2  1.5
1285     //   0.5 -0.5
1286     //
1287     //  Bias2:
1288     //  -1
1289
1290     //  Output:
1291     //   8  8 3.65 3.65 0.5  0.5 -5.36 -5.36
1292     //   6  6 3.65 3.65 9    9   -5.36 -5.36
1293
1294     engine engine;
1295
1296     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 4, 4 } });
1297     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 2,{ 2, 2 }, 2 } });
1298     auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
1299     auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
1300     auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
1301     auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
1302
1303     set_values(input, {
1304        -0.5f, -0.5f,  0.5f,  0.5f,  1.0f,  1.0f,  1.5f,  1.5f,  0.5f,  0.5f,  2.3f,  2.3f,  2.0f,  2.0f, -0.4f, -0.4f,
1305         1.5f,  1.5f,  2.0f,  2.0f, -0.5f, -0.5f, -4.0f, -4.0f,  0.0f,  0.0f,  1.0f,  1.0f, -1.0f, -1.0f,  3.0f,  3.0f,
1306         0.5f,  0.5f,  0.5f,  0.5f,  0.5f,  0.5f,  1.5f,  1.5f, -1.0f, -1.0f,  2.3f,  2.3f,  1.0f,  1.0f, -0.4f, -0.4f,
1307         0.5f,  0.5f,  2.0f,  2.0f,  2.0f,  2.0f, -4.0f, -4.0f,  1.5f,  1.5f,  1.0f,  1.0f, -0.5f, -0.5f,  3.0f,  3.0f,
1308     });
1309     set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f });
1310     set_values(biases1, { 2.0f });
1311     set_values(weights2, { -1.2f, 1.5f, 0.5f, -0.5f });
1312     set_values(biases2, { -1.0f });
1313
1314     topology topology(
1315         input_layout("input", input.get_layout()),
1316         data("weights1", weights1),
1317         data("biases1", biases1),
1318         data("weights2", weights2),
1319         data("biases2", biases2),
1320         convolution(
1321             "conv",
1322             "input",
1323             { "weights1", "weights2" },
1324             { "biases1", "biases2" },
1325             { 1,1,2,2 },
1326             { 0,0,0,0 },
1327             { 1,1,1,1 })
1328     );
1329
1330     network network(engine, topology);
1331     network.set_input_data("input", input);
1332
1333     auto outputs = network.execute();
1334     EXPECT_EQ(outputs.size(), size_t(1));
1335     EXPECT_EQ(outputs.begin()->first, "conv");
1336
1337     auto output_prim = outputs.begin()->second.get_memory();
1338
1339     auto output_ptr = output_prim.pointer<float>();
1340
1341     EXPECT_FLOAT_EQ(8.0f,   get_value<float>(output_ptr, 0));
1342     EXPECT_FLOAT_EQ(8.0f,   get_value<float>(output_ptr, 1));
1343     EXPECT_FLOAT_EQ(3.65f,  get_value<float>(output_ptr, 2));
1344     EXPECT_FLOAT_EQ(3.65f,  get_value<float>(output_ptr, 3));
1345     EXPECT_FLOAT_EQ(0.5f,   get_value<float>(output_ptr, 4));
1346     EXPECT_FLOAT_EQ(0.5f,   get_value<float>(output_ptr, 5));
1347     EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 6));
1348     EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 7));
1349     EXPECT_FLOAT_EQ(6.0f,   get_value<float>(output_ptr, 8));
1350     EXPECT_FLOAT_EQ(6.0f,   get_value<float>(output_ptr, 9));
1351     EXPECT_FLOAT_EQ(3.65f,  get_value<float>(output_ptr, 10));
1352     EXPECT_FLOAT_EQ(3.65f,  get_value<float>(output_ptr, 11));
1353     EXPECT_FLOAT_EQ(9.0f,   get_value<float>(output_ptr, 12));
1354     EXPECT_FLOAT_EQ(9.0f,   get_value<float>(output_ptr, 13));
1355     EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 14));
1356     EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 15));
1357 }
1358
1359 TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt) {
1360     //  Test for depthwise separable optimization, there are 16 weights and biases (split 16)
1361     //  data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2 but with batch 1
1362
1363     engine engine;
1364
1365     auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 16, 4, 4 } });
1366
1367     set_values(input, {
1368         -0.5f, -0.5f,  0.5f,  0.5f, -0.5f, -0.5f,  0.5f,  0.5f, -0.5f, -0.5f,  0.5f,  0.5f, -0.5f, -0.5f,  0.5f,  0.5f, -0.5f, -0.5f,  0.5f,  0.5f, -0.5f, -0.5f,  0.5f,  0.5f, -0.5f, -0.5f,  0.5f,  0.5f, -0.5f, -0.5f,  0.5f,  0.5f,
1369         1.0f,  1.0f,  1.5f,  1.5f, 1.0f,  1.0f,  1.5f,  1.5f, 1.0f,  1.0f,  1.5f,  1.5f, 1.0f,  1.0f,  1.5f,  1.5f, 1.0f,  1.0f,  1.5f,  1.5f, 1.0f,  1.0f,  1.5f,  1.5f, 1.0f,  1.0f,  1.5f,  1.5f, 1.0f,  1.0f,  1.5f,  1.5f,
1370         0.5f,  0.5f,  2.3f,  2.3f, 0.5f,  0.5f,  2.3f,  2.3f, 0.5f,  0.5f,  2.3f,  2.3f, 0.5f,  0.5f,  2.3f,  2.3f, 0.5f,  0.5f,  2.3f,  2.3f, 0.5f,  0.5f,  2.3f,  2.3f, 0.5f,  0.5f,  2.3f,  2.3f, 0.5f,  0.5f,  2.3f,  2.3f,
1371         2.0f,  2.0f, -0.4f, -0.4f, 2.0f,  2.0f, -0.4f, -0.4f, 2.0f,  2.0f, -0.4f, -0.4f, 2.0f,  2.0f, -0.4f, -0.4f, 2.0f,  2.0f, -0.4f, -0.4f, 2.0f,  2.0f, -0.4f, -0.4f, 2.0f,  2.0f, -0.4f, -0.4f, 2.0f,  2.0f, -0.4f, -0.4f,
1372         1.5f,  1.5f,  2.0f,  2.0f, 1.5f,  1.5f,  2.0f,  2.0f, 1.5f,  1.5f,  2.0f,  2.0f, 1.5f,  1.5f,  2.0f,  2.0f, 1.5f,  1.5f,  2.0f,  2.0f, 1.5f,  1.5f,  2.0f,  2.0f, 1.5f,  1.5f,  2.0f,  2.0f, 1.5f,  1.5f,  2.0f,  2.0f,
1373         -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f,
1374         0.0f,  0.0f,  1.0f,  1.0f, 0.0f,  0.0f,  1.0f,  1.0f, 0.0f,  0.0f,  1.0f,  1.0f, 0.0f,  0.0f,  1.0f,  1.0f, 0.0f,  0.0f,  1.0f,  1.0f, 0.0f,  0.0f,  1.0f,  1.0f, 0.0f,  0.0f,  1.0f,  1.0f, 0.0f,  0.0f,  1.0f,  1.0f,
1375         -1.0f, -1.0f,  3.0f,  3.0f, -1.0f, -1.0f,  3.0f,  3.0f, -1.0f, -1.0f,  3.0f,  3.0f, -1.0f, -1.0f,  3.0f,  3.0f, -1.0f, -1.0f,  3.0f,  3.0f, -1.0f, -1.0f,  3.0f,  3.0f, -1.0f, -1.0f,  3.0f,  3.0f, -1.0f, -1.0f,  3.0f,  3.0f,
1376         0.5f,  0.5f,  0.5f,  0.5f, 0.5f,  0.5f,  0.5f,  0.5f, 0.5f,  0.5f,  0.5f,  0.5f, 0.5f,  0.5f,  0.5f,  0.5f, 0.5f,  0.5f,  0.5f,  0.5f, 0.5f,  0.5f,  0.5f,  0.5f, 0.5f,  0.5f,  0.5f,  0.5f, 0.5f,  0.5f,  0.5f,  0.5f,
1377         0.5f,  0.5f,  1.5f,  1.5f, 0.5f,  0.5f,  1.5f,  1.5f, 0.5f,  0.5f,  1.5f,  1.5f, 0.5f,  0.5f,  1.5f,  1.5f, 0.5f,  0.5f,  1.5f,  1.5f, 0.5f,  0.5f,  1.5f,  1.5f, 0.5f,  0.5f,  1.5f,  1.5f, 0.5f,  0.5f,  1.5f,  1.5f,
1378         -1.0f, -1.0f,  2.3f,  2.3f, -1.0f, -1.0f,  2.3f,  2.3f, -1.0f, -1.0f,  2.3f,  2.3f, -1.0f, -1.0f,  2.3f,  2.3f, -1.0f, -1.0f,  2.3f,  2.3f, -1.0f, -1.0f,  2.3f,  2.3f, -1.0f, -1.0f,  2.3f,  2.3f, -1.0f, -1.0f,  2.3f,  2.3f,
1379         1.0f,  1.0f, -0.4f, -0.4f, 1.0f,  1.0f, -0.4f, -0.4f, 1.0f,  1.0f, -0.4f, -0.4f, 1.0f,  1.0f, -0.4f, -0.4f, 1.0f,  1.0f, -0.4f, -0.4f, 1.0f,  1.0f, -0.4f, -0.4f, 1.0f,  1.0f, -0.4f, -0.4f, 1.0f,  1.0f, -0.4f, -0.4f,
1380         0.5f,  0.5f,  2.0f,  2.0f, 0.5f,  0.5f,  2.0f,  2.0f, 0.5f,  0.5f,  2.0f,  2.0f, 0.5f,  0.5f,  2.0f,  2.0f, 0.5f,  0.5f,  2.0f,  2.0f, 0.5f,  0.5f,  2.0f,  2.0f, 0.5f,  0.5f,  2.0f,  2.0f, 0.5f,  0.5f,  2.0f,  2.0f,
1381         2.0f,  2.0f, -4.0f, -4.0f, 2.0f,  2.0f, -4.0f, -4.0f, 2.0f,  2.0f, -4.0f, -4.0f, 2.0f,  2.0f, -4.0f, -4.0f, 2.0f,  2.0f, -4.0f, -4.0f, 2.0f,  2.0f, -4.0f, -4.0f, 2.0f,  2.0f, -4.0f, -4.0f, 2.0f,  2.0f, -4.0f, -4.0f,
1382         1.5f,  1.5f,  1.0f,  1.0f, 1.5f,  1.5f,  1.0f,  1.0f, 1.5f,  1.5f,  1.0f,  1.0f, 1.5f,  1.5f,  1.0f,  1.0f, 1.5f,  1.5f,  1.0f,  1.0f, 1.5f,  1.5f,  1.0f,  1.0f, 1.5f,  1.5f,  1.0f,  1.0f, 1.5f,  1.5f,  1.0f,  1.0f,
1383         -0.5f, -0.5f,  3.0f,  3.0f, -0.5f, -0.5f,  3.0f,  3.0f, -0.5f, -0.5f,  3.0f,  3.0f, -0.5f, -0.5f,  3.0f,  3.0f, -0.5f, -0.5f,  3.0f,  3.0f, -0.5f, -0.5f,  3.0f,  3.0f, -0.5f, -0.5f,  3.0f,  3.0f, -0.5f, -0.5f,  3.0f,  3.0f,
1384     });
1385
1386     topology topology(input_layout("input", input.get_layout()));
1387
1388     std::vector<primitive_id> weights_vec;
1389     std::vector<primitive_id> bias_vec;
1390
1391     for (uint32_t i = 0; i < 8; i++)
1392     {
1393         auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } });
1394         auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
1395         auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } });
1396         auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
1397
1398         set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f });
1399         set_values(biases1, { 2.0f });
1400         set_values(weights2, { -1.2f, 1.5f, 0.5f, -0.5f });
1401         set_values(biases2, { -1.0f });
1402
1403         primitive_id weights_id = "weights_" + std::to_string(i);
1404         primitive_id weights2_id = "weights2_" + std::to_string(i);
1405         primitive_id bias_id = "biases_" + std::to_string(i);
1406         primitive_id bias2_id = "biases2_" + std::to_string(i);
1407
1408         weights_vec.push_back(weights_id);
1409         weights_vec.push_back(weights2_id);
1410         bias_vec.push_back(bias_id);
1411         bias_vec.push_back(bias2_id);
1412
1413         topology.add(
1414             data(weights_id, weights1),
1415             data(bias_id, biases1),
1416             data(weights2_id, weights2),
1417             data(bias2_id, biases2)
1418         );
1419
1420     }
1421
1422     topology.add(
1423         convolution(
1424             "conv",
1425             "input",
1426             weights_vec,
1427             bias_vec,
1428             { 1,1,2,2 },
1429             { 0,0,0,0 },
1430             { 1,1,1,1 })
1431     );
1432
1433     network network(engine, topology);
1434     network.set_input_data("input", input);
1435
1436     auto outputs = network.execute();
1437     EXPECT_EQ(outputs.size(), size_t(1));
1438     EXPECT_EQ(outputs.begin()->first, "conv");
1439
1440     auto output_prim = outputs.begin()->second.get_memory();
1441
1442     auto output_ptr = output_prim.pointer<float>();
1443
1444     std::vector<float> expected_output_vec = {
1445         8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f,
1446         0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f,
1447         6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f,
1448         9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f,
1449     };
1450
1451     for (unsigned int i = 0; i < expected_output_vec.size(); i++)
1452     {
1453         EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]);
1454     }
1455 }
1456
1457 TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt_bfyx) {
1458     //  Test for depthwise separable optimization, there are 16 weights and biases (split 16)
1459     //  data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2 but with batch 1
1460     engine engine;
1461
1462     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 16, 4, 4 } });
1463
1464     set_values(input, {
1465         -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
1466         0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
1467         -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
1468         0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
1469         -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
1470         0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
1471         -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
1472         0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
1473         -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
1474         0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
1475         -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
1476         0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
1477         -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
1478         0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
1479         -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
1480         0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
1481     });
1482
1483     topology topology(input_layout("input", input.get_layout()));
1484
1485     std::vector<primitive_id> weights_vec;
1486     std::vector<primitive_id> bias_vec;
1487
1488     for (uint32_t i = 0; i < 8; i++)
1489     {
1490         auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } });
1491         auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
1492         auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } });
1493         auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
1494
1495         set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f });
1496         set_values(biases1, { 2.0f });
1497         set_values(weights2, { -1.2f, 1.5f, 0.5f, -0.5f });
1498         set_values(biases2, { -1.0f });
1499
1500         primitive_id weights_id = "weights_" + std::to_string(i);
1501         primitive_id weights2_id = "weights2_" + std::to_string(i);
1502         primitive_id bias_id = "biases_" + std::to_string(i);
1503         primitive_id bias2_id = "biases2_" + std::to_string(i);
1504
1505         weights_vec.push_back(weights_id);
1506         weights_vec.push_back(weights2_id);
1507         bias_vec.push_back(bias_id);
1508         bias_vec.push_back(bias2_id);
1509
1510         topology.add(
1511             data(weights_id, weights1),
1512             data(bias_id, biases1),
1513             data(weights2_id, weights2),
1514             data(bias2_id, biases2)
1515         );
1516
1517     }
1518
1519     topology.add(
1520         convolution(
1521             "conv",
1522             "input",
1523             weights_vec,
1524             bias_vec,
1525             { 1,1,2,2 },
1526             { 0,0,0,0 },
1527             { 1,1,1,1 })
1528     );
1529
1530     network network(engine, topology);
1531     network.set_input_data("input", input);
1532
1533     auto outputs = network.execute();
1534     EXPECT_EQ(outputs.size(), size_t(1));
1535     EXPECT_EQ(outputs.begin()->first, "conv");
1536
1537     auto output_prim = outputs.begin()->second.get_memory();
1538
1539     auto output_ptr = output_prim.pointer<float>();
1540
1541     std::vector<float> expected_output_vec = {
1542         8.0f, 0.5f,  6.0f,  9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
1543         8.0f, 0.5f,  6.0f,  9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
1544         8.0f, 0.5f,  6.0f,  9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
1545         8.0f, 0.5f,  6.0f,  9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
1546         8.0f, 0.5f,  6.0f,  9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
1547         8.0f, 0.5f,  6.0f,  9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
1548         8.0f, 0.5f,  6.0f,  9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
1549         8.0f, 0.5f,  6.0f,  9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
1550     };
1551
1552     for (unsigned int i = 0; i < expected_output_vec.size(); i++)
1553     {
1554         EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]);
1555     }
1556 }
1557
1558 TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x4x1_nopad_split2) {
1559     //  Filter : 1x1
1560     //  Stride : 2x2
1561     //  Input  : 1x1x4
1562     //  Output : 1x1x4
1563     //
1564     //  Input:
1565     //  f0:  1.5
1566     //  f1:  0.5
1567     //
1568     //  f2:  0.0
1569     //  f3: -0.5
1570     //
1571     //
1572     //  Filter1:
1573     //  -2 -0.5  ofm=0
1574     //   1  2    ofm=1
1575     //  Bias1:
1576     //   1  5
1577     //
1578     //  Filter2:
1579     //   4  1.5  ofm=0
1580     //   2  0.5  ofm=1
1581     //
1582     //  Bias2:
1583     //  -1  2.5
1584     //
1585     //  Output:
1586     //  -2.25
1587     //   7.5
1588     //
1589     //  -1.75
1590     //   2.25
1591
1592     engine engine;
1593
1594     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 4, 1, 1 } });
1595     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 4 } });
1596     auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } });
1597     auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 1 } });
1598     auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } });
1599     auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 1 } });
1600
1601     set_values(input, {
1602        1.5f, 0.5f, 0.0f, -0.5f
1603     });
1604     set_values(weights1, { -2.0f, -0.5f, 1.0f, 2.0f });
1605     set_values(biases1, { 1.0f, 5.0f });
1606     set_values(weights2, { 4.0f, 1.5f, 2.0f, 0.5f });
1607     set_values(biases2, { -1.0f, 2.5f });
1608
1609     topology topology(
1610         input_layout("input", input.get_layout()),
1611         data("weights1", weights1),
1612         data("biases1", biases1),
1613         data("weights2", weights2),
1614         data("biases2", biases2),
1615         convolution(
1616             "conv",
1617             "input",
1618             { "weights1", "weights2" },
1619             { "biases1", "biases2" },
1620             { 1,1,2,2 },
1621             { 0,0,0,0 },
1622             { 1,1,1,1 })
1623     );
1624
1625     network network(engine, topology);
1626     network.set_input_data("input", input);
1627
1628     auto outputs = network.execute();
1629     EXPECT_EQ(outputs.size(), size_t(1));
1630     EXPECT_EQ(outputs.begin()->first, "conv");
1631
1632     auto output_prim = outputs.begin()->second.get_memory();
1633
1634     auto output_ptr = output_prim.pointer<float>();
1635
1636     EXPECT_FLOAT_EQ(-2.25f, get_value<float>(output_ptr, 0));
1637     EXPECT_FLOAT_EQ(7.5f, get_value<float>(output_ptr, 1));
1638     EXPECT_FLOAT_EQ(-1.75f, get_value<float>(output_ptr, 2));
1639     EXPECT_FLOAT_EQ(2.25f, get_value<float>(output_ptr, 3));
1640 }
1641
1642 TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x2x1_nopad_split2) {
1643     //  Filter : 1x1
1644     //  Stride : 2x2
1645     //  Input  : 1x1x2
1646     //  Output : 1x1x4
1647     //
1648     //  Input:
1649     //  f0:  1.5
1650     //
1651     //  f1:  0.5
1652     //
1653     //  Filter1:
1654     //  -2  ofm=0
1655     //   1  ofm=1
1656     //  Bias1:
1657     //   1  5
1658     //
1659     //  Filter2:
1660     //   4  ofm=0
1661     //   2  ofm=1
1662     //
1663     //  Bias2:
1664     //  -1  2.5
1665     //
1666     //  Output:
1667     //  -2
1668     //   6.5
1669     //
1670     //   1
1671     //   3.5
1672
1673
1674     engine engine;
1675
1676     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 1, 1 } });
1677     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 4 } });
1678     auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } });
1679     auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 1 } });
1680     auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } });
1681     auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 1 } });
1682
1683     set_values(input, {
1684         1.5f, 0.5f
1685     });
1686     set_values(weights1, { -2.0f, 1.0f });
1687     set_values(biases1, { 1.0f, 5.0f });
1688     set_values(weights2, { 4.0f, 2.0f });
1689     set_values(biases2, { -1.0f, 2.5f });
1690
1691     topology topology(
1692         input_layout("input", input.get_layout()),
1693         data("weights1", weights1),
1694         data("biases1", biases1),
1695         data("weights2", weights2),
1696         data("biases2", biases2),
1697         convolution(
1698             "conv",
1699             "input",
1700             { "weights1", "weights2" },
1701             { "biases1", "biases2" },
1702             { 1,1,2,2 },
1703             { 0,0,0,0 },
1704             { 1,1,1,1 })
1705     );
1706
1707     network network(engine, topology);
1708     network.set_input_data("input", input);
1709
1710     auto outputs = network.execute();
1711     EXPECT_EQ(outputs.size(), size_t(1));
1712     EXPECT_EQ(outputs.begin()->first, "conv");
1713
1714     auto output_prim = outputs.begin()->second.get_memory();
1715
1716     auto output_ptr = output_prim.pointer<float>();
1717
1718     EXPECT_FLOAT_EQ(-2.0f, get_value<float>(output_ptr, 0));
1719     EXPECT_FLOAT_EQ(6.5f, get_value<float>(output_ptr, 1));
1720     EXPECT_FLOAT_EQ(1.0f, get_value<float>(output_ptr, 2));
1721     EXPECT_FLOAT_EQ(3.5f, get_value<float>(output_ptr, 3));
1722 }
1723
1724 TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x4x1_filter_1x3x2x1x1_nopad_split2) {
1725     //  Filter : 1x1
1726     //  Stride : 2x2
1727     //  Input  : 1x1x4
1728     //  Output : 1x1x6
1729     //
1730     //  Input:
1731     //  f0:  1.5
1732     //  f1:  0.5
1733     //
1734     //  f2:  2
1735     //  f3: -1.0
1736     //
1737     //  Filter1:
1738     //  -2   1   ofm=0
1739     //   1   3   ofm=1
1740     //   0.5 8   ofm=2
1741     //  Bias1:
1742     //   1   5   3
1743     //
1744     //  Filter2:
1745     //   4  -4   ofm=0
1746     //   2   0.5 ofm=1
1747     //  -0.5 3   ofm=2
1748     //
1749     //  Bias2:
1750     //  -1   2.5 2
1751     //
1752     //  Output:
1753     //  -1.5
1754     //   8
1755     //   7.75
1756     //
1757     //   11
1758     //   6
1759     //  -2
1760
1761
1762     engine engine;
1763
1764     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 4, 1, 1 } });
1765     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 6 } });
1766     auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 1 } });
1767     auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 1 } });
1768     auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 1 } });
1769     auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 1 } });
1770
1771     set_values(input, {
1772         1.5f, 0.5f, 2.0f, -1.0f
1773     });
1774     set_values(weights1, { -2.0f, 1.0f, 1.0f, 3.0f, 0.5f, 8.0f });
1775     set_values(biases1, { 1.0f, 5.0f, 3.0f });
1776     set_values(weights2, { 4.0f, -4.0f, 2.0f, 0.5f, -0.5f, 3.0f });
1777     set_values(biases2, { -1.0f, 2.5f, 2.0f });
1778
1779     topology topology(
1780         input_layout("input", input.get_layout()),
1781         data("weights1", weights1),
1782         data("biases1", biases1),
1783         data("weights2", weights2),
1784         data("biases2", biases2),
1785         convolution(
1786             "conv",
1787             "input",
1788             { "weights1", "weights2" },
1789             { "biases1", "biases2" },
1790             { 1,1,2,2 },
1791             { 0,0,0,0 },
1792             { 1,1,1,1 })
1793     );
1794
1795     network network(engine, topology);
1796     network.set_input_data("input", input);
1797
1798     auto outputs = network.execute();
1799     EXPECT_EQ(outputs.size(), size_t(1));
1800     EXPECT_EQ(outputs.begin()->first, "conv");
1801
1802     auto output_prim = outputs.begin()->second.get_memory();
1803
1804     auto output_ptr = output_prim.pointer<float>();
1805
1806     EXPECT_FLOAT_EQ(-1.5f, get_value<float>(output_ptr, 0));
1807     EXPECT_FLOAT_EQ(8.0f, get_value<float>(output_ptr, 1));
1808     EXPECT_FLOAT_EQ(7.75f, get_value<float>(output_ptr, 2));
1809     EXPECT_FLOAT_EQ(11.0f, get_value<float>(output_ptr, 3));
1810     EXPECT_FLOAT_EQ(6.0f, get_value<float>(output_ptr, 4));
1811     EXPECT_FLOAT_EQ(-2.0f, get_value<float>(output_ptr, 5));
1812
1813 }
1814
1815 TEST(convolution_gpu, trivial_convolution_relu) {
1816
1817     //  Filter : 2x2
1818     //  Stride : 2x2
1819     //  Input  : 4x4
1820     //  Output : 2x2
1821
1822     //  Input:
1823     //  -0.5   1     0.5  2
1824     //   1.5  -0.5   0   -1
1825     //   0.5   0.5  -1    1
1826     //   0.5   2     1.5 -0.5
1827     //
1828     //  Filter
1829     //  -2   0.5
1830     //   3.5 1.5
1831     //
1832     //  Bias
1833     //  -2
1834     //
1835     //  Output:
1836     //  4  0.0
1837     //  2  5
1838
1839     engine engine;
1840
1841     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } });
1842     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 2, 2 }, 1 } });
1843     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
1844     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
1845
1846     set_values(input, {
1847         -0.5f,  1.0f,  0.5f,  2.0f,
1848         1.5f, -0.5f,  0.0f, -1.0f,
1849         0.5f,  0.5f, -1.0f,  1.0f,
1850         0.5f,  2.0f,  1.5f, -0.5f
1851     });
1852     set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f });
1853     set_values(biases, { -2.0f });
1854
1855     topology topology(
1856         input_layout("input", input.get_layout()),
1857         data("weights", weights),
1858         data("biases", biases),
1859         convolution(
1860             "conv",
1861             "input",
1862             { "weights" },
1863             { "biases" },
1864             { 1,1,2,2 },
1865             { 0,0,0,0 },
1866             { 1, 1, 1, 1 },
1867             true,
1868             0)
1869     );
1870
1871     network network(engine, topology);
1872     network.set_input_data("input", input);
1873
1874     auto outputs = network.execute();
1875     EXPECT_EQ(outputs.size(), size_t(1));
1876     EXPECT_EQ(outputs.begin()->first, "conv");
1877
1878     auto output_prim = outputs.begin()->second.get_memory();
1879
1880     auto output_ptr = output_prim.pointer<float>();
1881
1882     EXPECT_FLOAT_EQ(4.0f, get_value<float>(output_ptr, 0));
1883     EXPECT_FLOAT_EQ(0.0f, get_value<float>(output_ptr, 1));
1884     EXPECT_FLOAT_EQ(2.0f, get_value<float>(output_ptr, 2));
1885     EXPECT_FLOAT_EQ(5.0f, get_value<float>(output_ptr, 3));
1886 }
1887
1888 TEST(convolution_gpu, relu_with_negative_slope) {
1889
1890     //  Filter : 2x2
1891     //  Stride : 2x2
1892     //  Input  : 4x4
1893     //  Output : 2x2
1894     //  Negative Slope : 0.1
1895
1896     //  Input:
1897     //  -0.5   1     0.5  2
1898     //   1.5  -0.5   0   -1
1899     //   0.5   0.5  -1    1
1900     //   0.5   2     1.5 -0.5
1901     //
1902     //  Filter
1903     //  -2   0.5
1904     //   3.5 1.5
1905     //
1906     //  Bias
1907     //  -2
1908     //
1909     //  Output:
1910     //  4  -0.35
1911     //  2  5
1912
1913     engine engine;
1914
1915     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } });
1916     //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 2, 2 }, 1 } });
1917     auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
1918     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
1919
1920     set_values(input, {
1921         -0.5f,  1.0f,  0.5f,  2.0f,
1922         1.5f, -0.5f,  0.0f, -1.0f,
1923         0.5f,  0.5f, -1.0f,  1.0f,
1924         0.5f,  2.0f,  1.5f, -0.5f
1925     });
1926     set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f });
1927     set_values(biases, { -2.0f });
1928
1929     topology topology(
1930         input_layout("input", input.get_layout()),
1931         data("weights", weights),
1932         data("biases", biases),
1933         convolution(
1934             "conv",
1935             "input",
1936             { "weights" },
1937             { "biases" },
1938             { 1,1,2,2 },
1939             { 0,0,0,0 },
1940             { 1, 1, 1, 1 },
1941             true,
1942             0.1f)
1943     );
1944
1945     network network(engine, topology);
1946     network.set_input_data("input", input);
1947
1948     auto outputs = network.execute();
1949     EXPECT_EQ(outputs.size(), size_t(1));
1950     EXPECT_EQ(outputs.begin()->first, "conv");
1951
1952     auto output_prim = outputs.begin()->second.get_memory();
1953
1954     auto output_ptr = output_prim.pointer<float>();
1955
1956     EXPECT_FLOAT_EQ(4.0f, get_value<float>(output_ptr, 0));
1957     EXPECT_FLOAT_EQ(-0.35f, get_value<float>(output_ptr, 1));
1958     EXPECT_FLOAT_EQ(2.0f, get_value<float>(output_ptr, 2));
1959     EXPECT_FLOAT_EQ(5.0f, get_value<float>(output_ptr, 3));
1960 }
1961
1962 TEST(convolution_gpu, DISABLED_two_1x1_kernels_after_each_other) {
1963
1964     engine engine;
1965
1966     extern const std::vector<float> conv_1x1_output;
1967
1968     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 16, 8, 16, 16 } });
1969     auto weights_conv_1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 8, 8, 1, 1 } });
1970     auto weights_conv_2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 8, 1, 1 } });
1971
1972     set_random_values<float>(input);
1973     set_random_values<float>(weights_conv_1);
1974     set_random_values<float>(weights_conv_2);
1975
1976     auto inp_lay = input_layout("input", input.get_layout());
1977     auto conv_1 = convolution(
1978         "conv_1",
1979         "input",
1980         { "weights_conv_1" });
1981     auto conv_2 = convolution(
1982         "conv_2",
1983         "conv_1",
1984         { "weights_conv_2" });
1985
1986     topology topology(
1987         inp_lay,
1988         data("weights_conv_1", weights_conv_1),
1989         conv_1,
1990         data("weights_conv_2", weights_conv_2),
1991         conv_2
1992     );
1993
1994     build_options bo;
1995     bo.set_option(build_option::optimize_data(true));
1996     network network(engine, topology, bo);
1997     network.set_input_data("input", input);
1998
1999     auto outputs = network.execute();
2000     EXPECT_EQ(outputs.size(), size_t(1));
2001
2002     auto output_prim = outputs.at("conv_2").get_memory();
2003
2004     auto output_ptr = output_prim.pointer<float>();
2005     auto output_layout = output_prim.get_layout();
2006
2007     int y_size = output_layout.size.spatial[1];
2008     int x_size = output_layout.size.spatial[0];
2009     int f_size = output_layout.size.feature[0];
2010     int b_size = output_layout.size.batch[0];
2011     int f_offset = y_size * x_size;
2012     int b_offset = f_size * f_offset;
2013     for (int b = 0; b < b_size; ++b)
2014     {
2015         for (int f = 0; f < f_size; ++f)
2016         {
2017             for (int y = 0; y < y_size; ++y)
2018             {
2019                 for (int x = 0; x < x_size; ++x)
2020                 {
2021                     int idx = b * b_offset + f * f_offset + y * x_size + x;
2022                     EXPECT_TRUE(are_equal(conv_1x1_output[idx], get_value<float>(output_ptr, idx)));
2023                 }
2024             }
2025         }
2026     }
2027 }
2028
2029 TEST(convolution_gpu, basic_yxfb_4_4_yxfb_2_2_b16_if2_of16_st2_2_p0_sp1_fp32)
2030 {
2031 #define USE_OLD_WEIGHTS_FORMAT 0
2032
2033     const auto input_format   = format::yxfb;
2034 #if USE_OLD_WEIGHTS_FORMAT
2035     const auto weights_format = format::bfyx;
2036 #else
2037     const auto weights_format = format::yxfb;
2038 #endif
2039     const auto biases_format = format::bfyx;
2040
2041     const int32_t batch_size = 16;
2042     const int32_t input_feature_count = 2;
2043     const int32_t output_feature_count = 16;
2044
2045     const int32_t stride_x = 2;
2046     const int32_t stride_y = 2;
2047
2048     const int32_t input_x = 4;
2049     const int32_t input_y = 4;
2050     const int32_t weights_x = 2;
2051     const int32_t weights_y = 2;
2052     const int32_t output_x = (input_x - weights_x) / stride_x + 1;
2053     const int32_t output_y = (input_y - weights_y) / stride_y + 1;
2054
2055     engine engine;
2056
2057     auto input_size = tensor( batch_size, input_feature_count, input_x, input_y );
2058     auto input = memory::allocate(engine, { data_types::f32, input_format, input_size });
2059     auto weights_size = tensor( output_feature_count, input_feature_count, weights_x, weights_y );
2060     auto weights = memory::allocate(engine, { data_types::f32, weights_format, weights_size });
2061     auto biases = memory::allocate(engine, { data_types::f32, biases_format, {1,1,output_feature_count,1}});
2062
2063     //auto output = memory::allocate({output_format, {batch_size, {output_x, output_y}, output_feature_count}});
2064
2065
2066     // input:
2067     std::vector<float> input_vals_template {
2068         0.25f, 0.50f, 0.75f, 1.00f,
2069         1.25f, 1.50f, 1.75f, 2.00f,
2070         2.25f, 2.50f, 2.75f, 3.00f,
2071         3.25f, 3.50f, 3.75f, 4.00f,
2072     };
2073     input_vals_template.resize(input_y * input_x);
2074
2075     std::vector<float> input_vals;
2076     input_vals.reserve(input_y * input_x * input_feature_count * batch_size);
2077     for (uint32_t yxi = 0; yxi < input_y * input_x; ++yxi)
2078     {
2079         for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
2080         {
2081             for (uint32_t bi = 0; bi < batch_size; ++bi)
2082             {
2083                 input_vals.push_back((bi * input_feature_count + ifi + 1) * input_vals_template[yxi]);
2084             }
2085         }
2086     }
2087     set_values(input, input_vals);
2088
2089
2090     // weights:
2091     std::vector<float> weights_vals_template {
2092         -4.0f, -2.0f,
2093          4.0f,  4.0f,
2094     };
2095     weights_vals_template.resize(weights_y * weights_x);
2096
2097     std::vector<float> weights_vals;
2098     weights_vals.reserve(weights_y * weights_x * input_feature_count * output_feature_count);
2099 #if USE_OLD_WEIGHTS_FORMAT
2100     for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2101     {
2102         for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
2103         {
2104             for (uint32_t yxi = 0; yxi < weights_y * weights_x; ++yxi)
2105             {
2106                 weights_vals.push_back((ofi * input_feature_count + ifi + 1) * weights_vals_template[yxi]);
2107             }
2108         }
2109     }
2110 #else
2111     for (uint32_t yxi = 0; yxi < weights_y * weights_x; ++yxi)
2112     {
2113         for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
2114         {
2115             for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2116             {
2117                 weights_vals.push_back((ofi * input_feature_count + ifi + 1) * weights_vals_template[yxi]);
2118             }
2119         }
2120     }
2121 #endif
2122     set_values(weights, weights_vals);
2123
2124
2125     // biases:
2126     std::vector<float> biases_vals;
2127     biases_vals.reserve(output_feature_count);
2128     for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2129     {
2130         biases_vals.push_back(ofi * 1.0f);
2131     }
2132     set_values(biases, biases_vals);
2133
2134
2135     // output:
2136     std::vector<float> output_vals_template {
2137          9.0f, 10.0f,
2138         13.0f, 14.0f,
2139     };
2140     output_vals_template.resize(output_y * output_x);
2141
2142     std::vector<float> output_vals;
2143     output_vals.reserve(output_y * output_x * output_feature_count * batch_size);
2144     for (uint32_t yxi = 0; yxi < output_y * output_x; ++yxi)
2145     {
2146         for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2147         {
2148             for (uint32_t bi = 0; bi < batch_size; ++bi)
2149             {
2150                 uint32_t template_factor = input_feature_count * input_feature_count * input_feature_count * bi * ofi +
2151                     input_feature_count * input_feature_count * (input_feature_count + 1) / 2 * (bi + ofi) +
2152                     input_feature_count * (input_feature_count + 1) * (2 * input_feature_count + 1) / 6;
2153                 float bias_factor = ofi * 1.0f;
2154
2155                 output_vals.push_back(template_factor * output_vals_template[yxi] + bias_factor);
2156             }
2157         }
2158     }
2159
2160     // Computing convolution.
2161     topology topology(
2162         input_layout("input", input.get_layout()),
2163         data("weights", weights),
2164         data("biases", biases),
2165         convolution(
2166             "conv",
2167             "input",
2168             { "weights" },
2169             { "biases" },
2170             { 1,1,stride_x,stride_y },
2171             { 0,0,0,0 },
2172             { 1, 1, 1, 1 },
2173             true,
2174             0.1f)
2175     );
2176
2177     network network(engine, topology);
2178     network.set_input_data("input", input);
2179
2180     auto outputs = network.execute();
2181     EXPECT_EQ(outputs.size(), size_t(1));
2182     EXPECT_EQ(outputs.begin()->first, "conv");
2183
2184     auto output_prim = outputs.begin()->second.get_memory();
2185
2186     auto output_ptr = output_prim.pointer<float>();
2187
2188     // Checking result.
2189     uint32_t i = 0;
2190     for (uint32_t yxi = 0; yxi < output_y * output_x; ++yxi)
2191     {
2192         for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2193         {
2194             for (uint32_t bi = 0; bi < batch_size; ++bi, ++i)
2195             {
2196                 auto equal = are_equal(output_vals[i], get_value<float>(output_ptr, i));
2197                 EXPECT_TRUE(equal);
2198                 if (!equal)
2199                 {
2200                     std::cout << "Failed at position (" << yxi << ", output feature = " << ofi << ", batch = " << bi << "): "
2201                         << output_vals[i] << " != " << get_value<float>(output_ptr, i) << std::endl;
2202                     return;
2203                 }
2204             }
2205         }
2206     }
2207
2208 #undef USE_OLD_WEIGHTS_FORMAT
2209 }
2210
2211 template<typename T>
2212 void quantize_weights(cldnn::memory& weights, cldnn::memory& w_qf)
2213 {
2214     using std::abs;
2215
2216     auto batch_pitch = weights.get_layout().get_pitches().batch[0];
2217     auto ptr = weights.pointer<T>();
2218     auto wqf_ptr = w_qf.pointer<float>();
2219     T max = (T) 0.0f;
2220     for (int ofm = 0; ofm < weights.get_layout().size.batch[0]; ofm++)
2221     {
2222         max = (T) 0.0f;
2223         for (int w = 0; w < batch_pitch; w++)
2224             if (max < abs(ptr[ofm* batch_pitch + w]))
2225                 max = abs(ptr[ofm* batch_pitch + w]);
2226
2227         if (max == (T)0)
2228             max = (T)1; // do not quantize
2229
2230         for (int w = 0; w < batch_pitch; w++)
2231             ptr[ofm* batch_pitch + w] = (T)round((float)ptr[ofm* batch_pitch + w] * 127.0f / (float)max);
2232         wqf_ptr[ofm] = max/127.0f;
2233     }
2234 }
2235 template<typename T>
2236 void calibrate(const cldnn::memory& output, cldnn::memory& calibrations)
2237 {
2238     using std::abs;
2239
2240     auto feature_pitch = output.get_layout().get_pitches().feature[0];
2241     auto ptr = output.pointer<T>();
2242     auto calibrations_ptr = calibrations.pointer<float>();
2243     T max = (T) 0.0f;
2244     for (int ofm = 0; ofm < output.get_layout().size.feature[0]; ofm++)
2245     {
2246         max = (T) 0.0f;
2247         for (int w = 0; w < feature_pitch; w++)
2248             if (max < abs(ptr[ofm* feature_pitch + w]))
2249                 max = abs(ptr[ofm* feature_pitch + w]);
2250         calibrations_ptr[ofm] =  127.0f / max;
2251     }
2252 }
2253
2254 template<typename T>
2255 T max_abs(const cldnn::memory& mem)
2256 {
2257     using std::abs;
2258
2259     T max = (T)0;
2260     auto ptr = mem.pointer<T>();
2261     for (auto& a : ptr)
2262         if (max < abs(a))
2263             max = abs(a);
2264     return max;
2265 }
2266
2267 template<typename T>
2268 void apply_calibration_on_weights(cldnn::memory& weights, cldnn::memory& qf)
2269 {
2270     auto batch_pitch = weights.get_layout().get_pitches().batch[0];
2271     auto ptr = weights.pointer<T>();
2272     auto wqf_ptr = qf.pointer<float>();
2273     tensor w_size = weights.get_layout().size;
2274     int index = 0;
2275     for (int ofm = 0; ofm < w_size.batch[0]; ofm++)
2276         for (int ifm = 0; ifm < w_size.feature[0]; ifm++)
2277             for (int xy = 0; xy < w_size.spatial[0] * w_size.spatial[1]; xy++)
2278             {
2279                 ptr[index] = ptr[index] / wqf_ptr[ifm];
2280                 index++;
2281             }
2282 }
2283
2284 cldnn::memory create_int8_weights(engine engine, cldnn::memory& in_weights)
2285 {
2286     auto layout = in_weights.get_layout();
2287     auto out_weights = memory::allocate(engine, { data_types::i8, layout.format, layout.size });
2288     auto in = in_weights.pointer<float>();
2289     auto out = out_weights.pointer<char>();
2290     int indx = 0;
2291     for (auto& a : in)
2292         out[indx++] = (char) a;
2293     return out_weights;
2294 }
2295
2296 void add_primitives(const engine& engine, topology& topology)
2297 {
2298     auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 2 } });
2299
2300     std::vector<char> weights_values = { 1, 2, 1, 2, 1, 2, 19, 17, -1, -10, 32, 23 };
2301     set_values<char>(weights, weights_values);
2302     cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2303     auto weigths_qfs = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2304     set_values(biases, { 1.0f, -8.0f });
2305
2306     topology.add(
2307         data("weights", weights),
2308         data("biases", biases),
2309         data("w_qfs", weigths_qfs),
2310         convolution("conv", "input", { "weights" }, { "biases" }, { 0, 0, 1, 2 }, { 0, 0, 0, 0 }, { 1, 1, 1, 1 }, true));
2311 }
2312
2313 TEST(convolution_f32_fw_gpu, byte_activation) {
2314     //  Filter : 2x3
2315     //  Stride : 2x1
2316     //  Input  : 4x5
2317     //  Output : 2x3
2318     //
2319     //  Input:
2320     //  1  2  3  4  5
2321     //  2  2  3  4  6
2322     //  3  3  3  5  1
2323     //  1  1  1  1  1
2324     //
2325     //  Filter:
2326     //  1  2  1
2327     //  2  1  2
2328     //
2329     //  19 17 -1
2330     // -10 32 23
2331     //
2332     //  Output:
2333     // 21  28  39
2334     // 18  20  20
2335     //
2336     // -101 -11 92
2337     // -114 -116 -78
2338     //
2339     //  Bias:
2340     //  1 -8
2341     engine_configuration eng_conf(false, false, false, "", "", true, "", "kernels");
2342     engine engine{ eng_conf };
2343     auto input = memory::allocate(engine, { data_types::i8, format::bfyx,{ 1, 1, 5, 4 } });
2344
2345     VVVF<char> output_vec = {
2346         {
2347             { 11, 0, 15 },
2348             { 0,  0, 2 }
2349         },
2350         {
2351             { 33, 0, 0 },
2352             { 0, 0, 0 }
2353         } };
2354
2355     build_options opts;
2356     opts.set_option(build_option::optimize_data(true));
2357     opts.set_option(build_option::graph_dumps_dir("graph"));
2358
2359     set_values<char>(input, { 1, 2, -3, 4, -5, 2, -2, 3, -4, 6, -3, 3, -3, 5, -1, -1, -1, -1, -1, -1 });
2360
2361     topology topology(
2362         input_layout("input", input.get_layout()));
2363     add_primitives(engine, topology);
2364     network network(engine, topology, opts);
2365     network.set_input_data("input", input);
2366
2367     auto outputs = network.execute();
2368     EXPECT_EQ(outputs.begin()->first, "conv");
2369
2370     auto output_memory = outputs.at("conv").get_memory();
2371     auto output_layout = output_memory.get_layout();
2372     auto output_ptr = output_memory.pointer<char>();
2373
2374     int y_size = output_layout.size.spatial[1];
2375     int x_size = output_layout.size.spatial[0];
2376     int f_size = output_layout.size.feature[0];
2377     int b_size = output_layout.size.batch[0];
2378     EXPECT_EQ(output_layout.format, format::bfyx);
2379     EXPECT_EQ(y_size, 2);
2380     EXPECT_EQ(x_size, 3);
2381     EXPECT_EQ(f_size, 2);
2382     EXPECT_EQ(b_size, 1);
2383     for (int f = 0; f < f_size; f++)
2384         for (int y = 0; y < y_size; ++y) {
2385             for (int x = 0; x < x_size; ++x) {
2386                 EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]), 3.0f);
2387             }
2388         }
2389 }
2390
2391 TEST(convolution_f32_fw_gpu, quantized_convolution_low_prec_single_ofq) {
2392     //  Filter : 2x3
2393     //  Stride : 2x1
2394     //  Input  : 4x5
2395     //  Output : 2x3
2396     //
2397     //  Input:
2398     //  1  2  3  4  5
2399     //  2  2  3  4  6
2400     //  3  3  3  5  1
2401     //  1  1  1  1  1
2402     //
2403     //  Filter:
2404     //  1  2  1
2405     //  2  1  2
2406     //
2407     //  19 17 -1
2408     // -10 32 23
2409     //
2410     //  Output:
2411     // 21  28  39
2412     // 18  20  20
2413     //
2414     // -101 -11 92
2415     // -114 -116 -78
2416     //
2417     //  Bias:
2418     //  1 -8
2419
2420     engine engine;
2421
2422     auto input_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 4 } });
2423     auto weights_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 2 } });
2424     cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2425     auto weigths_qfs = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2426
2427     std::vector<float> weights_values_f = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 19.0, 17.0, -1.0, -10.0, 32.0, 23.0 };
2428     set_values<float>(input_f, { 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 2.0, 3.0, 4.0, 6.0, 3.0, 3.0, 3.0, 5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 });
2429     set_values<float>(weights_f, weights_values_f);
2430
2431     set_values(biases, { 1.0f, -8.0f });
2432     VVVF<float> output_vec = {
2433         {
2434             { 21.0f, 28.0f, 39.0f },
2435             { 18.0f, 20.0f, 20.0f }
2436         },
2437         {
2438             { 155.0f, 245.0f, 348.0f },
2439             { 142.0f, 140.0f, 178.0f }
2440         } };
2441
2442     topology topology_f(
2443         input_layout("input_f", input_f.get_layout()),
2444         data("weights_f", weights_f),
2445         data("biases", biases),
2446         convolution("conv_f", "input_f", { "weights_f" }, { "biases" }, { 0, 0, 1, 2 }));
2447
2448     build_options opts;
2449     opts.set_option(build_option::optimize_data(true));
2450     network network_f(engine, topology_f, opts);
2451     network_f.set_input_data("input_f", input_f);
2452
2453     auto outputs_f = network_f.execute();
2454     EXPECT_EQ(outputs_f.begin()->first, "conv_f");
2455
2456     auto output_memory_f = outputs_f.at("conv_f").get_memory();
2457     auto output_ptr_f = output_memory_f.pointer<float>();
2458
2459     auto input = memory::allocate(engine, { data_types::i8, format::bfyx,{ 1, 1, 5, 4 } });
2460     auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 2 } });
2461     float i_qf = 1.0f;
2462     float o_qf = 127.0f / max_abs<float>(output_memory_f);
2463
2464     std::vector<char> weights_values = { 1, 2, 1, 2, 1, 2, 19, 17, -1, -10, 32, 23 };
2465     set_values<char>(input, { 1, 2, 3, 4, 5, 2, 2, 3, 4, 6, 3, 3, 3, 5, 1, 1, 1, 1, 1, 1 });
2466     set_values<char>(weights, weights_values);
2467     set_values<float>(weigths_qfs, { 1.0f, 1.0f });
2468     quantize_weights<char>(weights, weigths_qfs);
2469
2470     topology topology(
2471         input_layout("input", input.get_layout()),
2472         data("weights", weights),
2473         data("biases", biases),
2474         data("w_qfs",weigths_qfs),
2475         convolution("conv", "input", { "weights" }, { "biases" }, { "w_qfs" },i_qf, o_qf, { 0, 0, 1, 2 }));
2476
2477     network network(engine, topology, opts);
2478     network.set_input_data("input", input);
2479
2480     auto outputs = network.execute();
2481     EXPECT_EQ(outputs.begin()->first, "conv");
2482
2483     auto output_memory = outputs.at("conv").get_memory();
2484     auto output_layout = output_memory.get_layout();
2485     auto output_ptr = output_memory.pointer<char>();
2486
2487     int y_size = output_layout.size.spatial[1];
2488     int x_size = output_layout.size.spatial[0];
2489     int f_size = output_layout.size.feature[0];
2490     int b_size = output_layout.size.batch[0];
2491     EXPECT_EQ(output_layout.format, format::bfyx);
2492     EXPECT_EQ(y_size, 2);
2493     EXPECT_EQ(x_size, 3);
2494     EXPECT_EQ(f_size, 2);
2495     EXPECT_EQ(b_size, 1);
2496     for (int f = 0; f < f_size; f++)
2497         for (int y = 0; y < y_size; ++y) {
2498             for (int x = 0; x < x_size; ++x) {
2499                 EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]) / o_qf, 3.0f);
2500             }
2501         }
2502 }
2503
2504
2505 TEST(convolution_f32_fw_gpu, quantized_convolution_high_prec_calib_per_ofm) {
2506     //  Filter : 2x3
2507     //  Stride : 2x1
2508     //  Input  : 4x5
2509     //  Output : 2x3
2510     //
2511     //  Input:
2512     //  1  2  3  4  5
2513     //  2  2  3  4  6
2514     //  3  3  3  5  1
2515     //  1  1  1  1  1
2516     //
2517     //  Filter:
2518     //  1  2  1
2519     //  2  1  2
2520     //
2521     //  19 17 -1
2522     // -10 32 23
2523     //
2524     //  Output:
2525     // 21  28  39
2526     // 18  20  20
2527     //
2528     // -101 -11 92
2529     // -114 -116 -78
2530     //
2531     //  Bias:
2532     //  1 -8
2533     engine engine;
2534
2535     auto input_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 4 } });
2536     auto weights_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 2 } });
2537     cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2538     auto weigths_qfs = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2539     auto output_calibrations = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2540
2541     std::vector<float> weights_values_f = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 19.0, 17.0, -1.0, -10.0, 32.0, 23.0 };
2542     set_values<float>(input_f, { 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 2.0, 3.0, 4.0, 6.0, 3.0, 3.0, 3.0, 5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 });
2543     set_values<float>(weights_f, weights_values_f);
2544     set_values(biases, { 1.0f, -8.0f });
2545     VVVF<float> output_vec = {
2546         {
2547             { 21.0f, 28.0f, 39.0f },
2548             { 18.0f, 20.0f, 20.0f }
2549         },
2550         {
2551             { 155.0f, 245.0f, 348.0f },
2552             { 142.0f, 140.0f, 178.0f }
2553         } };
2554
2555     topology topology_f(
2556         input_layout("input_f", input_f.get_layout()),
2557         data("weights_f", weights_f),
2558         data("biases", biases),
2559         convolution("conv_f", "input_f", { "weights_f" }, { "biases" }, { 0, 0, 1, 2 }));
2560
2561     build_options opts;
2562     opts.set_option(build_option::optimize_data(true));
2563     network network_f(engine, topology_f, opts);
2564     network_f.set_input_data("input_f", input_f);
2565
2566     auto outputs_f = network_f.execute();
2567     EXPECT_EQ(outputs_f.begin()->first, "conv_f");
2568
2569     auto output_memory_f = outputs_f.at("conv_f").get_memory();
2570     auto output_ptr_f = output_memory_f.pointer<float>();
2571
2572     auto input = memory::allocate(engine, { data_types::i8, format::bfyx,{ 1, 1, 5, 4 } });
2573     auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 2 } });
2574     float i_qf = 1.0f;
2575
2576     std::vector<char> weights_values = { 1, 2, 1, 2, 1, 2, 19, 17, -1, -10, 32, 23 };
2577     set_values<char>(input, { 1, 2, 3, 4, 5, 2, 2, 3, 4, 6, 3, 3, 3, 5, 1, 1, 1, 1, 1, 1 });
2578     set_values<char>(weights, weights_values);
2579     set_values<float>(weigths_qfs, { 1.0f, 1.0f });
2580     quantize_weights<char>(weights, weigths_qfs);
2581     calibrate<float>(output_memory_f, output_calibrations);
2582
2583     topology topology(
2584         input_layout("input", input.get_layout()),
2585         data("weights", weights),
2586         data("biases", biases),
2587         data("w_qfs", weigths_qfs),
2588         data("out_calibrations", output_calibrations),
2589         convolution( "conv", "input", { "weights" }, { "biases" },
2590                    { "w_qfs" }, { "out_calibrations" }, i_qf, { 0, 0, 1, 2 }));
2591
2592     network network(engine, topology, opts);
2593     network.set_input_data("input", input);
2594
2595     auto outputs = network.execute();
2596     EXPECT_EQ(outputs.begin()->first, "conv");
2597
2598     auto output_memory = outputs.at("conv").get_memory();
2599     auto output_layout = output_memory.get_layout();
2600     auto output_ptr = output_memory.pointer<char>();
2601     auto o_qf = output_calibrations.pointer<float>();
2602     int y_size = output_layout.size.spatial[1];
2603     int x_size = output_layout.size.spatial[0];
2604     int f_size = output_layout.size.feature[0];
2605     int b_size = output_layout.size.batch[0];
2606     EXPECT_EQ(output_layout.format, format::bfyx);
2607     EXPECT_EQ(y_size, 2);
2608     EXPECT_EQ(x_size, 3);
2609     EXPECT_EQ(f_size, 2);
2610     EXPECT_EQ(b_size, 1);
2611     for (int f = 0; f < f_size; f++)
2612         for (int y = 0; y < y_size; ++y) {
2613             for (int x = 0; x < x_size; ++x) {
2614                 EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]) / o_qf[f], 3.0f);
2615             }
2616         }
2617 }
2618 TEST(convolution_f32_fw_gpu, calibration_advance) {
2619     //  Filter : 2x3
2620     //  Stride : 2x1
2621     //  Input  : 4x5
2622     //  Output : 2x3
2623     //
2624     //  Input:
2625     //  1  2  3  4  5
2626     //  2  2  3  4  6
2627     //  3  3  3  5  1
2628     //  1  1  1  1  1
2629     //
2630     //  Filter1:
2631     //  1  2  1
2632     //  2  1  2
2633     //
2634     //  1.9 1.7 -1
2635     // -1.0 3.2 2.3
2636     //
2637     //  Filter2:
2638     // IFM0:
2639     //  1.5  2.3  -1.0
2640     //  3  5.6  -1.0
2641     //
2642     //  3  5.6  -1.0
2643     //  1   2   3
2644     // IFM1:
2645     //  1.9 1.7 -1
2646     //  1.9 1.7 -1
2647     //
2648     // -1.0 3.2 2.3
2649     // -1.0 3.2 2.3
2650     // IFM2:
2651     //  1 2 -1
2652     //  2 1 -1
2653     //
2654     // -1 2  1
2655     //  1 2 -1
2656     //
2657     //  Output:
2658     // 313.32  217.43  118.10
2659     //
2660     //
2661     //  Bias1:
2662     //  1 -8
2663     //  Bias2:
2664     //  2  4  0
2665
2666     engine engine;
2667
2668     auto input_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 4 } });
2669     auto weights_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 2 } });
2670     auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2671     auto w_qf = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2672     auto weights_f_2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 3, 2, 3, 2 } });
2673     auto biases_2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 1 } });
2674     auto w_qf_2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 1 } });
2675
2676     std::vector<float> weights_values_f = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.9f, 1.7f, -1.0f, -1.0f, 3.2f, 2.3f };
2677     std::vector<float> weights_values_f_2 = {
2678         1.5f, 2.3f, -1.0f, 3.0f, 5.6f, -1.0f,
2679         3.0f, 5.6f, -1.0f, 1.0f, 2.0f, 3.0f,
2680
2681         1.9f, 1.7f, -1.0f, 1.9f, 1.7f, -1.0f,
2682         -1.0f, 3.2f, 2.3f, -1.0f, 3.2f, 2.3f,
2683
2684         1.0f, 2.0f, -1.0f, 2.0f, 1.0f, -1.0f,
2685        -1.0f, 2.0f,  1.0f, 1.0f, 2.0f, -1.0f,};
2686
2687     set_values<float>(input_f, { 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 2.0, 3.0, 4.0, 6.0, 3.0, 3.0, 3.0, 5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 });
2688     set_values<float>(weights_f, weights_values_f);
2689     set_values<float>(weights_f_2, weights_values_f_2);
2690     set_values(biases, { 1.0f, -8.0f });
2691     set_values(biases_2, { 2.0f, 4.0f, 0.0f });
2692
2693     topology topology_f(
2694         input_layout("input_f", input_f.get_layout()),
2695         data("weights_f", weights_f),
2696         data("biases", biases),
2697         data("weights_f_2", weights_f_2),
2698         data("biases_2", biases_2),
2699         convolution("conv_f", "input_f", { "weights_f" }, { "biases" }, { 0, 0, 1, 2 }),
2700         convolution("conv_f_2", "conv_f", { "weights_f_2" }, { "biases_2" }, { 0, 0, 1, 1 }));
2701
2702     build_options opts;
2703     opts.set_option(build_option::optimize_data(true));
2704     opts.set_option(build_option::outputs({ "conv_f", "conv_f_2" }));
2705     network network_f(engine, topology_f, opts);
2706     network_f.set_input_data("input_f", input_f);
2707
2708     auto outputs_f = network_f.execute();
2709     auto output_memory_f = outputs_f.at("conv_f").get_memory();
2710     auto output_memory_f_2 = outputs_f.at("conv_f_2").get_memory();
2711     auto output_calibrations = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
2712     auto output_calibrations_2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 1 } });
2713
2714     calibrate<float>(output_memory_f, output_calibrations);
2715     calibrate<float>(output_memory_f_2, output_calibrations_2);
2716     apply_calibration_on_weights<float>(weights_f_2, output_calibrations);
2717     quantize_weights<float>(weights_f, w_qf);
2718     quantize_weights<float>(weights_f_2, w_qf_2);
2719
2720     auto weights   = create_int8_weights(engine, weights_f);
2721     auto weigths_2 = create_int8_weights(engine, weights_f_2);
2722     auto input = create_int8_weights(engine, input_f);
2723
2724     topology topology(
2725         input_layout("input", input.get_layout()),
2726         data("weights", weights),
2727         data("biases", biases),
2728         data("weights_2", weigths_2),
2729         data("biases_2", biases_2),
2730         data("w_qf", w_qf),
2731         data("w_qf_2", w_qf_2),
2732         data("calib", output_calibrations),
2733         data("calib_2", output_calibrations_2),
2734         convolution("conv", "input", { "weights" }, { "biases" }, { "w_qf" }, { "calib" }, 1.0f, { 0, 0, 1, 2 }),
2735         convolution("conv_2", "conv", { "weights_2" }, { "biases_2" }, { "w_qf_2" }, { "calib_2" }, 1.0f, { 0, 0, 1, 1 }));
2736
2737     build_options opts_2;
2738     opts_2.set_option(build_option::optimize_data(true));
2739     opts_2.set_option(build_option::outputs({ "conv", "conv_2" }));
2740     cldnn::network network(engine, topology, opts_2);
2741     network.set_input_data("input", input);
2742     auto outputs = network.execute();
2743     auto output_memory = outputs.at("conv_2").get_memory();
2744     auto ref_ptr = output_memory_f_2.pointer<float>();
2745     auto test_ptr = output_memory.pointer<char>();
2746     auto& out_size = output_memory.get_layout().size;
2747     auto o_qf = output_calibrations_2.pointer<float>();
2748
2749     for (int f = 0; f < out_size.feature[0]; f++)
2750         for (int y = 0; y < out_size.spatial[1]; ++y) {
2751             for (int x = 0; x < out_size.spatial[0]; ++x) {
2752                 EXPECT_NEAR(ref_ptr[x + out_size.spatial[0] * (y + out_size.spatial[1]*f)], ((float)test_ptr[x + out_size.spatial[0] * (y + out_size.spatial[1] * f)]) / o_qf[f], 3.0f);
2753             }
2754         }
2755
2756 }
2757
2758 TEST(convolution_gpu, basic_yxfb_4_4_yxfb_2_2_b16_if2_of16_st2_2_p0_sp1_fp16)
2759 {
2760 #define USE_OLD_WEIGHTS_FORMAT 0
2761
2762     engine engine;
2763
2764     if (!engine.get_info().supports_fp16)
2765     {
2766         std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
2767         EXPECT_EQ(1, 1);
2768         return;
2769     }
2770
2771
2772     const auto input_format   = format::yxfb;
2773 #if USE_OLD_WEIGHTS_FORMAT
2774     const auto weights_format = format::bfyx;
2775 #else
2776     const auto weights_format = format::yxfb;
2777 #endif
2778     const auto biases_format  = format::bfyx;
2779     const auto output_format  = input_format;
2780
2781     const int32_t batch_size = 16;
2782     const int32_t input_feature_count = 2;
2783     const int32_t output_feature_count = 16;
2784
2785     const int32_t stride_x = 2;
2786     const int32_t stride_y = 2;
2787
2788     const int32_t input_x = 4;
2789     const int32_t input_y = 4;
2790     const int32_t weights_x = 2;
2791     const int32_t weights_y = 2;
2792     const int32_t output_x = (input_x - weights_x) / stride_x + 1;
2793     const int32_t output_y = (input_y - weights_y) / stride_y + 1;
2794
2795
2796     auto input_size = tensor( batch_size, input_feature_count, input_x, input_y );
2797     auto input = memory::allocate(engine, { data_types::f32, input_format, input_size });
2798     auto weights_size = tensor( output_feature_count, input_feature_count, weights_x, weights_y );
2799     auto weights = memory::allocate(engine, { data_types::f32, weights_format, weights_size });
2800     auto biases_size = tensor( 1,1,output_feature_count,1 );
2801     auto biases = memory::allocate(engine, { data_types::f32, biases_format, biases_size });
2802     auto output_size = tensor( batch_size, output_feature_count, output_x, output_y );
2803     //auto output = memory::allocate({output_format, {batch_size, {output_x, output_y}, output_feature_count}});
2804
2805     //auto input_cvtd = memory::allocate(engine, { data_types::f16, input_size });
2806     //auto weights_cvtd = memory::allocate(engine, { data_types::f16, weights_size });
2807     //auto biases_cvtd = memory::allocate(engine, { data_types::f16, biases_size });
2808     //auto output_cvtd  = memory::allocate({output_cvt_format, {batch_size, {output_x, output_y}, output_feature_count}});
2809
2810
2811     // input:
2812     std::vector<float> input_vals_template {
2813         0.25f, 0.50f, 0.75f, 1.00f,
2814         1.25f, 1.50f, 1.75f, 2.00f,
2815         2.25f, 2.50f, 2.75f, 3.00f,
2816         3.25f, 3.50f, 3.75f, 4.00f,
2817     };
2818     input_vals_template.resize(input_y * input_x);
2819
2820     std::vector<float> input_vals;
2821     input_vals.reserve(input_y * input_x * input_feature_count * batch_size);
2822     for (uint32_t yxi = 0; yxi < input_y * input_x; ++yxi)
2823     {
2824         for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
2825         {
2826             for (uint32_t bi = 0; bi < batch_size; ++bi)
2827             {
2828                 input_vals.push_back((bi * input_feature_count + ifi + 1) * input_vals_template[yxi]);
2829             }
2830         }
2831     }
2832     set_values(input, input_vals);
2833
2834
2835     // weights:
2836     std::vector<float> weights_vals_template {
2837         -0.50f, -0.25f,
2838          0.50f,  0.50f,
2839     };
2840     weights_vals_template.resize(weights_y * weights_x);
2841
2842     std::vector<float> weights_vals;
2843     weights_vals.reserve(weights_y * weights_x * input_feature_count * output_feature_count);
2844 #if USE_OLD_WEIGHTS_FORMAT
2845     for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2846     {
2847         for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
2848         {
2849             for (uint32_t yxi = 0; yxi < weights_y * weights_x; ++yxi)
2850             {
2851                 weights_vals.push_back((ofi * input_feature_count + ifi + 1) * weights_vals_template[yxi]);
2852             }
2853         }
2854     }
2855 #else
2856     for (uint32_t yxi = 0; yxi < weights_y * weights_x; ++yxi)
2857     {
2858         for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
2859         {
2860             for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2861             {
2862                 weights_vals.push_back((ofi * input_feature_count + ifi + 1) * weights_vals_template[yxi]);
2863             }
2864         }
2865     }
2866 #endif
2867     set_values(weights, weights_vals);
2868
2869
2870     // biases:
2871     std::vector<float> biases_vals;
2872     biases_vals.reserve(output_feature_count);
2873     for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2874     {
2875         biases_vals.push_back(ofi * 1.0f);
2876     }
2877     set_values(biases, biases_vals);
2878
2879
2880     // output:
2881     std::vector<float> output_vals_template {
2882         1.125f,  1.250f,
2883         1.625f,  1.750f,
2884     };
2885     output_vals_template.resize(output_y * output_x);
2886
2887     std::vector<float> output_vals;
2888     output_vals.reserve(output_y * output_x * output_feature_count * batch_size);
2889     for (uint32_t yxi = 0; yxi < output_y * output_x; ++yxi)
2890     {
2891         for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2892         {
2893             for (uint32_t bi = 0; bi < batch_size; ++bi)
2894             {
2895                 uint32_t template_factor = input_feature_count * input_feature_count * input_feature_count * bi * ofi +
2896                     input_feature_count * input_feature_count * (input_feature_count + 1) / 2 * (bi + ofi) +
2897                     input_feature_count * (input_feature_count + 1) * (2 * input_feature_count + 1) / 6;
2898                 float bias_factor = ofi * 1.0f;
2899
2900                 output_vals.push_back(template_factor * output_vals_template[yxi] + bias_factor);
2901             }
2902         }
2903     }
2904
2905     //auto expected_float = memory::allocate(engine, { data_types::f32,{ format::x,{ static_cast<int32_t>(output_vals.size()) } } });
2906     //auto expected_half  = memory::allocate(engine, { data_types::f16,{ format::x,{ static_cast<int32_t>(output_vals.size()) } } });
2907     //auto expected       = memory::allocate(engine, { data_types::f32,{ format::x,{ static_cast<int32_t>(output_vals.size()) } } });
2908
2909 //    set_values(expected_float, output_vals);
2910 //    auto cvt_expected_f32_f16 = reorder::create({expected_float, expected_half});
2911 //    auto cvt_expected_f16_f32 = reorder::create({expected_half, expected});
2912 //    execute({cvt_expected_f32_f16, cvt_expected_f16_f32}).wait();
2913 //
2914 //    auto expected_ptr = expected.as<const memory&>().pointer<float>();
2915
2916
2917     // Computing convolution.
2918     topology topology(
2919         input_layout("input", input.get_layout()),
2920         reorder("cvt_input", "input", {data_types::f16, input_format, input_size}),
2921         data("weights", weights),
2922         reorder("cvt_weights", "weights", {data_types::f16, weights_format, weights_size}),
2923         data("biases", biases),
2924         reorder("cvt_biases", "biases", {data_types::f16, biases_format, biases_size}),
2925         convolution(
2926             "conv",
2927             "cvt_input",
2928             { "cvt_weights" },
2929             { "cvt_biases" },
2930             { 1,1,stride_x,stride_y }),
2931         reorder("output", "conv", {data_types::f32, output_format, output_size})
2932     );
2933
2934     network network(engine, topology);
2935     network.set_input_data("input", input);
2936
2937     auto outputs = network.execute();
2938     EXPECT_EQ(outputs.size(), size_t(1));
2939     EXPECT_EQ(outputs.begin()->first, "output");
2940
2941     auto output_prim = outputs.begin()->second.get_memory();
2942
2943     auto output_ptr = output_prim.pointer<float>();
2944
2945     // Checking result.
2946     uint32_t i = 0;
2947     for (uint32_t yxi = 0; yxi < output_y * output_x; ++yxi)
2948     {
2949         for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
2950         {
2951             for (uint32_t bi = 0; bi < batch_size; ++bi, ++i)
2952             {
2953                 auto equal = are_equal(output_vals[i] /*get_value(expected_ptr, i)*/, output_ptr[i], 0.002f);
2954                 EXPECT_TRUE(equal);
2955                 if (!equal)
2956                 {
2957                     std::cout << "Failed at position (" << yxi << ", output feature = " << ofi << ", batch = " << bi << "): "
2958                         << output_vals[i] /*get_value(expected_ptr, i)*/ << " != " << output_ptr[i] << std::endl;
2959                     return;
2960                 }
2961             }
2962         }
2963     }
2964
2965 #undef USE_OLD_WEIGHTS_FORMAT
2966 }
2967
2968 class convolution_test : public tests::generic_test
2969 {
2970
2971 public:
2972
2973     static void TearDownTestCase()
2974     {
2975         for (auto generic_params : all_generic_params)
2976         {
2977             delete generic_params;
2978         }
2979
2980         for (auto layer_params : all_layer_params)
2981         {
2982             delete layer_params;
2983         }
2984     }
2985
2986     static std::vector<cldnn::primitive*> generate_specific_test_params()
2987     {
2988         // TODO: check split
2989
2990         // TODO: check convolution without bias
2991
2992         const std::vector<primitive_id>& weights = { "input1" };
2993         const std::vector<primitive_id>& bias = { "input2" };
2994
2995         std::vector<tensor> stride_sizes = { tensor(1, 1, 1, 1), tensor(1, 1, 2, 3), tensor(1, 1, 4, 1), tensor(1, 1, 5, 5) };
2996         std::vector<tensor> dilation_sizes = { tensor(1, 1, 1, 1), tensor(1, 1, 5, 4), tensor(1, 1, 1, 3), tensor(1, 1, 7, 2) };
2997         std::vector<tensor> input_offset_sizes = { tensor(0, 0, 0, 0), tensor(0, 0, 2, 2), tensor(0, 0, -5, -2), tensor(0, 0, 3, -3) };
2998
2999         std::vector<bool> activations = { false, true };
3000         std::vector<float> activation_slopes = { 0.f, -2.3f };
3001
3002         // No padding
3003         all_layer_params.push_back(new convolution("convolution", "input0", weights, bias, stride_sizes[0], input_offset_sizes[0], dilation_sizes[0], activations[0], activation_slopes[0]));
3004         all_layer_params.push_back(new convolution("convolution", "input0", weights, bias, stride_sizes[1], input_offset_sizes[1], dilation_sizes[1], activations[0], activation_slopes[0]));
3005         all_layer_params.push_back(new convolution("convolution", "input0", weights, bias, stride_sizes[2], input_offset_sizes[2], dilation_sizes[2], activations[1], activation_slopes[0]));
3006         all_layer_params.push_back(new convolution("convolution", "input0", weights, bias, stride_sizes[3], input_offset_sizes[3], dilation_sizes[3], activations[1], activation_slopes[1]));
3007
3008         // Input padding
3009         all_layer_params.push_back(new convolution("convolution", "reorder0", weights, bias, stride_sizes[1], input_offset_sizes[1], dilation_sizes[1], activations[0], activation_slopes[0]));
3010         all_layer_params.push_back(new convolution("convolution", "reorder0", weights, bias, stride_sizes[3], input_offset_sizes[3], dilation_sizes[3], activations[1], activation_slopes[1]));
3011
3012         // Output padding
3013         all_layer_params.push_back(new convolution("convolution", "input0", weights, bias, stride_sizes[1], input_offset_sizes[1], dilation_sizes[1], activations[0], activation_slopes[0], { { 0, 0, 2, 4 },{ 0, 0, 0, 19 } }));
3014         all_layer_params.push_back(new convolution("convolution", "input0", weights, bias, stride_sizes[2], input_offset_sizes[2], dilation_sizes[2], activations[1], activation_slopes[0], { { 0, 0, 1, 0 },{ 0, 0, 13, 9 } }));
3015
3016         // Input + Output padding
3017         all_layer_params.push_back(new convolution("convolution", "reorder0", weights, bias, stride_sizes[0], input_offset_sizes[0], dilation_sizes[0], activations[0], activation_slopes[0], { { 0, 0, 1, 5 },{ 0, 0, 19, 4 } }));
3018         all_layer_params.push_back(new convolution("convolution", "reorder0", weights, bias, stride_sizes[3], input_offset_sizes[3], dilation_sizes[3], activations[1], activation_slopes[1], { { 0, 0, 1, 2 },{ 0, 0, 3, 4 } }));
3019
3020         return all_layer_params;
3021     }
3022
3023     static std::vector<std::tuple<tests::test_params*, cldnn::primitive*>> generate_all_test_params()
3024     {
3025         generate_specific_test_params();
3026
3027         std::vector<cldnn::format> input_formats = { cldnn::format::bfyx, cldnn::format::yxfb };
3028         std::vector<cldnn::format> weights_formats = { cldnn::format::bfyx, cldnn::format::yxfb };
3029
3030         std::vector<int32_t> output_features_sizes = { 1, 3, 16 };
3031         std::vector<cldnn::tensor> kernel_sizes = { tensor(1, 1, 1, 1), tensor(1, 1, 4, 7), tensor(1, 1, 5, 3) };
3032
3033         std::vector<tensor> input_tensor_size = { tensor(1, 5, 59, 72), tensor(8, 3, 63, 56), tensor(16, 2, 50, 50), tensor(32, 1, 44, 62) };
3034
3035         for (cldnn::data_types data_type : test_data_types())
3036         {
3037             for (cldnn::format input_format : input_formats)
3038             {
3039                 for (cldnn::format weights_format : weights_formats)
3040                 {
3041                     cldnn::build_options network_build_options;
3042                     if (input_format == cldnn::format::bfyx)
3043                     {
3044                         network_build_options.set_option(cldnn::build_option::optimize_data(true));
3045                     }
3046                     for (cldnn::tensor input_size : input_tensor_size)
3047                     {
3048                         for (cldnn::tensor kernel_size : kernel_sizes)
3049                         {
3050                             for (auto output_features : output_features_sizes)
3051                             {
3052                                 test_params* params = new test_params(data_type, input_format, input_size.batch[0], input_size.feature[0], tensor(1, 1, input_size.spatial[0], input_size.spatial[1]), network_build_options);
3053                                 int input_features = params->input_layouts[0].size.feature[0];
3054                                 params->input_layouts.push_back(cldnn::layout(params->data_type, weights_format, cldnn::tensor(output_features, input_features, kernel_size.spatial[0], kernel_size.spatial[1]))); // weights
3055                                 params->input_layouts.push_back(cldnn::layout(params->data_type, params->fmt, cldnn::tensor(1, 1, output_features, 1))); // biases
3056                                 all_generic_params.push_back(params);
3057                             }
3058                         }
3059                     }
3060                 }
3061             }
3062         }
3063
3064         // Create all the combinations for the test.
3065         for (cldnn::primitive* layer_param : all_layer_params)
3066         {
3067             for (tests::test_params* test_param : all_generic_params)
3068             {
3069                 all_test_params.push_back(std::make_tuple(test_param, layer_param));
3070             }
3071         }
3072
3073         return all_test_params;
3074     }
3075
3076     virtual bool is_format_supported(cldnn::format format)
3077     {   // Only the yxfb and bfyx formats are accepted by this test.
3078         return ((format == cldnn_format_type::cldnn_format_yxfb) || (format == cldnn_format_type::cldnn_format_bfyx));
3079     }
3080
3081     virtual cldnn::tensor get_expected_output_tensor()
3082     {
3083         const cldnn::convolution* convolution = (cldnn::convolution*)layer_params;
3084         tensor input_size = generic_params->input_layouts[0].size;
3085         tensor dilation = convolution->dilation;
3086         tensor stride = convolution->stride;
3087         tensor input_offset = convolution->input_offset;
3088         tensor weights_size = generic_params->input_layouts[1].size;
3089
3090         int kernel_extent_y = dilation.spatial[1] * (weights_size.spatial[1] - 1) + 1;
3091         int kernel_extent_x = dilation.spatial[0] * (weights_size.spatial[0] - 1) + 1;
3092
3093         // Calculate output size
3094         int output_size_y = 1 + (input_size.spatial[1] - kernel_extent_y - 2 * input_offset.spatial[1]) / stride.spatial[1];
3095         int output_size_x = 1 + (input_size.spatial[0] - kernel_extent_x - 2 * input_offset.spatial[0]) / stride.spatial[0];
3096         int output_features = weights_size.batch[0];
3097
3098         return cldnn::tensor(input_size.batch[0], output_features, output_size_x, output_size_y);
3099     }
3100
3101     virtual void prepare_input_for_test(std::vector<cldnn::memory>& inputs)
3102     {
3103         // Dispatches to the typed filler: float for f32 tests, FLOAT16 for every other data type.
3104         const bool is_f32 = (generic_params->data_type == data_types::f32);
3105         if (!is_f32)
3106         {
3107             prepare_input_for_test_typed<FLOAT16>(inputs);
3108             return;
3109         }
3110         prepare_input_for_test_typed<float>(inputs);
3111     }
3112
3113     template<typename Type>
3114     void prepare_input_for_test_typed(std::vector<cldnn::memory>& inputs)
3115     {
3116         int k = (generic_params->data_type == data_types::f32) ? 8 : 4;
3117
3118         // Update inputs.
3119         auto input = inputs[0];
3120         auto input_size = inputs[0].get_layout().size;
3121         VVVVF<Type> input_rnd = generate_random_4d<Type>(input_size.batch[0], input_size.feature[0], input_size.spatial[1], input_size.spatial[0], -2, 2, k);
3122         VF<Type> input_rnd_vec = flatten_4d<Type>(input.get_layout().format, input_rnd);
3123         set_values(input, input_rnd_vec);
3124
3125         // Update weights.
3126         auto weight_input = inputs[1];
3127         auto weight_size = inputs[1].get_layout().size;
3128         VVVVF<Type> weight_rnd = generate_random_4d<Type>(weight_size.batch[0], weight_size.feature[0], weight_size.spatial[1], weight_size.spatial[0], -2, 2, k);
3129         VF<Type> weight_rnd_vec = flatten_4d<Type>(weight_input.get_layout().format, weight_rnd);
3130         set_values(weight_input, weight_rnd_vec);
3131
3132         // Update biases.
3133         auto bias_input = inputs[2];
3134         auto bias_size = inputs[2].get_layout().size;
3135         VF<Type> bias_rnd = generate_random_1d<Type>(bias_size.spatial[0], -2, 2, k);
3136         set_values(bias_input, bias_rnd);
3137     }
3138
3139     template<typename Type>
3140     memory generate_reference_typed(const std::vector<cldnn::memory>& inputs)
3141     {
3142         // Output reference is always bfyx.
3143
3144         const cldnn::convolution* convolution = (cldnn::convolution*)layer_params;
3145
3146         data_types dt = inputs[0].get_layout().data_type;
3147
3148         tensor input_size = inputs[0].get_layout().size;
3149         tensor dilation = convolution->dilation;
3150         tensor stride = convolution->stride;
3151         bool is_relu_fused = convolution->with_activation;
3152         float activation_slope = convolution->activation_negative_slope;
3153         tensor input_offset = convolution->input_offset;
3154         tensor weights_size = inputs[1].get_layout().size;
3155         padding output_padding = convolution->output_padding;
3156
3157         tensor output_size = get_expected_output_tensor();
3158
3159         // Calculate output size
3160         int output_size_y = output_size.spatial[1];
3161         int output_size_x = output_size.spatial[0];
3162         int output_features = weights_size.batch[0];
3163         int input_features = weights_size.feature[0];
3164
3165         auto output = memory::allocate( engine, cldnn::layout(dt, cldnn::format::bfyx, output_size, output_padding) );
3166
3167         auto input_mem = inputs[0].pointer<Type>();
3168         auto weights_mem = inputs[1].pointer<Type>();
3169         auto bias_mem = inputs[2].pointer<Type>();
3170         auto output_mem = output.pointer<Type>();
3171
3172         tensor output_buffer_size = output.get_layout().get_buffer_size();
3173
3174         // Initialized output with zeros.
3175         std::fill(output_mem.begin(), output_mem.end(), static_cast<Type>(0));
3176
3177         // Add the bias
3178         for (int b = 0; b < input_size.batch[0]; b++)
3179         {
3180             for (int out_f = 0; out_f < output_features; out_f++)
3181             {
3182                 for (int y = 0; y < output_size_y; y++)
3183                 {
3184                     for (int x = 0; x < output_size_x; x++)
3185                     {
3186                         int output_index = (b * output_buffer_size.feature[0] + out_f) * output_buffer_size.spatial[1] * output_buffer_size.spatial[0];
3187                         tensor lower_output_padding = convolution->output_padding.lower_size();
3188                         output_index += (lower_output_padding.spatial[1] + y) * output_buffer_size.spatial[0] + lower_output_padding.spatial[0] + x;
3189
3190                         output_mem[output_index] += bias_mem[out_f];
3191                     }
3192                 }
3193             }
3194         }
3195
3196         const auto input0_desc = get_linear_memory_desc(inputs[0].get_layout());
3197         const auto input1_desc = get_linear_memory_desc(inputs[1].get_layout());
3198
3199         // Convolve with weights
3200         for (int b = 0; b < input_size.batch[0]; b++)
3201         {
3202             int input_bi = b;
3203             for (int out_f = 0; out_f < output_features; out_f++)
3204             {
3205                 for (int in_f = 0; in_f < input_features; in_f++)
3206                 {
3207                     int input_fi = in_f;
3208                     for (int y = 0; y < output_size_y; y++)
3209                     {
3210                         for (int x = 0; x < output_size_x; x++)
3211                         {
3212                             int output_bi = b;
3213                             int output_fi = out_f;
3214                             int output_yi = y;
3215                             int output_xi = x;
3216                             int output_index = (output_bi * output_buffer_size.feature[0] + output_fi) * output_buffer_size.spatial[1] * output_buffer_size.spatial[0];
3217                             tensor lower_output_padding = convolution->output_padding.lower_size();
3218                             output_index += (lower_output_padding.spatial[1] + output_yi) * output_buffer_size.spatial[0] + lower_output_padding.spatial[0] + output_xi;
3219
3220                             for (int kernel_y = 0; kernel_y < weights_size.spatial[1]; kernel_y++)
3221                             {
3222                                 int input_yi = y * stride.spatial[1] + input_offset.spatial[1] + kernel_y * dilation.spatial[1];
3223                                 if ((input_yi < 0) || (input_yi >= input_size.spatial[1]))
3224                                 {
3225                                     continue;
3226                                 }
3227
3228                                 for (int kernel_x = 0; kernel_x < weights_size.spatial[0]; kernel_x++)
3229                                 {
3230                                     int input_xi = x * stride.spatial[0] + input_offset.spatial[0] + kernel_x * dilation.spatial[0];
3231                                     if ((input_xi < 0) || (input_xi >= input_size.spatial[0]))
3232                                     {
3233                                         continue;
3234                                     }
3235
3236                                     size_t input_index = get_linear_index(inputs[0].get_layout(), input_bi, input_fi, input_yi, input_xi, input0_desc);
3237
3238                                     int weight_bi = out_f;
3239                                     int weight_fi = in_f;
3240                                     int weight_yi = kernel_y;
3241                                     int weight_xi = kernel_x;
3242                                     size_t weight_index = get_linear_index(inputs[1].get_layout(), weight_bi, weight_fi, weight_yi, weight_xi, input1_desc);
3243                                     output_mem[output_index] += input_mem[input_index] * weights_mem[weight_index];
3244                                 }
3245                             }
3246                         }
3247                     }
3248                 }
3249             }
3250         }
3251
3252         // Relu activation
3253         if (is_relu_fused)
3254         {
3255             for (int i = 0; i < (int)output_buffer_size.count(); i++)
3256             {
3257                 output_mem[i] = (output_mem[i] > 0.f) ? output_mem[i] : (output_mem[i] * (Type)activation_slope);
3258             }
3259         }
3260
3261         return output;
3262     }
3263
3264     virtual memory generate_reference(const std::vector<cldnn::memory>& inputs)
3265     {
3266         if (generic_params->data_type == data_types::f32)
3267         {
3268             return generate_reference_typed<float>(inputs);
3269         }
3270         else
3271         {
3272             return generate_reference_typed<FLOAT16>(inputs);
3273         }
3274     }
3275
3276 private:
3277
3278     static std::vector<tests::test_params*> all_generic_params;
3279     static std::vector<cldnn::primitive*> all_layer_params;
3280     static std::vector<std::tuple<tests::test_params*, cldnn::primitive*>> all_test_params;
3281 };
3282
3283 std::vector<tests::test_params*> convolution_test::all_generic_params = {};
3284 std::vector<cldnn::primitive*> convolution_test::all_layer_params = {};
3285 std::vector<std::tuple<tests::test_params*, cldnn::primitive*>> convolution_test::all_test_params = {};
3286
3287 TEST_P(convolution_test, CONVOLUTION)
3288 {
3289     run_single_test();
3290 }
3291
3292 INSTANTIATE_TEST_CASE_P(DISABLED_CONVOLUTION,
3293                         convolution_test,
3294                         ::testing::ValuesIn(convolution_test::generate_all_test_params()),
3295                         tests::generic_test::custom_param_name_functor());