/*
// Copyright (c) 2016-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include <gtest/gtest.h>
#include "api/memory.hpp"
#include <api/input_layout.hpp>
#include "api/softmax.hpp"
#include <api/topology.hpp>
#include <api/network.hpp>
#include <api/engine.hpp>
#include "test_utils/test_utils.h"

using namespace cldnn;
using namespace std;
using namespace tests;

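// A minimal, self-contained reference softmax kept here only as a sketch for
// reasoning about the expected values hard-coded in the tests below; it is
// not part of the clDNN API. It uses the numerically stable form, which
// subtracts the maximum before exponentiating.
#include <algorithm>
#include <cmath>
#include <vector>

inline std::vector<float> reference_softmax(const std::vector<float>& v) {
    const float max_val = *std::max_element(v.begin(), v.end());
    std::vector<float> out(v.size());
    float z = 0.0f;
    for (size_t i = 0; i < v.size(); ++i) {
        out[i] = std::exp(v[i] - max_val);  // exp(x_i - max) never overflows
        z += out[i];
    }
    for (auto& e : out)
        e /= z;                             // normalize so the outputs sum to 1
    return out;
}
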
class softmax_gpu_xb_f32_test_fixture: public ::testing::Test {
public:
    static const int32_t
        output_x  = 10, output_b  = 2,  // size of whole output buffer
        input_x   = 10, input_b   = 2,  // size of whole input buffer
        in_size   = input_x*input_b,
        out_size  = output_x*output_b;

    float in_buffer[in_size];
    float out_buffer[out_size];
    float expected_buffer[out_size];

    const cldnn::engine& engine;
    cldnn::memory input;

    //neural::primitive output = memory::allocate({ memory::format::xb_f32, {output_b, {{output_x}}, 1}});

    softmax_gpu_xb_f32_test_fixture()
        : engine(get_test_engine())
        ,input(memory::allocate(engine, { data_types::f32, format::yxfb, { input_b, 1, input_x, 1}}))
    {}

    void compare_out_buffer_with_expected() {
        for(size_t i = 0; i < out_size; ++i) {
            // does output have expected values
            EXPECT_TRUE(are_equal(out_buffer[i], expected_buffer[i]))
                << "At ["<< i <<  "] Expected : " << expected_buffer[i] << " actual : " << out_buffer[i];
        }
    }

    void compare_out_buffer_with_expected_batch_wise() {
        for(size_t b = 0; b < output_b; ++b) {
            float batch_wise_sum = 0;
            for(size_t x = 0; x < output_x; ++x) {
                auto idx = b+x*output_b;
                batch_wise_sum += out_buffer[idx];
                // does output have expected values
                EXPECT_TRUE(are_equal(out_buffer[idx], expected_buffer[idx]))
                    << "At ["<< idx <<  "] Expected : " << expected_buffer[idx] << " actual : " << out_buffer[idx];
            }
            // softmax outputs within one batch should sum to 1
            EXPECT_TRUE(are_equal(batch_wise_sum, 1.0f))
                << "Expected : " << 1.0f << " actual : " << batch_wise_sum;
        }
    }
};

TEST_F(softmax_gpu_xb_f32_test_fixture, input_same_values) {
    // in_buffer filled with the same value (1.0f), so the expected softmax
    // output is uniform: 1/10 = 0.1f for each of the 10 x-values in a batch
    for(uint32_t i = 0; i < out_size; ++i) {
        in_buffer[i]       = 1.0f;
        expected_buffer[i] = 0.1f;
    }
    std::vector<float> in_b(std::begin(in_buffer), std::end(in_buffer));

    set_values(input, in_b);

    network network(engine, topology(input_layout("input", input.get_layout()), softmax("softmax", "input")));
    network.set_input_data("input", input);

    auto outputs = network.execute();
    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output_prim = outputs.begin()->second.get_memory();

    auto output_ptr = output_prim.pointer<float>();
    for (uint32_t i = 0; i < out_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }
    compare_out_buffer_with_expected();
}

TEST_F(softmax_gpu_xb_f32_test_fixture, input_same_values_batch_wise) {
    // each batch of in_buffer is filled with a constant value:
    // batch 0 gets 1.0f, batch 1 gets 2.0f (data is laid out xb_f32)
    for(size_t i = 0; i < output_x; ++i) {
        for(size_t j = 0; j < output_b; ++j)
            in_buffer[j + i*output_b] = (j + i*output_b) % 2 + 1.0f;
    }

    std::vector<float> in_b(std::begin(in_buffer), std::end(in_buffer));
    set_values(input, in_b);
    // softmax of a constant batch is uniform, so every expected value is 0.1f
    for(size_t i = 0; i < out_size; ++i)
        expected_buffer[i] = 0.1f;

    network network(engine, topology(input_layout("input", input.get_layout()), softmax("softmax", "input")));
    network.set_input_data("input", input);

    auto outputs = network.execute();
    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output_prim = outputs.begin()->second.get_memory();

    auto output_ptr = output_prim.pointer<float>();
    for (uint32_t i = 0; i < out_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }
    compare_out_buffer_with_expected_batch_wise();
}

TEST_F(softmax_gpu_xb_f32_test_fixture, values_batch_wise) {

    float in_buf[in_size] = {
       //b0  b1
        2.0f, 2.0f, //x0
        2.0f, 2.0f, //x1
        2.0f, 2.0f, //x2
        3.0f, 3.0f, //x3
        5.0f, 5.0f, //x4
        4.0f, 4.0f, //x5
        3.0f, 3.0f, //x6
        2.0f, 2.0f, //x7
        2.0f, 2.0f, //x8
        2.0f, 2.0f  //x9
    };

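    // Expected values derived by hand: per batch the denominator is
    //   Z = 6*e^2 + 2*e^3 + e^4 + e^5 ~= 44.3343 + 40.1711 + 54.5981 + 148.4132 ~= 287.5167,
    // so e.g. e^2/Z ~= 0.02569957, e^3/Z ~= 0.069858674 and e^5/Z ~= 0.516189665.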
    float exp_buf[out_size] = {
        0.02569957f,     0.02569957f,
        0.02569957f,     0.02569957f,
        0.02569957f,     0.02569957f,
        0.069858674f,    0.069858674f,
        0.516189665f,    0.516189665f,
        0.189895565f,    0.189895565f,
        0.069858674f,    0.069858674f,
        0.02569957f,     0.02569957f,
        0.02569957f,     0.02569957f,
        0.02569957f,     0.02569957f
    };

    std::vector<float> in_b(std::begin(in_buf), std::end(in_buf));
    set_values(input, in_b);
    std::copy(exp_buf, exp_buf + out_size, expected_buffer);

    // pre-fill out_buffer with quiet NaN so stale values cannot pass the comparison
    for(size_t i = 0; i < out_size; ++i)
        out_buffer[i] = NAN;

    network network(engine, topology(input_layout("input", input.get_layout()), softmax("softmax", "input")));
    network.set_input_data("input", input);

    auto outputs = network.execute();
    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output_prim = outputs.begin()->second.get_memory();

    auto output_ptr = output_prim.pointer<float>();
    for (uint32_t i = 0; i < out_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }
    compare_out_buffer_with_expected_batch_wise();
}

TEST(softmax_gpu_bfyx_f32, normalize_fyx) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
        batch_num = 2, buf_size = x_size*y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input"));

    set_values(input, {  //bfyx
             //y0x0  y0x1   y1x0    y1x1
        /*b0f0*/0.1f, -0.1f, 0.9f,  1.5f,
        /*b0f1*/0.2f, 0.2f,  -10.f, 5.2f,
        /*b0f2*/0.2f, 0.2f,  -10.f, 5.2f,
        /*b1f0*/3.f,  0.5f,  7.f,   12.f,
        /*b1f1*/4.f,  0.5f,  8.f,   8.2f,
        /*b1f2*/0.2f, 0.2f,  -10.f, 5.2f
    });

    float expected_max_values[2] = {
        0.481618381f, 0.953259517f
    };

    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float sum = 0;
    float expected_sum = 1.0f;

    float temp_max = 0;
    int max_value_buffer_index = 0;

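    // bfyx linear index: x is fastest, then y, then f, then b, i.e.
    //   index = b*F*Y*X + f*Y*X + y*X + x
    // which matches the expression built up inside the loops below.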
    for (uint32_t i = 0; i < batch_num; i++) // sum every value within one batch (over f, y, x); each batch should sum to 1.0f
    {
        for (uint32_t j = 0; j < y_size; j++)
        {
            for (uint32_t k = 0; k < x_size; k++)
            {
                for (uint32_t l = 0; l < feature_num; l++)
                {
                    int index = i * feature_num * x_size * y_size + j * x_size + k + l * x_size * y_size;
                    sum += out_buffer[index];
                    if (out_buffer[index] >= temp_max)
                    {
                        temp_max = out_buffer[index];
                    }
                }
            }
        }

        EXPECT_EQ(true, are_equal(sum, expected_sum));
        sum = 0.0f;
        EXPECT_EQ(true, are_equal(temp_max, expected_max_values[max_value_buffer_index]));
        temp_max = 0;
        max_value_buffer_index++;
    }
}

TEST(softmax_gpu_bfyx_f32, normalize_y) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
        batch_num = 2, buf_size = x_size*y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_y));

    vector<float> input_vec = {
              //y0x0  y0x1   y1x0    y1x1
        /*b0f0*/0.1f, -0.1f, 0.9f,  1.5f,
        /*b0f1*/0.2f, 0.2f,  -10.f, 5.2f,
        /*b0f2*/0.2f, 0.2f,  -10.f, 5.2f,

        /*b1f0*/3.f,  0.5f,  7.f,   12.f,
        /*b1f1*/4.f,  0.5f,  8.f,   8.2f,
        /*b1f2*/0.2f, 0.2f,  -10.f, 5.2f
    };
    set_values(input, input_vec);

    float expected_max_values[12] = {
        0.689974481f,   //b=0, f=0, x=0
        0.832018385f,   //b=0, f=0, x=1

        0.999962831f,   //b=0, f=1, x=0
        0.993307149f,   //b=0, f=1, x=1

        0.999962831f,   //b=0, f=2, x=0
        0.993307149f,   //b=0, f=2, x=1

        0.98201379f,    //b=1, f=0, x=0
        0.99998987f,    //b=1, f=0, x=1

        0.98201379f,    //b=1, f=1, x=0
        0.999547378f,   //b=1, f=1, x=1

        0.999962831f,   //b=1, f=2, x=0
        0.993307149f    //b=1, f=2, x=1
    };

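    // Spot check (hand-derived): for b=0, f=0, x=0 the y-column is {0.1, 0.9},
    // so the larger output is e^0.9 / (e^0.1 + e^0.9) = 1 / (1 + e^-0.8) ~= 0.689974.
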
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float temp_max = 0;
    float expected_sum = 1.0f;
    int max_value_buffer_index = 0;
    for (uint32_t i = 0; i < batch_num; i++) // for each (batch, feature, x) column, the softmax over y must sum to 1.0f
    {
        for (uint32_t l = 0; l < feature_num; l++)
        {
            for (uint32_t k = 0; k < x_size; k++)
            {
                float sum = 0.0f;
                for (uint32_t j = 0; j < y_size; j++)
                {
                    int index = i * feature_num * x_size * y_size +
                        l * x_size * y_size +
                        j * x_size +
                        k;

                    if (out_buffer[index] >= temp_max)
                    {
                        temp_max = out_buffer[index];
                    }

                    sum += out_buffer[index];
                }
                EXPECT_EQ(true, are_equal(temp_max, expected_max_values[max_value_buffer_index]));
                temp_max = 0;
                max_value_buffer_index++;

                EXPECT_EQ(true, are_equal(sum, expected_sum));
                sum = 0.0f;
            }
        }
    }
}

TEST(softmax_gpu_bfyx_f32, normalize_f) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
        batch_num = 2, buf_size = x_size*y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_f));

    vector<float> input_vec = {
        //y0x0  y0x1   y1x0    y1x1
        /*b0f0*/0.1f, -0.1f, 0.9f,  1.5f,
        /*b0f1*/0.2f, 0.2f,  -10.f, 5.2f,
        /*b0f2*/0.2f, 0.2f,  -10.f, 5.2f,

        /*b1f0*/3.f,  0.5f,  7.f,   12.f,
        /*b1f1*/4.f,  0.5f,  8.f,   8.2f,
        /*b1f2*/0.2f, 0.2f,  -10.f, 5.2f
    };
    set_values(input, input_vec);

    float expected_max_values[8] = {
        0.344253346f, //b=0, y=0, x=0
        0.364854551f, //b=0, y=0, x=1

        0.999963085f, //b=0, y=1, x=0
        0.493894592f, //b=0, y=1, x=1

        0.719294981f, //b=1, y=0, x=0
        0.364854551f, //b=1, y=0, x=1

        0.73105857f, //b=1, y=1, x=0
        0.977054322f //b=1, y=1, x=1
    };

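    // Spot check (hand-derived): for b=0, y=0, x=0 the feature column is
    // {0.1, 0.2, 0.2}, so the largest output is
    //   e^0.2 / (e^0.1 + 2*e^0.2) ~= 1.221403 / 3.547977 ~= 0.344253.
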
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float temp_max = 0;
    float expected_sum = 1.0f;
    int max_value_buffer_index = 0;
    for (uint32_t i = 0; i < batch_num; i++) // for each (batch, y, x) position, the softmax over features must sum to 1.0f
    {
        for (uint32_t j = 0; j < y_size; j++)
        {
            for (uint32_t k = 0; k < x_size; k++)
            {
                float sum = 0.0f;
                for (uint32_t l = 0; l < feature_num; l++)
                {
                    int index = i * feature_num * x_size * y_size +
                        l * x_size * y_size +
                        j * x_size +
                        k;

                    if (out_buffer[index] >= temp_max)
                    {
                        temp_max = out_buffer[index];
                    }

                    sum += out_buffer[index];
                }
                EXPECT_EQ(true, are_equal(temp_max, expected_max_values[max_value_buffer_index]));
                temp_max = 0;
                max_value_buffer_index++;

                EXPECT_EQ(true, are_equal(sum, expected_sum));
                sum = 0.0f;
            }
        }
    }
}

TEST(softmax_gpu_yxfb_f32, normalize_f) {
    //  Input  : 12 batches, each holding one pair of values; in the
    //  verification loop feature_num and x_size are both 1, so
    //  normalize_fyx reduces to a softmax over that pair
    static const int32_t x_size = 1, y_size = 2, feature_num = 1,
        batch_num = 12, buf_size = x_size*y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ batch_num, feature_num, y_size , x_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_fyx));

    set_values(input, {  //yxfb
                //f0b0  f0b1  f0b2  f0b3  f0b4    f0b5    f0b6   f0b7   f0b8    f0b9   f0b10  f0b11
        /*y0x0*/ 0.1f, -0.1f, 0.9f, 1.5f, 0.15f, -0.01f, 0.19f,  0.45f, 0.41f, -0.12f, 0.39f, 0.65f,
        /*y1x0*/ 0.2f, 0.2f, -10.f, 5.2f, 0.01f, 0.015f, 0.29f,  0.05f, 0.41f, -0.31f, 0.29f, 1.35f
    });

    float expected_max_values[batch_num * feature_num * x_size] = {
        0.524979174f,
        0.574442506f,
        0.999981523f,
        0.975872993f,
        0.534942925f,
        0.506249666f,
        0.524979174f,
        0.598687649f,
        0.500000000f,
        0.547357619f,
        0.524979174f,
        0.668187797f
    };

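    // Spot check (hand-derived): for b=0 the normalized pair is {0.1, 0.2},
    // so the larger output is 1 / (1 + e^-0.1) ~= 0.524979.
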
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float expected_sum = 1.0f;
    float temp_max = 0;

    for (uint32_t b = 0; b < batch_num; b++)
    {
        for (uint32_t f = 0; f < feature_num; f++)
        {
            for (uint32_t x = 0; x < x_size; x++)
            {
                float sum = 0.0f;
                for (uint32_t y = 0; y < y_size; y++)
                {
                    // yxfb linear index: b changes fastest, then f, then x, then y
                    int index = b + f * batch_num + (x + y * x_size) * feature_num * batch_num;
                    if (out_buffer[index] >= temp_max)
                    {
                        temp_max = out_buffer[index];
                    }
                    sum += out_buffer[index];
                }
                EXPECT_EQ(true, are_equal(temp_max, expected_max_values[b * feature_num * x_size + f * x_size + x]));
                temp_max = 0;
                EXPECT_EQ(true, are_equal(sum, expected_sum));
                sum = 0.0f;
            }
        }
    }
}

TEST(softmax_gpu_bfzyx_f32, normalize_z) {
    //  Input  : 2x3x2x2x2
    static const int32_t x_size = 2, y_size = 2, z_size = 2, feature_num = 3,
        batch_num = 2, buf_size = x_size * y_size * z_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::bfzyx,{ batch_num, feature_num, x_size , y_size, z_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_z));

    vector<float> input_vec = {
        //    z0y0x0 z0y0x1 z0y1x0 z0y1x1 z1y0x0 z1y0x1 z1y1x0 z1y1x1
        /*b0f0*/0.1f, -0.1f, 0.9f,  1.5f, 0.2f, -0.2f, 0.9f,  2.5f,
        /*b0f1*/0.2f, 0.2f,  -10.f, 5.2f, 0.3f, 0.1f,  -11.f, 6.2f,
        /*b0f2*/0.2f, 0.2f,  -10.f, 5.2f, 0.1f, 0.3f,  -9.f,  4.2f,

        /*b1f0*/3.f,  0.5f,  7.f,   12.f, 5.f,  0.1f,  6.f,   22.f,
        /*b1f1*/4.f,  0.5f,  8.f,   8.2f, 2.2f, 0.3f,  6.f,   5.2f,
        /*b1f2*/0.2f, 0.2f,  -10.f, 5.2f, 1.2f, 0.3f,  -12.f, 2.2f
    };
    set_values(input, input_vec);

    float expected_max_values[24] = {
        0.524979f, 0.524979f,
        0.5f,      0.731059f,
        0.524979f, 0.524979f,
        0.731059f, 0.731059f,
        0.524979f, 0.524979f,
        0.731059f, 0.731059f,
        0.880797f, 0.598688f,
        0.731059f, 0.999955f,
        0.858149f, 0.549834f,
        0.880797f, 0.952574f,
        0.731059f, 0.524979f,
        0.880797f, 0.952574f,
    };

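    // Spot check (hand-derived): for b=0, f=0, y=0, x=0 the z-column is
    // {0.1, 0.2}, so the larger output is 1 / (1 + e^-0.1) ~= 0.524979.
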
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float temp_max = 0;
    float expected_sum = 1.0f;
    int max_value_buffer_index = 0;
    for (uint32_t i = 0; i < batch_num; i++) // for each (batch, feature, y, x) position, the softmax over z must sum to 1.0f
    {
        for (uint32_t l = 0; l < feature_num; l++)
        {
            for (uint32_t j = 0; j < y_size; j++)
            {
                for (uint32_t k = 0; k < x_size; k++)
                {
                    float sum = 0.0f;
                    for (uint32_t m = 0; m < z_size; m++)
                    {
                        int index = i * feature_num * x_size * y_size * z_size +
                            l * x_size * y_size * z_size +
                            m * x_size * y_size +
                            j * x_size +
                            k;

                        if (out_buffer[index] >= temp_max)
                        {
                            temp_max = out_buffer[index];
                        }

                        sum += out_buffer[index];
                    }
                    EXPECT_EQ(true, are_equal(temp_max, expected_max_values[max_value_buffer_index]));
                    temp_max = 0;
                    max_value_buffer_index++;
                    EXPECT_EQ(true, are_equal(sum, expected_sum));
                    sum = 0.0f;
                }
            }
        }
    }
}

TEST(softmax_gpu_bfyx_f32, normalize_all) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {batch_num, feature_num, x_size, y_size}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//bfyx
                       //       y0x0  y0x1   y1x0    y1x1
                       /*b0f0*/ 0.1f, -0.1f, 0.9f, 1.5f,
                       /*b0f1*/ 0.2f, 0.2f, -10.f, 5.2f,
                       /*b0f2*/ 0.2f, 0.2f, -10.f, 5.2f,
                       /*b1f0*/ 3.f, 0.5f, 7.f, 12.f,
                       /*b1f1*/ 4.f, 0.5f, 8.f, 8.2f,
                       /*b1f2*/ 0.2f, 0.2f, -10.f, 5.2f});

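    // normalize_all runs a single softmax over the entire buffer (all of
    // b, f, y and x at once), so the only invariant the normalize_all tests
    // check is that all buf_size outputs together sum to 1.
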
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += get_value<float>(output_ptr, i);
    }
    EXPECT_EQ(true, are_equal(sum, expected_sum));
}

TEST(softmax_gpu_yxfb_f32, normalize_all) {
    //  Input  : 2x2x3x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {y_size, x_size, feature_num, batch_num}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//yxfb
                       //       f0b0  f0b1   f1b0    f1b1
                       /*y0x0*/ 0.1f, -0.1f, 0.9f, 1.5f,
                       /*y0x1*/ 0.2f, 0.2f, -10.f, 5.2f,
                       /*y0x2*/ 0.2f, 0.2f, -10.f, 5.2f,
                       /*y1x0*/ 3.f, 0.5f, 7.f, 12.f,
                       /*y1x1*/ 4.f, 0.5f, 8.f, 8.2f,
                       /*y1x2*/ 0.2f, 0.2f, -10.f, 5.2f});

    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += get_value<float>(output_ptr, i);
    }
    EXPECT_EQ(true, are_equal(sum, expected_sum));
}

TEST(softmax_gpu_bfzyx_f32, normalize_all) {
    //  Input  : 2x3x2x2x2
    static const int32_t x_size = 2, y_size = 2, z_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * z_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f32, format::bfzyx, {batch_num, feature_num, x_size, y_size, z_size}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//    z0y0x0 z0y0x1 z0y1x0 z0y1x1 z1y0x0 z1y0x1 z1y1x0 z1y1x1
                       /*b0f0*/ 0.1f, -0.1f, 0.9f, 1.5f, 0.2f, -0.2f, 0.9f, 2.5f,
                       /*b0f1*/ 0.2f, 0.2f, -10.f, 5.2f, 0.3f, 0.1f, -11.f, 6.2f,
                       /*b0f2*/ 0.2f, 0.2f, -10.f, 5.2f, 0.1f, 0.3f, -9.f, 4.2f,

                       /*b1f0*/ 3.f, 0.5f, 7.f, 12.f, 5.f, 0.1f, 6.f, 22.f,
                       /*b1f1*/ 4.f, 0.5f, 8.f, 8.2f, 2.2f, 0.3f, 6.f, 5.2f,
                       /*b1f2*/ 0.2f, 0.2f, -10.f, 5.2f, 1.2f, 0.3f, -12.f, 2.2f});
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += get_value<float>(output_ptr, i);
    }
    EXPECT_EQ(true, are_equal(sum, expected_sum));
}

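// The f16 variants below mirror the f32 normalize_all tests: outputs are read
// back as raw uint16_t and converted with float16_to_float32, and ASSERT_NEAR
// with a 0.001 tolerance absorbs half-precision rounding error.
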
TEST(softmax_gpu_bfyx_f16, normalize_all) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f16, format::bfyx, {batch_num, feature_num, x_size, y_size}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//bfyx
                       //           y0x0            y0x1            y1x0            y1x1
                       /*b0f0*/ FLOAT16(0.1f), FLOAT16(-0.1f), FLOAT16(0.9f), FLOAT16(1.5f),
                       /*b0f1*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f),
                       /*b0f2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f),
                       /*b1f0*/ FLOAT16(3.f), FLOAT16(0.5f), FLOAT16(7.f), FLOAT16(12.f),
                       /*b1f1*/ FLOAT16(4.f), FLOAT16(0.5f), FLOAT16(8.f), FLOAT16(8.2f),
                       /*b1f2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f)});

    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<uint16_t>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += float16_to_float32(get_value<uint16_t>(output_ptr, i));
    }
    ASSERT_NEAR(sum, expected_sum, 0.001);
}

TEST(softmax_gpu_yxfb_f16, normalize_all) {
    //  Input  : 2x2x3x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f16, format::yxfb, {y_size, x_size, feature_num, batch_num}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//yxfb
                       //           f0b0            f0b1            f1b0            f1b1
                       /*y0x0*/ FLOAT16(0.1f), FLOAT16(-0.1f), FLOAT16(0.9f), FLOAT16(1.5f),
                       /*y0x1*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f),
                       /*y0x2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f),
                       /*y1x0*/ FLOAT16(3.f), FLOAT16(0.5f), FLOAT16(7.f), FLOAT16(12.f),
                       /*y1x1*/ FLOAT16(4.f), FLOAT16(0.5f), FLOAT16(8.f), FLOAT16(8.2f),
                       /*y1x2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f)});

    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<uint16_t>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += float16_to_float32(get_value<uint16_t>(output_ptr, i));
    }
    ASSERT_NEAR(sum, expected_sum, 0.001);
}

TEST(softmax_gpu_bfzyx_f16, normalize_all) {
    //  Input  : 2x3x2x2x2
    static const int32_t x_size = 2, y_size = 2, z_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * z_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f16, format::bfzyx, {batch_num, feature_num, x_size, y_size, z_size}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//           z0y0x0          z0y0x1          z0y1x0        z0y1x1        z1y0x0          z1y0x1          z1y1x0          z1y1x1
                       /*b0f0*/ FLOAT16(0.1f), FLOAT16(-0.1f), FLOAT16(0.9f), FLOAT16(1.5f), FLOAT16(0.2f), FLOAT16(-0.2f), FLOAT16(0.9f), FLOAT16(2.5f),
                       /*b0f1*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f), FLOAT16(0.3f), FLOAT16(0.1f), FLOAT16(-11.f), FLOAT16(6.2f),
                       /*b0f2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f), FLOAT16(0.1f), FLOAT16(0.3f), FLOAT16(-9.f), FLOAT16(4.2f),

                       /*b1f0*/ FLOAT16(3.f), FLOAT16(0.5f), FLOAT16(7.f), FLOAT16(12.f), FLOAT16(5.f), FLOAT16(0.1f), FLOAT16(6.f), FLOAT16(22.f),
                       /*b1f1*/ FLOAT16(4.f), FLOAT16(0.5f), FLOAT16(8.f), FLOAT16(8.2f), FLOAT16(2.2f), FLOAT16(0.3f), FLOAT16(6.f), FLOAT16(5.2f),
                       /*b1f2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f), FLOAT16(1.2f), FLOAT16(0.3f), FLOAT16(-12.f), FLOAT16(2.2f)});
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<uint16_t>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += float16_to_float32(get_value<uint16_t>(output_ptr, i));
    }
    ASSERT_NEAR(sum, expected_sum, 0.001);
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//                      Exhaustive Negative Matrix tests                    //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

//TODO:
//TEST(NegativeSoftmaxTest, DISABLED_TestAll) {
//}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//                      Exhaustive Positive Matrix tests                    //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

using namespace cldnn;

class softmax_test : public tests::generic_test
{

public:
    softmax_test() : tests::generic_test()
    {
    }

    virtual void SetUp() override
    {
        max_ulps_diff_allowed = 6;
    }

    static void TearDownTestCase()
    {
        for (auto generic_params : all_generic_params)
        {
            delete generic_params;
        }
        all_generic_params.clear();

        all_layer_params.clear();
    }

    static std::vector<std::shared_ptr<cldnn::primitive>> generate_specific_test_params()
    {
        all_layer_params.emplace_back(new softmax("softmax", "input0", softmax::normalize_f));

        //The test checks only valid combinations.
        //TODO: add more combinations.

        return all_layer_params;
    }

    static std::vector<tests::test_params*> generate_generic_test_params()
    {
        return generic_test::generate_generic_test_params(all_generic_params);
    }

    virtual bool is_format_supported(cldnn::format format) override
    {
        return
            format == cldnn::format::yxfb ||
            format == cldnn::format::bfyx;
    }

    template<typename Type>
    memory generate_reference_typed(const std::vector<memory> & inputs)
    {
        assert(inputs.size() == 1);
        const memory & input = inputs[0];

        //Output is bfyx
        auto output = memory::allocate(engine, cldnn::layout(input.get_layout().data_type, input.get_layout().format, input.get_layout().size));

//        const auto params = static_cast<cldnn::softmax *>(layer_params);

        const auto in0_mem = input.pointer<Type>();
        auto out_mem = output.pointer<Type>();

        const int in0_b = input.get_layout().size.sizes()[0];
        const int in0_f = input.get_layout().size.sizes()[1];
        const int in0_h = input.get_layout().size.sizes()[3];
        const int in0_w = input.get_layout().size.sizes()[2];

//        const int out_b = output.get_layout().size.transform(cldnn::format::bfyx, 0).sizes()[0];
//        const int out_f = output.get_layout().size.transform(cldnn::format::bfyx, 0).sizes()[1];
//        const int out_h = output.get_layout().size.transform(cldnn::format::bfyx, 0).sizes()[2];
//        const int out_w = output.get_layout().size.transform(cldnn::format::bfyx, 0).sizes()[3];

//        assert(in0_b == out_b);
//        assert(in0_f == out_f);
//        assert(in0_h == out_h);
//        assert(in0_w == out_w);

        std::vector<float> cached_exp_vals;
        cached_exp_vals.resize(in0_f);

        const auto input_desc = get_linear_memory_desc(input.get_layout());

        for (int n = 0; n < in0_b; ++n)
        for (int y = 0; y < in0_h; ++y)
        for (int x = 0; x < in0_w; ++x)
        {
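            // Numerically stable softmax along f: find the per-position max
            // first, then exponentiate the shifted values so exp() cannot
            // overflow, and finally divide each cached exponential by their sum Z.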
            float max_val = -std::numeric_limits<float>::infinity();

            for (int c = 0; c < in0_f; ++c)
            {
                const size_t in0_idx = get_linear_index(input.get_layout(), n, c, y, x, input_desc);

                max_val = std::max(max_val, static_cast<float>(in0_mem[in0_idx]));
            }

            float Z = 0;

            for (int c = 0; c < in0_f; ++c)
            {
                const size_t in0_idx = get_linear_index(input.get_layout(), n, c, y, x, input_desc);

                float tmp = static_cast<float>((Type)std::exp(static_cast<float>(in0_mem[in0_idx]) - max_val));
                Z += tmp;
                cached_exp_vals[c] = tmp;
            }

            for (int c = 0; c < in0_f; ++c)
            {
                const size_t out_idx = get_linear_index(output.get_layout(), n, c, y, x, input_desc);
                out_mem[out_idx] = (Type)(cached_exp_vals[c] / Z);
            }
        }

        return output;
    }

    virtual memory generate_reference(const std::vector<memory> & inputs) override
    {
        if (generic_params->data_type == data_types::f32)
        {
            return generate_reference_typed<float>(inputs);
        }
        else
        {
            return generate_reference_typed<FLOAT16>(inputs);
        }
    }

    static std::string custom_param_name(const ::testing::TestParamInfo<std::tuple<test_params*, std::shared_ptr<cldnn::primitive>>>& info)
    {
        std::stringstream res;

        const auto & p = std::get<0>(info.param);

        assert(p->data_type == data_types::f32 ||
               p->data_type == data_types::f16);

        res << info.index
            << "_" << (p->data_type == data_types::f32 ? "f32" : "f16");

        for (unsigned i = 0; i < p->input_layouts.size(); ++i)
        {
            const auto chans = format::traits(p->fmt).order;

            res << "_" << "Input" << i;
            for (unsigned int j = 0; j < p->input_layouts[i].size.sizes(p->fmt).size(); ++j)
            {
                res << chans[j] << p->input_layouts[i].size.sizes(p->fmt)[j];
            }
        }

        return res.str();
    }

private:

    static std::vector<tests::test_params*> all_generic_params;
    static std::vector<std::shared_ptr<cldnn::primitive>> all_layer_params;

};

std::vector<std::shared_ptr<cldnn::primitive>> softmax_test::all_layer_params = {};
std::vector<tests::test_params*> softmax_test::all_generic_params = {};

TEST_P(softmax_test, SOFTMAX)
{
    run_single_test();
}

INSTANTIATE_TEST_CASE_P(DISABLED_SOFTMAX,
    softmax_test,
    ::testing::Combine(::testing::ValuesIn(softmax_test::generate_generic_test_params()), ::testing::ValuesIn(softmax_test::generate_specific_test_params())),
    softmax_test::custom_param_name);