/*
// Copyright (c) 2016-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include <gtest/gtest.h>
#include "api/memory.hpp"
#include <api/input_layout.hpp>
#include "api/softmax.hpp"
#include <api/topology.hpp>
#include <api/network.hpp>
#include <api/engine.hpp>
#include "test_utils/test_utils.h"

using namespace cldnn;
using namespace std;
using namespace tests;

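// A minimal, self-contained reference softmax kept here only as a sketch for
// reasoning about the expected values hard-coded in the tests below; it is
// not part of the clDNN API. It uses the numerically stable form, which
// subtracts the maximum before exponentiating.
#include <algorithm>
#include <cmath>
#include <vector>

inline std::vector<float> reference_softmax(const std::vector<float>& v) {
    const float max_val = *std::max_element(v.begin(), v.end());
    std::vector<float> out(v.size());
    float z = 0.0f;
    for (size_t i = 0; i < v.size(); ++i) {
        out[i] = std::exp(v[i] - max_val);  // exp(x_i - max) never overflows
        z += out[i];
    }
    for (auto& e : out)
        e /= z;                             // normalize so the outputs sum to 1
    return out;
}
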
class softmax_gpu_xb_f32_test_fixture: public ::testing::Test {
public:
    static const int32_t
        output_x  = 10, output_b  = 2,  // size of whole output buffer
        input_x   = 10, input_b   = 2,  // size of whole input buffer
        in_size   = input_x*input_b,
        out_size  = output_x*output_b;

    float in_buffer[in_size];
    float out_buffer[out_size];
    float expected_buffer[out_size];

    const cldnn::engine& engine;
    cldnn::memory input;

    //neural::primitive output = memory::allocate({ memory::format::xb_f32, {output_b, {{output_x}}, 1}});

    softmax_gpu_xb_f32_test_fixture()
        : engine(get_test_engine())
        ,input(memory::allocate(engine, { data_types::f32, format::yxfb, { input_b, 1, input_x, 1}}))
    {}

    void compare_out_buffer_with_expected() {
        for(size_t i = 0; i < out_size; ++i) {
            // does output have expected values
            EXPECT_TRUE(are_equal(out_buffer[i], expected_buffer[i]))
                << "At ["<< i <<  "] Expected : " << expected_buffer[i] << " actual : " << out_buffer[i];
        }
    }

    void compare_out_buffer_with_expected_batch_wise() {
        for(size_t b = 0; b < output_b; ++b) {
            float batch_wise_sum = 0;
            for(size_t x = 0; x < output_x; ++x) {
                auto idx = b+x*output_b;
                batch_wise_sum += out_buffer[idx];
                // does output have expected values
                EXPECT_TRUE(are_equal(out_buffer[idx], expected_buffer[idx]))
                    << "At ["<< idx <<  "] Expected : " << expected_buffer[idx] << " actual : " << out_buffer[idx];
            }
            // softmax outputs within one batch should sum to 1
            EXPECT_TRUE(are_equal(batch_wise_sum, 1.0f))
                << "Expected : " << 1.0f << " actual : " << batch_wise_sum;
        }
    }
};

TEST_F(softmax_gpu_xb_f32_test_fixture, input_same_values) {
    // in_buffer filled with the same value (1.0f), so the expected softmax
    // output is uniform: 1/10 = 0.1f for each of the 10 x-values in a batch
    for(uint32_t i = 0; i < out_size; ++i) {
        in_buffer[i]       = 1.0f;
        expected_buffer[i] = 0.1f;
    }
    std::vector<float> in_b(std::begin(in_buffer), std::end(in_buffer));

    set_values(input, in_b);

    network network(engine, topology(input_layout("input", input.get_layout()), softmax("softmax", "input")));
    network.set_input_data("input", input);

    auto outputs = network.execute();
    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output_prim = outputs.begin()->second.get_memory();

    auto output_ptr = output_prim.pointer<float>();
    for (uint32_t i = 0; i < out_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }
    compare_out_buffer_with_expected();
}

TEST_F(softmax_gpu_xb_f32_test_fixture, input_same_values_batch_wise) {
    // each batch of in_buffer is filled with a constant value:
    // batch 0 gets 1.0f, batch 1 gets 2.0f (data is laid out xb_f32)
    for(size_t i = 0; i < output_x; ++i) {
        for(size_t j = 0; j < output_b; ++j)
            in_buffer[j + i*output_b] = (j + i*output_b) % 2 + 1.0f;
    }

    std::vector<float> in_b(std::begin(in_buffer), std::end(in_buffer));
    set_values(input, in_b);
    // softmax of a constant batch is uniform, so every expected value is 0.1f
    for(size_t i = 0; i < out_size; ++i)
        expected_buffer[i] = 0.1f;

    network network(engine, topology(input_layout("input", input.get_layout()), softmax("softmax", "input")));
    network.set_input_data("input", input);

    auto outputs = network.execute();
    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output_prim = outputs.begin()->second.get_memory();

    auto output_ptr = output_prim.pointer<float>();
    for (uint32_t i = 0; i < out_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }
    compare_out_buffer_with_expected_batch_wise();
}

TEST_F(softmax_gpu_xb_f32_test_fixture, values_batch_wise) {

    float in_buf[in_size] = {
       //b0  b1
        2.0f, 2.0f, //x0
        2.0f, 2.0f, //x1
        2.0f, 2.0f, //x2
        3.0f, 3.0f, //x3
        5.0f, 5.0f, //x4
        4.0f, 4.0f, //x5
        3.0f, 3.0f, //x6
        2.0f, 2.0f, //x7
        2.0f, 2.0f, //x8
        2.0f, 2.0f  //x9
    };

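    // Expected values derived by hand: per batch the denominator is
    //   Z = 6*e^2 + 2*e^3 + e^4 + e^5 ~= 44.3343 + 40.1711 + 54.5981 + 148.4132 ~= 287.5167,
    // so e.g. e^2/Z ~= 0.02569957, e^3/Z ~= 0.069858674 and e^5/Z ~= 0.516189665.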
    float exp_buf[out_size] = {
        0.02569957f,     0.02569957f,
        0.02569957f,     0.02569957f,
        0.02569957f,     0.02569957f,
        0.069858674f,    0.069858674f,
        0.516189665f,    0.516189665f,
        0.189895565f,    0.189895565f,
        0.069858674f,    0.069858674f,
        0.02569957f,     0.02569957f,
        0.02569957f,     0.02569957f,
        0.02569957f,     0.02569957f
    };

    std::vector<float> in_b(std::begin(in_buf), std::end(in_buf));
    set_values(input, in_b);
    std::copy(exp_buf, exp_buf + out_size, expected_buffer);

    // pre-fill out_buffer with quiet NaN so stale values cannot pass the comparison
    for(size_t i = 0; i < out_size; ++i)
        out_buffer[i] = NAN;

    network network(engine, topology(input_layout("input", input.get_layout()), softmax("softmax", "input")));
    network.set_input_data("input", input);

    auto outputs = network.execute();
    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output_prim = outputs.begin()->second.get_memory();

    auto output_ptr = output_prim.pointer<float>();
    for (uint32_t i = 0; i < out_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }
    compare_out_buffer_with_expected_batch_wise();
}

TEST(softmax_gpu_bfyx_f32, normalize_fyx) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
        batch_num = 2, buf_size = x_size*y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input"));

    set_values(input, {  //bfyx
             //y0x0  y0x1   y1x0    y1x1
        /*b0f0*/0.1f, -0.1f, 0.9f,  1.5f,
        /*b0f1*/0.2f, 0.2f,  -10.f, 5.2f,
        /*b0f2*/0.2f, 0.2f,  -10.f, 5.2f,
        /*b1f0*/3.f,  0.5f,  7.f,   12.f,
        /*b1f1*/4.f,  0.5f,  8.f,   8.2f,
        /*b1f2*/0.2f, 0.2f,  -10.f, 5.2f
    });

    float expected_max_values[2] = {
        0.481618381f, 0.953259517f
    };

    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float sum = 0;
    float expected_sum = 1.0f;

    float temp_max = 0;
    int max_value_buffer_index = 0;

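    // bfyx linear index: x is fastest, then y, then f, then b, i.e.
    //   index = b*F*Y*X + f*Y*X + y*X + x
    // which matches the expression built up inside the loops below.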
    for (uint32_t i = 0; i < batch_num; i++) // sum every value within one batch (over f, y, x); each batch should sum to 1.0f
    {
        for (uint32_t j = 0; j < y_size; j++)
        {
            for (uint32_t k = 0; k < x_size; k++)
            {
                for (uint32_t l = 0; l < feature_num; l++)
                {
                    int index = i * feature_num * x_size * y_size + j * x_size + k + l * x_size * y_size;
                    sum += out_buffer[index];
                    if (out_buffer[index] >= temp_max)
                    {
                        temp_max = out_buffer[index];
                    }
                }
            }
        }

        EXPECT_EQ(true, are_equal(sum, expected_sum));
        sum = 0.0f;
        EXPECT_EQ(true, are_equal(temp_max, expected_max_values[max_value_buffer_index]));
        temp_max = 0;
        max_value_buffer_index++;
    }
}

TEST(softmax_gpu_bfyx_f32, normalize_y) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
        batch_num = 2, buf_size = x_size*y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_y));

    vector<float> input_vec = {
              //y0x0  y0x1   y1x0    y1x1
        /*b0f0*/0.1f, -0.1f, 0.9f,  1.5f,
        /*b0f1*/0.2f, 0.2f,  -10.f, 5.2f,
        /*b0f2*/0.2f, 0.2f,  -10.f, 5.2f,

        /*b1f0*/3.f,  0.5f,  7.f,   12.f,
        /*b1f1*/4.f,  0.5f,  8.f,   8.2f,
        /*b1f2*/0.2f, 0.2f,  -10.f, 5.2f
    };
    set_values(input, input_vec);

    float expected_max_values[12] = {
        0.689974481f,   //b=0, f=0, x=0
        0.832018385f,   //b=0, f=0, x=1

        0.999962831f,   //b=0, f=1, x=0
        0.993307149f,   //b=0, f=1, x=1

        0.999962831f,   //b=0, f=2, x=0
        0.993307149f,   //b=0, f=2, x=1

        0.98201379f,    //b=1, f=0, x=0
        0.99998987f,    //b=1, f=0, x=1

        0.98201379f,    //b=1, f=1, x=0
        0.999547378f,   //b=1, f=1, x=1

        0.999962831f,   //b=1, f=2, x=0
        0.993307149f    //b=1, f=2, x=1
    };

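    // Spot check (hand-derived): for b=0, f=0, x=0 the y-column is {0.1, 0.9},
    // so the larger output is e^0.9 / (e^0.1 + e^0.9) = 1 / (1 + e^-0.8) ~= 0.689974.
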
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float temp_max = 0;
    float expected_sum = 1.0f;
    int max_value_buffer_index = 0;
    for (uint32_t i = 0; i < batch_num; i++) // for each (batch, feature, x) column, the softmax over y must sum to 1.0f
    {
        for (uint32_t l = 0; l < feature_num; l++)
        {
            for (uint32_t k = 0; k < x_size; k++)
            {
                float sum = 0.0f;
                for (uint32_t j = 0; j < y_size; j++)
                {
                    int index = i * feature_num * x_size * y_size +
                        l * x_size * y_size +
                        j * x_size +
                        k;

                    if (out_buffer[index] >= temp_max)
                    {
                        temp_max = out_buffer[index];
                    }

                    sum += out_buffer[index];
                }
                EXPECT_EQ(true, are_equal(temp_max, expected_max_values[max_value_buffer_index]));
                temp_max = 0;
                max_value_buffer_index++;

                EXPECT_EQ(true, are_equal(sum, expected_sum));
                sum = 0.0f;
            }
        }
    }
}

TEST(softmax_gpu_bfyx_f32, normalize_f) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
        batch_num = 2, buf_size = x_size*y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_f));

    vector<float> input_vec = {
        //y0x0  y0x1   y1x0    y1x1
        /*b0f0*/0.1f, -0.1f, 0.9f,  1.5f,
        /*b0f1*/0.2f, 0.2f,  -10.f, 5.2f,
        /*b0f2*/0.2f, 0.2f,  -10.f, 5.2f,

        /*b1f0*/3.f,  0.5f,  7.f,   12.f,
        /*b1f1*/4.f,  0.5f,  8.f,   8.2f,
        /*b1f2*/0.2f, 0.2f,  -10.f, 5.2f
    };
    set_values(input, input_vec);

    float expected_max_values[8] = {
        0.344253346f, //b=0, y=0, x=0
        0.364854551f, //b=0, y=0, x=1

        0.999963085f, //b=0, y=1, x=0
        0.493894592f, //b=0, y=1, x=1

        0.719294981f, //b=1, y=0, x=0
        0.364854551f, //b=1, y=0, x=1

        0.73105857f, //b=1, y=1, x=0
        0.977054322f //b=1, y=1, x=1
    };

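    // Spot check (hand-derived): for b=0, y=0, x=0 the feature column is
    // {0.1, 0.2, 0.2}, so the largest output is
    //   e^0.2 / (e^0.1 + 2*e^0.2) ~= 1.221403 / 3.547977 ~= 0.344253.
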
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float temp_max = 0;
    float expected_sum = 1.0f;
    int max_value_buffer_index = 0;
    for (uint32_t i = 0; i < batch_num; i++) // for each (batch, y, x) position, the softmax over features must sum to 1.0f
    {
        for (uint32_t j = 0; j < y_size; j++)
        {
            for (uint32_t k = 0; k < x_size; k++)
            {
                float sum = 0.0f;
                for (uint32_t l = 0; l < feature_num; l++)
                {
                    int index = i * feature_num * x_size * y_size +
                        l * x_size * y_size +
                        j * x_size +
                        k;

                    if (out_buffer[index] >= temp_max)
                    {
                        temp_max = out_buffer[index];
                    }

                    sum += out_buffer[index];
                }
                EXPECT_EQ(true, are_equal(temp_max, expected_max_values[max_value_buffer_index]));
                temp_max = 0;
                max_value_buffer_index++;

                EXPECT_EQ(true, are_equal(sum, expected_sum));
                sum = 0.0f;
            }
        }
    }
}

TEST(softmax_gpu_yxfb_f32, normalize_f) {
    //  Input  : 12 batches, each holding one pair of values; in the
    //  verification loop feature_num and x_size are both 1, so
    //  normalize_fyx reduces to a softmax over that pair
    static const int32_t x_size = 1, y_size = 2, feature_num = 1,
        batch_num = 12, buf_size = x_size*y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ batch_num, feature_num, y_size , x_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_fyx));

    set_values(input, {  //yxfb
                //f0b0  f0b1  f0b2  f0b3  f0b4    f0b5    f0b6   f0b7   f0b8    f0b9   f0b10  f0b11
        /*y0x0*/ 0.1f, -0.1f, 0.9f, 1.5f, 0.15f, -0.01f, 0.19f,  0.45f, 0.41f, -0.12f, 0.39f, 0.65f,
        /*y1x0*/ 0.2f, 0.2f, -10.f, 5.2f, 0.01f, 0.015f, 0.29f,  0.05f, 0.41f, -0.31f, 0.29f, 1.35f
    });

    float expected_max_values[batch_num * feature_num * x_size] = {
        0.524979174f,
        0.574442506f,
        0.999981523f,
        0.975872993f,
        0.534942925f,
        0.506249666f,
        0.524979174f,
        0.598687649f,
        0.500000000f,
        0.547357619f,
        0.524979174f,
        0.668187797f
    };

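    // Spot check (hand-derived): for b=0 the normalized pair is {0.1, 0.2},
    // so the larger output is 1 / (1 + e^-0.1) ~= 0.524979.
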
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float expected_sum = 1.0f;
    float temp_max = 0;

    for (uint32_t b = 0; b < batch_num; b++)
    {
        for (uint32_t f = 0; f < feature_num; f++)
        {
            for (uint32_t x = 0; x < x_size; x++)
            {
                float sum = 0.0f;
                for (uint32_t y = 0; y < y_size; y++)
                {
                    // yxfb linear index: b changes fastest, then f, then x, then y
                    int index = b + f * batch_num + (x + y * x_size) * feature_num * batch_num;
                    if (out_buffer[index] >= temp_max)
                    {
                        temp_max = out_buffer[index];
                    }
                    sum += out_buffer[index];
                }
                EXPECT_EQ(true, are_equal(temp_max, expected_max_values[b * feature_num * x_size + f * x_size + x]));
                temp_max = 0;
                EXPECT_EQ(true, are_equal(sum, expected_sum));
                sum = 0.0f;
            }
        }
    }
}

TEST(softmax_gpu_bfzyx_f32, normalize_z) {
    //  Input  : 2x3x2x2x2
    static const int32_t x_size = 2, y_size = 2, z_size = 2, feature_num = 3,
        batch_num = 2, buf_size = x_size * y_size * z_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::bfzyx,{ batch_num, feature_num, x_size , y_size, z_size } });
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_z));

    vector<float> input_vec = {
        //    z0y0x0 z0y0x1 z0y1x0 z0y1x1 z1y0x0 z1y0x1 z1y1x0 z1y1x1
        /*b0f0*/0.1f, -0.1f, 0.9f,  1.5f, 0.2f, -0.2f, 0.9f,  2.5f,
        /*b0f1*/0.2f, 0.2f,  -10.f, 5.2f, 0.3f, 0.1f,  -11.f, 6.2f,
        /*b0f2*/0.2f, 0.2f,  -10.f, 5.2f, 0.1f, 0.3f,  -9.f,  4.2f,

        /*b1f0*/3.f,  0.5f,  7.f,   12.f, 5.f,  0.1f,  6.f,   22.f,
        /*b1f1*/4.f,  0.5f,  8.f,   8.2f, 2.2f, 0.3f,  6.f,   5.2f,
        /*b1f2*/0.2f, 0.2f,  -10.f, 5.2f, 1.2f, 0.3f,  -12.f, 2.2f
    };
    set_values(input, input_vec);

    float expected_max_values[24] = {
        0.524979f, 0.524979f,
        0.5f,      0.731059f,
        0.524979f, 0.524979f,
        0.731059f, 0.731059f,
        0.524979f, 0.524979f,
        0.731059f, 0.731059f,
        0.880797f, 0.598688f,
        0.731059f, 0.999955f,
        0.858149f, 0.549834f,
        0.880797f, 0.952574f,
        0.731059f, 0.524979f,
        0.880797f, 0.952574f,
    };

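    // Spot check (hand-derived): for b=0, f=0, y=0, x=0 the z-column is
    // {0.1, 0.2}, so the larger output is 1 / (1 + e^-0.1) ~= 0.524979.
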
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float out_buffer[buf_size];
    for (uint32_t i = 0; i < buf_size; i++)
    {
        out_buffer[i] = get_value<float>(output_ptr, i);
    }

    float temp_max = 0;
    float expected_sum = 1.0f;
    int max_value_buffer_index = 0;
    for (uint32_t i = 0; i < batch_num; i++) // for each (batch, feature, y, x) position, the softmax over z must sum to 1.0f
    {
        for (uint32_t l = 0; l < feature_num; l++)
        {
            for (uint32_t j = 0; j < y_size; j++)
            {
                for (uint32_t k = 0; k < x_size; k++)
                {
                    float sum = 0.0f;
                    for (uint32_t m = 0; m < z_size; m++)
                    {
                        int index = i * feature_num * x_size * y_size * z_size +
                            l * x_size * y_size * z_size +
                            m * x_size * y_size +
                            j * x_size +
                            k;

                        if (out_buffer[index] >= temp_max)
                        {
                            temp_max = out_buffer[index];
                        }

                        sum += out_buffer[index];
                    }
                    EXPECT_EQ(true, are_equal(temp_max, expected_max_values[max_value_buffer_index]));
                    temp_max = 0;
                    max_value_buffer_index++;
                    EXPECT_EQ(true, are_equal(sum, expected_sum));
                    sum = 0.0f;
                }
            }
        }
    }
}

TEST(softmax_gpu_bfyx_f32, normalize_all) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {batch_num, feature_num, x_size, y_size}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//bfyx
                       //       y0x0  y0x1   y1x0    y1x1
                       /*b0f0*/ 0.1f, -0.1f, 0.9f, 1.5f,
                       /*b0f1*/ 0.2f, 0.2f, -10.f, 5.2f,
                       /*b0f2*/ 0.2f, 0.2f, -10.f, 5.2f,
                       /*b1f0*/ 3.f, 0.5f, 7.f, 12.f,
                       /*b1f1*/ 4.f, 0.5f, 8.f, 8.2f,
                       /*b1f2*/ 0.2f, 0.2f, -10.f, 5.2f});

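    // normalize_all runs a single softmax over the entire buffer (all of
    // b, f, y and x at once), so the only invariant the normalize_all tests
    // check is that all buf_size outputs together sum to 1.
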
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += get_value<float>(output_ptr, i);
    }
    EXPECT_EQ(true, are_equal(sum, expected_sum));
}

TEST(softmax_gpu_yxfb_f32, normalize_all) {
    //  Input  : 2x2x3x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {y_size, x_size, feature_num, batch_num}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//yxfb
                       //       f0b0  f0b1   f1b0    f1b1
                       /*y0x0*/ 0.1f, -0.1f, 0.9f, 1.5f,
                       /*y0x1*/ 0.2f, 0.2f, -10.f, 5.2f,
                       /*y0x2*/ 0.2f, 0.2f, -10.f, 5.2f,
                       /*y1x0*/ 3.f, 0.5f, 7.f, 12.f,
                       /*y1x1*/ 4.f, 0.5f, 8.f, 8.2f,
                       /*y1x2*/ 0.2f, 0.2f, -10.f, 5.2f});

    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += get_value<float>(output_ptr, i);
    }
    EXPECT_EQ(true, are_equal(sum, expected_sum));
}

TEST(softmax_gpu_bfzyx_f32, normalize_all) {
    //  Input  : 2x3x2x2x2
    static const int32_t x_size = 2, y_size = 2, z_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * z_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f32, format::bfzyx, {batch_num, feature_num, x_size, y_size, z_size}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//    z0y0x0 z0y0x1 z0y1x0 z0y1x1 z1y0x0 z1y0x1 z1y1x0 z1y1x1
                       /*b0f0*/ 0.1f, -0.1f, 0.9f, 1.5f, 0.2f, -0.2f, 0.9f, 2.5f,
                       /*b0f1*/ 0.2f, 0.2f, -10.f, 5.2f, 0.3f, 0.1f, -11.f, 6.2f,
                       /*b0f2*/ 0.2f, 0.2f, -10.f, 5.2f, 0.1f, 0.3f, -9.f, 4.2f,

                       /*b1f0*/ 3.f, 0.5f, 7.f, 12.f, 5.f, 0.1f, 6.f, 22.f,
                       /*b1f1*/ 4.f, 0.5f, 8.f, 8.2f, 2.2f, 0.3f, 6.f, 5.2f,
                       /*b1f2*/ 0.2f, 0.2f, -10.f, 5.2f, 1.2f, 0.3f, -12.f, 2.2f});
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<float>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += get_value<float>(output_ptr, i);
    }
    EXPECT_EQ(true, are_equal(sum, expected_sum));
}

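// The f16 variants below mirror the f32 normalize_all tests: outputs are read
// back as raw uint16_t and converted with float16_to_float32, and ASSERT_NEAR
// with a 0.001 tolerance absorbs half-precision rounding error.
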
TEST(softmax_gpu_bfyx_f16, normalize_all) {
    //  Input  : 2x3x2x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f16, format::bfyx, {batch_num, feature_num, x_size, y_size}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//bfyx
                       //           y0x0            y0x1            y1x0            y1x1
                       /*b0f0*/ FLOAT16(0.1f), FLOAT16(-0.1f), FLOAT16(0.9f), FLOAT16(1.5f),
                       /*b0f1*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f),
                       /*b0f2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f),
                       /*b1f0*/ FLOAT16(3.f), FLOAT16(0.5f), FLOAT16(7.f), FLOAT16(12.f),
                       /*b1f1*/ FLOAT16(4.f), FLOAT16(0.5f), FLOAT16(8.f), FLOAT16(8.2f),
                       /*b1f2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f)});

    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<uint16_t>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += float16_to_float32(get_value<uint16_t>(output_ptr, i));
    }
    ASSERT_NEAR(sum, expected_sum, 0.001);
}

TEST(softmax_gpu_yxfb_f16, normalize_all) {
    //  Input  : 2x2x3x2
    static const int32_t x_size = 2, y_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f16, format::yxfb, {y_size, x_size, feature_num, batch_num}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//yxfb
                       //           f0b0            f0b1            f1b0            f1b1
                       /*y0x0*/ FLOAT16(0.1f), FLOAT16(-0.1f), FLOAT16(0.9f), FLOAT16(1.5f),
                       /*y0x1*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f),
                       /*y0x2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f),
                       /*y1x0*/ FLOAT16(3.f), FLOAT16(0.5f), FLOAT16(7.f), FLOAT16(12.f),
                       /*y1x1*/ FLOAT16(4.f), FLOAT16(0.5f), FLOAT16(8.f), FLOAT16(8.2f),
                       /*y1x2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f)});

    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<uint16_t>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += float16_to_float32(get_value<uint16_t>(output_ptr, i));
    }
    ASSERT_NEAR(sum, expected_sum, 0.001);
}

TEST(softmax_gpu_bfzyx_f16, normalize_all) {
    //  Input  : 2x3x2x2x2
    static const int32_t x_size = 2, y_size = 2, z_size = 2, feature_num = 3,
                         batch_num = 2, buf_size = x_size * y_size * z_size * batch_num * feature_num;
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, {data_types::f16, format::bfzyx, {batch_num, feature_num, x_size, y_size, z_size}});
    topology topology;
    topology.add(input_layout("input", input.get_layout()));
    topology.add(softmax("softmax", "input", softmax::normalize_all));

    set_values(input, {//           z0y0x0          z0y0x1          z0y1x0        z0y1x1        z1y0x0          z1y0x1          z1y1x0          z1y1x1
                       /*b0f0*/ FLOAT16(0.1f), FLOAT16(-0.1f), FLOAT16(0.9f), FLOAT16(1.5f), FLOAT16(0.2f), FLOAT16(-0.2f), FLOAT16(0.9f), FLOAT16(2.5f),
                       /*b0f1*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f), FLOAT16(0.3f), FLOAT16(0.1f), FLOAT16(-11.f), FLOAT16(6.2f),
                       /*b0f2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f), FLOAT16(0.1f), FLOAT16(0.3f), FLOAT16(-9.f), FLOAT16(4.2f),

                       /*b1f0*/ FLOAT16(3.f), FLOAT16(0.5f), FLOAT16(7.f), FLOAT16(12.f), FLOAT16(5.f), FLOAT16(0.1f), FLOAT16(6.f), FLOAT16(22.f),
                       /*b1f1*/ FLOAT16(4.f), FLOAT16(0.5f), FLOAT16(8.f), FLOAT16(8.2f), FLOAT16(2.2f), FLOAT16(0.3f), FLOAT16(6.f), FLOAT16(5.2f),
                       /*b1f2*/ FLOAT16(0.2f), FLOAT16(0.2f), FLOAT16(-10.f), FLOAT16(5.2f), FLOAT16(1.2f), FLOAT16(0.3f), FLOAT16(-12.f), FLOAT16(2.2f)});
    network network(engine, topology);

    network.set_input_data("input", input);
    auto outputs = network.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "softmax");

    auto output = outputs.at("softmax").get_memory();
    auto output_ptr = output.pointer<uint16_t>();
    float sum = 0.0f;
    float expected_sum = 1.0f;
    for (uint32_t i = 0; i < buf_size; i++) {
        sum += float16_to_float32(get_value<uint16_t>(output_ptr, i));
    }
    ASSERT_NEAR(sum, expected_sum, 0.001);
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//                      Exhaustive Negative Matrix tests                    //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

//TODO:
//TEST(NegativeSoftmaxTest, DISABLED_TestAll) {
//}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//                      Exhaustive Positive Matrix tests                    //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

using namespace cldnn;

class softmax_test : public tests::generic_test
{

public:
    softmax_test() : tests::generic_test()
    {
    }

    virtual void SetUp() override
    {
        max_ulps_diff_allowed = 6;
    }

    static void TearDownTestCase()
    {
        for (auto generic_params : all_generic_params)
        {
            delete generic_params;
        }
        all_generic_params.clear();

        all_layer_params.clear();
    }

    static std::vector<std::shared_ptr<cldnn::primitive>> generate_specific_test_params()
    {
        all_layer_params.emplace_back(new softmax("softmax", "input0", softmax::normalize_f));

        //The test checks only valid combinations.
        //TODO: add more combinations.

        return all_layer_params;
    }

    static std::vector<tests::test_params*> generate_generic_test_params()
    {
        return generic_test::generate_generic_test_params(all_generic_params);
    }

    virtual bool is_format_supported(cldnn::format format) override
    {
        return
            format == cldnn::format::yxfb ||
            format == cldnn::format::bfyx;
    }

    template<typename Type>
    memory generate_reference_typed(const std::vector<memory> & inputs)
    {
        assert(inputs.size() == 1);
        const memory & input = inputs[0];

        //Output is bfyx
        auto output = memory::allocate(engine, cldnn::layout(input.get_layout().data_type, input.get_layout().format, input.get_layout().size));

//        const auto params = static_cast<cldnn::softmax *>(layer_params);

        const auto in0_mem = input.pointer<Type>();
        auto out_mem = output.pointer<Type>();

        const int in0_b = input.get_layout().size.sizes()[0];
        const int in0_f = input.get_layout().size.sizes()[1];
        const int in0_h = input.get_layout().size.sizes()[3];
        const int in0_w = input.get_layout().size.sizes()[2];

//        const int out_b = output.get_layout().size.transform(cldnn::format::bfyx, 0).sizes()[0];
//        const int out_f = output.get_layout().size.transform(cldnn::format::bfyx, 0).sizes()[1];
//        const int out_h = output.get_layout().size.transform(cldnn::format::bfyx, 0).sizes()[2];
//        const int out_w = output.get_layout().size.transform(cldnn::format::bfyx, 0).sizes()[3];

//        assert(in0_b == out_b);
//        assert(in0_f == out_f);
//        assert(in0_h == out_h);
//        assert(in0_w == out_w);

        std::vector<float> cached_exp_vals;
        cached_exp_vals.resize(in0_f);

        const auto input_desc = get_linear_memory_desc(input.get_layout());

        for (int n = 0; n < in0_b; ++n)
        for (int y = 0; y < in0_h; ++y)
        for (int x = 0; x < in0_w; ++x)
        {
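            // Numerically stable softmax along f: find the per-position max
            // first, then exponentiate the shifted values so exp() cannot
            // overflow, and finally divide each cached exponential by their sum Z.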
            float max_val = -std::numeric_limits<float>::infinity();

            for (int c = 0; c < in0_f; ++c)
            {
                const size_t in0_idx = get_linear_index(input.get_layout(), n, c, y, x, input_desc);

                max_val = std::max(max_val, static_cast<float>(in0_mem[in0_idx]));
            }

            float Z = 0;

            for (int c = 0; c < in0_f; ++c)
            {
                const size_t in0_idx = get_linear_index(input.get_layout(), n, c, y, x, input_desc);

                float tmp = static_cast<float>((Type)std::exp(static_cast<float>(in0_mem[in0_idx]) - max_val));
                Z += tmp;
                cached_exp_vals[c] = tmp;
            }

            for (int c = 0; c < in0_f; ++c)
            {
                const size_t out_idx = get_linear_index(output.get_layout(), n, c, y, x, input_desc);
                out_mem[out_idx] = (Type)(cached_exp_vals[c] / Z);
            }
        }

        return output;
    }

    virtual memory generate_reference(const std::vector<memory> & inputs) override
    {
        if (generic_params->data_type == data_types::f32)
        {
            return generate_reference_typed<float>(inputs);
        }
        else
        {
            return generate_reference_typed<FLOAT16>(inputs);
        }
    }

    static std::string custom_param_name(const ::testing::TestParamInfo<std::tuple<test_params*, std::shared_ptr<cldnn::primitive>>>& info)
    {
        std::stringstream res;

        const auto & p = std::get<0>(info.param);

        assert(p->data_type == data_types::f32 ||
               p->data_type == data_types::f16);

        res << info.index
            << "_" << (p->data_type == data_types::f32 ? "f32" : "f16");

        for (unsigned i = 0; i < p->input_layouts.size(); ++i)
        {
            const auto chans = format::traits(p->fmt).order;

            res << "_" << "Input" << i;
            for (unsigned int j = 0; j < p->input_layouts[i].size.sizes(p->fmt).size(); ++j)
            {
                res << chans[j] << p->input_layouts[i].size.sizes(p->fmt)[j];
            }
        }

        return res.str();
    }

private:

    static std::vector<tests::test_params*> all_generic_params;
    static std::vector<std::shared_ptr<cldnn::primitive>> all_layer_params;

};

std::vector<std::shared_ptr<cldnn::primitive>> softmax_test::all_layer_params = {};
std::vector<tests::test_params*> softmax_test::all_generic_params = {};

TEST_P(softmax_test, SOFTMAX)
{
    run_single_test();
}

INSTANTIATE_TEST_CASE_P(DISABLED_SOFTMAX,
    softmax_test,
    ::testing::Combine(::testing::ValuesIn(softmax_test::generate_generic_test_params()), ::testing::ValuesIn(softmax_test::generate_specific_test_params())),
    softmax_test::custom_param_name);