Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / tests / test_cases / pooling_gpu_test.cpp
1 /*
2 // Copyright (c) 2016-2019 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
18 #include <gtest/gtest.h>
19 #include "api/CPP/memory.hpp"
20 #include <api/CPP/input_layout.hpp>
21 #include "api/CPP/pooling.hpp"
22 #include "api/CPP/mutable_data.hpp"
23 #include <api/CPP/topology.hpp>
24 #include <api/CPP/network.hpp>
25 #include <api/CPP/engine.hpp>
26 #include "test_utils/test_utils.h"
27 #include "api/CPP/reorder.hpp"
28 #include <api/CPP/data.hpp>
29 #include "test_utils/float16.h"
30
31 using namespace cldnn;
32 using namespace tests;
33
34
35 TEST(pooling_forward_gpu, basic_max_byxf_f32_wsiz3x3_wstr1x1_i1x3x3x8_nopad) {
36     //  Brief test description.
37     //
38     //  Pool window: 3x3
39     //  Pool stride: 1x1
40     //  Pool mode: max
41     //  Padding: none
42     //
43     //  Input data:
44     //  [ 0.5, -0.5, -0.5, -0.5, 0.5f, -0.5, -0.5f, -0.5 ]
45     //  [ 1.0, 0.0, 0.0, 0.0, 0.5, -0.5, -0.5, -0.5 ]
46     //  [ 2.0, 0.0, 0.0, 0.0, 0.5, -0.5, -0.5, -0.5 ]
47     //  [ 3.0, 0.0, 0.0, 0.0, 0.5, -0.5, -0.5, -0.5 ]
48     //  [ 4.0, 0.0, 0.0, 0.0, 0.5, -0.5, -0.5, -0.5 ]
49     //  [ 5.0, 0.0, 0.0, 0.0, 0.5, -0.5, -0.5, -0.5 ]
50     //  [ 6.0, 0.0, 0.0, 0.0, 0.5, -0.5, -0.5, -0.5 ]
51     //  [ 7.0, 0.0, 0.0, 0.0, 0.5, -0.5, -0.5, -0.5 ]
52     //  [ 8.0, 0.0, 0.0, 4.0, 0.5, -0.5, -0.5, -0.5 ]
53     //
54     //  Expected output:
55     //  [ 8.0, 0.0, 0.0, 4,0, 0,5, -0.5, -0.5, -0.5 ]
56
57     const auto& engine = get_test_engine();
58
59     auto input_prim = memory::allocate(engine, { data_types::f32,  format::byxf,{ 1, 8, 3, 3 } });
60
61     topology topology;
62     topology.add(input_layout("input_prim", input_prim.get_layout()));
63     topology.add(pooling("pool_prim", "input_prim", pooling_mode::max, { 1,1,3,3 }, { 1,1,1,1 }));
64     network network(engine, topology);
65     set_values(input_prim, { 0.5f, -0.5f, -0.5f, -0.5f, 0.5f, -0.5f, -0.5f, -0.5f,
66         1.0f, 0.0f, 0.0f, 0.0f, 0.5f, -0.5f, -0.5f, -0.5f,
67         2.0f, 0.0f, 0.0f, 0.0f, 0.5f, -0.5f, -0.5f, -0.5f,
68         3.0f, 0.0f, 0.0f, 0.0f, 0.5f, -0.5f, -0.5f, -0.5f,
69         4.0f, 0.0f, 0.0f, 0.0f, 0.5f, -0.5f, -0.5f, -0.5f,
70         5.0f, 0.0f, 0.0f, 0.0f, 0.5f, -0.5f, -0.5f, -0.5f,
71         6.0f, 0.0f, 0.0f, 0.0f, 0.5f, -0.5f, -0.5f, -0.5f,
72         7.0f, 0.0f, 0.0f, 0.0f, 0.5f, -0.5f, -0.5f, -0.5f,
73         8.0f, 0.0f, 0.0f, 4.0f, 0.5f, -0.5f, -0.5f, -0.5f });
74     network.set_input_data("input_prim", input_prim);
75
76     auto outputs = network.execute();
77     EXPECT_EQ(outputs.size(), size_t(1));
78     EXPECT_EQ(outputs.begin()->first, "pool_prim");
79
80     auto output_prim = outputs.begin()->second.get_memory();
81
82     auto output_ptr = output_prim.pointer<float>();
83     EXPECT_EQ(4.0f, output_ptr[3]);
84 }
85
86 TEST(pooling_forward_gpu, basic_max_yxfb_f32_wsiz3x3_wstr1x1_i3x3x1x1_nopad) {
87     //  Brief test description.
88     //
89     //  Pool window: 3x3
90     //  Pool stride: 1x1
91     //  Pool mode: max
92     //  Padding: none
93     //
94     //  Input data:
95     //  [-0.5,  1.0,  0.5]
96     //  [ 2.0,  1.5, -0.5]
97     //  [ 0.0, -1.0,  0.5]
98     //
99     //  Expected output:
100     //  [ 2.0]
101
102     const auto& engine = get_test_engine();
103
104     auto input_prim = memory::allocate(engine, { data_types::f32,  format::yxfb, { 1, 1, 3, 3 } });
105
106     topology topology;
107     topology.add(input_layout("input_prim", input_prim.get_layout()));
108     topology.add(pooling("pool_prim", "input_prim", pooling_mode::max, { 1,1,3,3 }, { 1,1,1,1 }));
109
110     network network(engine, topology);
111     set_values(input_prim, { -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f });
112     network.set_input_data("input_prim", input_prim);
113
114     auto outputs = network.execute();
115     EXPECT_EQ(outputs.size(), size_t(1));
116     EXPECT_EQ(outputs.begin()->first, "pool_prim");
117
118     auto output_prim = outputs.begin()->second.get_memory();
119
120     auto output_ptr = output_prim.pointer<float>();
121
122     EXPECT_EQ(2.0f, output_ptr[0]);
123 }
124
125 TEST(pooling_forward_gpu, basic_max_yxfb_f32_global_i3x3x1x1_nopad) {
126     //  Brief test description.
127     //
128     //  Pool mode: max
129     //  Global pooling: true
130     //  Padding: none
131     //
132     //  Input data:
133     //  [-0.5,  1.0,  0.5]
134     //  [ 2.0,  1.5, -0.5]
135     //  [ 0.0, -1.0,  0.5]
136     //
137     //  Expected output:
138     //  [ 2.0]
139
140     const auto& engine = get_test_engine();
141
142     auto input_prim = memory::allocate(engine, { data_types::f32,  format::yxfb,{ 1, 1, 3, 3 } });
143
144     topology topology;
145     topology.add(input_layout("input_prim", input_prim.get_layout()));
146     topology.add(pooling("pool_prim", "input_prim", pooling_mode::max));
147
148     network network(engine, topology);
149     set_values(input_prim, { -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f });
150     network.set_input_data("input_prim", input_prim);
151
152     auto outputs = network.execute();
153     EXPECT_EQ(outputs.size(), size_t(1));
154     EXPECT_EQ(outputs.begin()->first, "pool_prim");
155
156     auto output_prim = outputs.begin()->second.get_memory();
157
158     auto output_ptr = output_prim.pointer<float>();
159
160     EXPECT_EQ(2.0f, output_ptr[0]);
161 }
162
163 TEST(pooling_forward_gpu, basic_max_pooling_int8) {
164
165     const auto& engine = get_test_engine();
166     layout in_layout = { type_to_data_type<float>::value,format::byxf,{ 1,1,3,3 } };
167     layout out_layout = { type_to_data_type<float>::value,format::byxf,{ 1,1,1,1 } };
168     layout byte_layout = { type_to_data_type<int8_t>::value, format::bfyx,{ 1,1,3,3 } };
169     std::initializer_list<float> input_f = { 1.0f, -2.5f, 3.1f, -4.0f, 5.03f, -6.99f, 7.0f, -8.0f, 9.5f };
170     std::list<float> final_results = { 9.0f };
171
172     // Allocate memory for input image.
173     auto input_memory = memory::allocate(engine, in_layout);
174     set_values(input_memory, input_f);
175
176     // Create input_layout description
177     // "input" - is the primitive id inside topology
178     input_layout input("input", in_layout);
179
180     topology topology(
181         // 1. input layout primitive.
182         input,
183         // 2. reorder primitive with id "reorder_input"
184         reorder("reorder_input", input, byte_layout),
185         pooling("pool1", "reorder_input", pooling_mode::max, { 1,1,3,3 }, {1,1,1,1}),
186         reorder("reorder2", "pool1", out_layout)
187     );
188
189     network network(
190         engine,
191         topology,
192         {
193             build_option::outputs({ "reorder2" })
194         });
195
196     network.set_input_data("input", input_memory);
197
198     auto outputs = network.execute();
199
200     auto interm = outputs.at("reorder2").get_memory();
201     auto interm_ptr = interm.pointer<float>();
202     auto output_size = outputs.at("reorder2").get_memory().count();
203     unsigned int cntr = 0;
204     for (const auto& exp : final_results)
205     {
206         EXPECT_EQ(exp, interm_ptr[cntr++]);
207     }
208 }
209
210 TEST(pooling_forward_gpu, basic_avg_pooling_int8) {
211
212     const auto& engine = get_test_engine();
213     layout in_layout = { type_to_data_type<float>::value,format::byxf,{ 1,1,3,3 } };
214     layout out_layout = { type_to_data_type<float>::value,format::byxf,{ 1,1,1,1 } };
215     layout byte_layout = { type_to_data_type<int8_t>::value, format::bfyx,{ 1,1,3,3 } };
216     std::initializer_list<float> input_f = { 2.0f, -2.5f, 5.1f, -4.0f, 8.03f, -6.99f, 17.0f, -8.0f, 19.5f };
217     auto final_result = 0.0f;
218     for (const auto& val : input_f)
219     {
220         final_result += (float)((char)val);
221     }
222     final_result /= input_f.size();
223     final_result = (float)((char)final_result);
224     // Allocate memory for input image.
225     auto input_memory = memory::allocate(engine, in_layout);
226     set_values(input_memory, input_f);
227
228     // Create input_layout description
229     // "input" - is the primitive id inside topology
230     input_layout input("input", in_layout);
231
232     topology topology(
233         // 1. input layout primitive.
234         input,
235         // 2. reorder primitive with id "reorder_input"
236         reorder("reorder_input", input, byte_layout),
237         pooling("pool1", "reorder_input", pooling_mode::average, { 1,1,3,3 }, { 1,1,1,1 }),
238         reorder("reorder2", "pool1", out_layout)
239     );
240
241     network network(
242         engine,
243         topology,
244         {
245             build_option::outputs({ "reorder2" })
246         });
247
248     network.set_input_data("input", input_memory);
249
250     auto outputs = network.execute();
251
252     auto interm = outputs.at("reorder2").get_memory();
253     auto interm_ptr = interm.pointer<float>();
254     auto output_size = outputs.at("reorder2").get_memory().count();
255     unsigned int cntr = 0;
256     EXPECT_EQ(final_result, interm_ptr[0]);
257 }
258
259 TEST(pooling_forward_gpu, basic_max_yxfb_f32_wsiz2x2_wstr1x1_i3x3x1x1_nopad) {
260     //  Brief test description.
261     //
262     //  Pool window: 2x2
263     //  Pool stride: 1x1
264     //  Pool mode: max
265     //  Padding: none
266     //
267     //  Input data:
268     //  [-0.5,  1.0,  0.5]
269     //  [ 2.0,  1.5, -0.5]
270     //  [ 0.0, -1.0,  0.5]
271     //
272     //  Expected output:
273     //  [ 2.0,  1.5]
274     //  [ 2.0,  1.5]
275
276     const auto& engine = get_test_engine();
277
278     auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } });
279
280     topology topology;
281     topology.add(input_layout("input_prim", input_prim.get_layout()));
282     topology.add(pooling("pool_prim", "input_prim", pooling_mode::max, { 1,1,2,2 }, { 1,1,1,1 }));
283
284     network network(engine, topology);
285     set_values(input_prim, { -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f });
286     network.set_input_data("input_prim", input_prim);
287
288     auto outputs = network.execute();
289     EXPECT_EQ(outputs.size(), size_t(1));
290     EXPECT_EQ(outputs.begin()->first, "pool_prim");
291
292     auto output_prim = outputs.begin()->second.get_memory();
293
294     auto output_ptr = output_prim.pointer<float>();
295
296     EXPECT_EQ(2.0f, output_ptr[0]);
297     EXPECT_EQ(1.5f, output_ptr[1]);
298     EXPECT_EQ(2.0f, output_ptr[2]);
299     EXPECT_EQ(1.5f, output_ptr[3]);
300 }
301
302 TEST(pooling_forward_gpu, basic_max_yxfb_f32_wsiz2x2_wstr2x2_i4x4x1x1_nopad) {
303     //  Brief test description.
304     //
305     //  Pool window: 2x2
306     //  Pool stride: 2x2
307     //  Pool mode: max
308     //  Padding: none
309     //
310     //  Input data:
311     //  [-0.25,  1.00,  0.50,  0.25]
312     //  [ 2.00,  1.50, -0.50, -0.75]
313     //  [ 0.00, -1.00,  0.50,  0.25]
314     //  [ 0.50, -2.00, -1.50, -2.50]
315     //
316     //  Expected output:
317     //  [ 2.0,  0.5]
318     //  [ 0.5,  0.5]
319
320     const auto& engine = get_test_engine();
321
322     auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } });
323
324     topology topology;
325     topology.add(input_layout("input_prim", input_prim.get_layout()));
326     topology.add(pooling("pool_prim", "input_prim", pooling_mode::max, { 1,1,2,2 }, { 1,1,2,2 }));
327
328     network network(engine, topology);
329     set_values(input_prim, { -0.25f, 1.00f, 0.50f, 0.25f, 2.00f, 1.50f, -0.50f, -0.75f, 0.00f, -1.00f, 0.50f, 0.25f, 0.50f, -2.00f, -1.50f, -2.50f });
330     network.set_input_data("input_prim", input_prim);
331
332     auto outputs = network.execute();
333     EXPECT_EQ(outputs.size(), size_t(1));
334     EXPECT_EQ(outputs.begin()->first, "pool_prim");
335
336     auto output_prim = outputs.begin()->second.get_memory();
337
338     auto output_ptr = output_prim.pointer<float>();
339
340     EXPECT_EQ(2.0f, output_ptr[0]);
341     EXPECT_EQ(0.5f, output_ptr[1]);
342     EXPECT_EQ(0.5f, output_ptr[2]);
343     EXPECT_EQ(0.5f, output_ptr[3]);
344 }
345
346 TEST(pooling_forward_gpu, basic_max_yxfb_f32_wsiz2x2_wstr1x1_i3x3x2x2_nopad) {
347     //  Brief test description.
348     //
349     //  Pool window: 2x2
350     //  Pool stride: 1x1
351     //  Pool mode: max
352     //  Padding: none
353     //
354     //  Input data:
355     //  FM: 0 BATCH: 0       FM: 1 BATCH: 0
356     //  [-0.5,  0.5,  0.0]   [-1.5, -0.5,  0.0]
357     //  [ 1.0, -1.0, -2.0]   [ 0.0, -1.0,  1.5]
358     //  [-1.0, -0.5, -0.5]   [-2.0,  1.0, -0.5]
359     //
360     //  FM: 0 BATCH: 1       FM: 1 BATCH: 1
361     //  [ 0.5,  0.0, -0.5]   [ 0.0,  0.5, -0.5]
362     //  [-2.0, -1.0,  1.0]   [ 1.0, -1.0,  0.0]
363     //  [-0.5, -1.0,  1.5]   [ 0.5, -0.5,  0.0]
364     //
365     //  Expected output:
366     //  FM: 0 BATCH: 0       FM: 1 BATCH: 0
367     //  [ 1.0,  0.5]         [ 0.0,  1.5]
368     //  [ 1.0, -0.5]         [ 1.0,  1.5]
369     //
370     //  FM: 0 BATCH: 1       FM: 1 BATCH: 1
371     //  [ 0.5,  1.0]         [ 1.0,  0.5]
372     //  [-0.5,  1.5]         [ 1.0,  0.0]
373
374     const auto& engine = get_test_engine();
375
376     auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 3, 3 } });
377
378     topology topology;
379     topology.add(input_layout("input_prim", input_prim.get_layout()));
380     topology.add(pooling("pool_prim", "input_prim", pooling_mode::max, { 1,1,2,2 }, { 1,1,1,1 }));
381
382     network network(engine, topology);
383     set_values(input_prim, { -0.5f, 0.5f, -1.5f, 0.0f, 0.5f, 0.0f, -0.5f, 0.5f, 0.0f, -0.5f, 0.0f, -0.5f, 1.0f, -2.0f, 0.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -2.0f, 1.0f, 1.5f, 0.0f, -1.0f, -0.5f, -2.0f, 0.5f, -0.5f, -1.0f, 1.0f, -0.5f, -0.5f, 1.5f, -0.5f, 0.0f });
384     network.set_input_data("input_prim", input_prim);
385
386     auto outputs = network.execute();
387     EXPECT_EQ(outputs.size(), size_t(1));
388     EXPECT_EQ(outputs.begin()->first, "pool_prim");
389
390     auto output_prim = outputs.begin()->second.get_memory();
391
392     auto output_ptr = output_prim.pointer<float>();
393
394     EXPECT_EQ(1.0f, output_ptr[0]); EXPECT_EQ(0.0f, output_ptr[2]);
395     EXPECT_EQ(0.5f, output_ptr[4]); EXPECT_EQ(1.5f, output_ptr[6]);
396     EXPECT_EQ(1.0f, output_ptr[8]); EXPECT_EQ(1.0f, output_ptr[10]);
397     EXPECT_EQ(-0.5f, output_ptr[12]); EXPECT_EQ(1.5f, output_ptr[14]);
398
399     EXPECT_EQ(0.5f,  output_ptr[1]);  EXPECT_EQ(1.0f, output_ptr[3]);
400     EXPECT_EQ(1.0f,  output_ptr[5]);  EXPECT_EQ(0.5f, output_ptr[7]);
401     EXPECT_EQ(-0.5f, output_ptr[9]);  EXPECT_EQ(1.0f, output_ptr[11]);
402     EXPECT_EQ(1.5f,  output_ptr[13]); EXPECT_EQ(0.0f, output_ptr[15]);
403 }
404
405 TEST(pooling_forward_gpu, offsets_max_yxfb_f32_wsiz2x2_wstr2x2_i2x2x1x1_zeropad) {
406     //  Brief test description.
407     //
408     //  Pool window: 2x2
409     //  Pool stride: 2x2
410     //  Pool mode: max
411     //  Padding: zero
412     //
413     //  Input offset : -1x-1
414     //  Input data:
415     //  [ padd, padd, padd, padd]
416     //  [ padd,  1.5, -0.5, padd]
417     //  [ padd, -1.0,  0.5, padd]
418     //  [ padd, padd, padd, padd]
419     //
420     //  Expected output:
421     //  [ 1.5, -0.5]
422     //  [   -1, 0.5]
423
424     const auto& engine = get_test_engine();
425
426     auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } });
427
428     topology topology;
429     topology.add(input_layout("input_prim", input_prim.get_layout()));
430     topology.add(pooling("pool_prim", "input_prim", pooling_mode::max, { 1,1,2,2 }, { 1,1,2,2 }, { 0, 0, -1,-1 }));
431
432     network network(engine, topology);
433     set_values(input_prim, { 1.50f, -0.50f, -1.00f, 0.50f });
434     network.set_input_data("input_prim", input_prim);
435
436     auto outputs = network.execute();
437     EXPECT_EQ(outputs.size(), size_t(1));
438     EXPECT_EQ(outputs.begin()->first, "pool_prim");
439
440     auto output_prim = outputs.begin()->second.get_memory();
441
442     auto output_ptr = output_prim.pointer<float>();
443     EXPECT_EQ( 1.5f, output_ptr[0]);
444     EXPECT_EQ(-0.5f, output_ptr[1]);
445     EXPECT_EQ(-1.0f, output_ptr[2]);
446     EXPECT_EQ( 0.5f, output_ptr[3]);
447 }
448
449 TEST(pooling_forward_gpu, offsets_max_yxfb_f32_wsiz2x2_wstr2x2_i3x3x1x1_zeropad) {
450     //  Brief test description.
451     //
452     //  Pool window: 2x2
453     //  Pool stride: 2x2
454     //  Pool mode: max
455     //  Padding: zero
456     //
457     //  Input offset : -1x-1
458     //  Input data:
459     //  [ padd, padd, padd, padd, padd]
460     //  [ padd,  1.5, -1.0, -0.5, padd]
461     //  [ padd,  1.0, -1.0, -1.0, padd]
462     //  [ padd, -1.0, -1.0, -0.5, padd]
463     //  [ padd, padd, padd, padd, padd]
464     //
465     //  Expected output:
466     //  [ 1.5,  -0.5]
467     //  [   1,  -0.5]
468
469     const auto& engine = get_test_engine();
470
471     auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } });
472
473     topology topology;
474     topology.add(input_layout("input_prim", input_prim.get_layout()));
475     topology.add(pooling("pool_prim", "input_prim", pooling_mode::max, { 1,1,2,2 }, { 1,1,2,2 }, { 0,0,-1,-1 }));
476
477     network network(engine, topology);
478
479     set_values(input_prim, { 
480         1.50f, -1.00f, -0.50f,
481         1.00f, -1.00f, -1.00f,
482        -1.00f, -1.00f, -0.50f
483     });
484
485     network.set_input_data("input_prim", input_prim);
486
487     auto outputs = network.execute();
488     EXPECT_EQ(outputs.size(), size_t(1));
489     EXPECT_EQ(outputs.begin()->first, "pool_prim");
490
491     auto output_prim = outputs.begin()->second.get_memory();
492     EXPECT_EQ((int)output_prim.get_layout().size.count(), 4);
493
494     auto output_ptr = output_prim.pointer<float>();
495     EXPECT_EQ(1.5f, get_value<float>(output_ptr, 0));
496     EXPECT_EQ(-0.5f, get_value<float>(output_ptr, 1));
497     EXPECT_EQ(1.0f, get_value<float>(output_ptr, 2));
498     EXPECT_EQ(-0.5f, get_value<float>(output_ptr, 3));
499 }
500
501 TEST(pooling_forward_gpu, basic_avg_yxfb_f32_wsiz2x2_wstr1x1_i3x3x1x1_nopad) {
502     //  Brief test description.
503     //
504     //  Pool window: 2x2
505     //  Pool stride: 1x1
506     //  Pool mode: avg
507     //  Padding: none
508     //
509     //  Input data:
510     //  [-0.5,  1.0,  0.5]
511     //  [ 2.0,  1.5, -0.5]
512     //  [ 4.0, -1.0,  3.5]
513     //
514     //  Expected output:
515     //  [ 1.0,   0.625]
516     //  [ 1.625, 0.875]
517
518     const auto& engine = get_test_engine();
519
520     auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } });
521
522     topology topology;
523     topology.add(input_layout("input_prim", input_prim.get_layout()));
524     topology.add(pooling("pool_prim", "input_prim", pooling_mode::average,{ 1,1,2,2 },{ 1,1,1,1 }));
525
526     network network(engine, topology);
527     set_values(input_prim, { -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 4.0f, -1.0f, 3.5f });
528     network.set_input_data("input_prim", input_prim);
529
530     auto outputs = network.execute();
531     EXPECT_EQ(outputs.size(), size_t(1));
532     EXPECT_EQ(outputs.begin()->first, "pool_prim");
533
534     auto output_prim = outputs.begin()->second.get_memory();
535
536     auto output_ptr = output_prim.pointer<float>();
537     
538     EXPECT_EQ(1.0f,   output_ptr[0]);
539     EXPECT_EQ(0.625f, output_ptr[1]);
540     EXPECT_EQ(1.625f, output_ptr[2]);
541     EXPECT_EQ(0.875f, output_ptr[3]);
542 }
543
544 TEST(pooling_forward_gpu, offsets_avg_yxfb_f32_wsiz2x2_wstr2x2_i2x2x1x1_zeropad) {
545     //  Brief test description.
546     //
547     //  Pool window: 2x2
548     //  Pool stride: 2x2
549     //  Pool mode: avg
550     //  Padding: zero
551     //
552     //  Input offset : -1x-1
553     //  Input data:
554     //  [ padd, padd, padd, padd]
555     //  [ padd,  1.5, -0.5, padd]
556     //  [ padd, -1.0,  0.5, padd]
557     //  [ padd, padd, padd, padd]
558     //
559     //  Expected output:
560     //  [ 0.375, -0.125]
561     //  [ -0.25,  0.125]
562
563     const auto& engine = get_test_engine();
564
565     auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } });
566
567     topology topology;
568     topology.add(input_layout("input_prim", input_prim.get_layout()));
569     topology.add(pooling("pool_prim", "input_prim", pooling_mode::average, { 1,1,2,2 }, { 1,1,2,2 }, { 0,0,-1,-1 }));
570
571     network network(engine, topology);
572     set_values(input_prim, { 1.5f, -0.5f, -1.0f, 0.5f });
573     network.set_input_data("input_prim", input_prim);
574
575     auto outputs = network.execute();
576     EXPECT_EQ(outputs.size(), size_t(1));
577     EXPECT_EQ(outputs.begin()->first, "pool_prim");
578
579     auto output_prim = outputs.begin()->second.get_memory();
580
581     auto output_ptr = output_prim.pointer<float>();
582     EXPECT_EQ(0.375f,  output_ptr[0]);
583     EXPECT_EQ(-0.125f, output_ptr[1]);
584     EXPECT_EQ(-0.25f,  output_ptr[2]);
585     EXPECT_EQ(0.125f,  output_ptr[3]);
586 }
587
588 TEST(pooling_forward_gpu, offsets_avg_bfyx_f32_wsiz3x3_wstr3x3_i1x1x3x3_zeropad) {
589     //  Test the corner case when average pooling window contains data from image, data from padding and data outside padding
590     //
591     //  Pool window: 3x3
592     //  Pool stride: 3x3
593     //  Pool mode: avg
594     //  Padding: zero
595     //
596     //  Input offset : -1x-1
597     //  Input data:
598     //  [ padd, padd, padd, padd, padd]
599     //  [ padd,  1.5, -0.5, -1.0, padd]
600     //  [ padd,  0.5,  0.1,  0.2, padd]
601     //  [ padd,  0.9,  1.1,  2.2, padd]
602     //  [ padd, padd, padd, padd, padd]
603     //
604     //  Expected output:
605     //  [ 0.177777, -0.133333]
606     //  [ 0.333333,  0.55]
607
608     const auto& engine = get_test_engine();
609
610     auto input_prim = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 3 } });
611
612     topology topology;
613     topology.add(input_layout("input_prim", input_prim.get_layout()));
614     topology.add(pooling("pool_prim", "input_prim", pooling_mode::average, { 1,1,3,3 }, { 1,1,3,3 }, { 0,0,-1,-1 }));
615
616     network network(engine, topology);
617
618     std::vector<float> input_vec = { 1.5f, -0.5f, -1.0f, 0.5f, 0.1f, 0.2f, 0.9f, 1.1f, 2.2f };
619     set_values(input_prim, input_vec);
620
621     network.set_input_data("input_prim", input_prim);
622
623     auto outputs = network.execute();
624     EXPECT_EQ(outputs.size(), size_t(1));
625     EXPECT_EQ(outputs.begin()->first, "pool_prim");
626
627     auto output_prim = outputs.begin()->second.get_memory();
628
629     auto output_ptr = output_prim.pointer<float>();
630
631     EXPECT_NEAR(output_ptr[0], 0.177777f, 1e-05F);
632     EXPECT_NEAR(output_ptr[1], -0.133333f, 1e-05F);
633     EXPECT_NEAR(output_ptr[2], 0.333333f, 1e-05F);
634     EXPECT_NEAR(output_ptr[3], 0.55f, 1e-05F);
635 }
636
637 TEST(pooling_forward_gpu, offsets_avg_yxfb_f32_wsiz2x2_wstr2x2_i3x3x1x1_zeropad) {
638     //  Brief test description.
639     //
640     //  Pool window: 2x2
641     //  Pool stride: 2x2
642     //  Pool mode: avg
643     //  Padding: zero
644     //
645     //  Input offset : -1x-1
646     //  Input data:
647     //  [ padd, padd, padd, padd]
648     //  [ padd,  1.5, -0.5,  2.5]
649     //  [ padd, -1.0,  0.5,  3.0]
650     //  [ padd,  0.5,  0.0, -8.0]
651     //
652     //  Expected output:
653     //  [  0.375,    0.5]
654     //  [ -0.125, -1.125]
655
656     const auto& engine = get_test_engine();
657
658     auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } });
659
660     topology topology;
661     topology.add(input_layout("input_prim", input_prim.get_layout()));
662     topology.add(pooling("pool_prim", "input_prim", pooling_mode::average, { 1,1,2,2 }, { 1,1,2,2 }, { 0,0,-1,-1 }));
663
664     network network(engine, topology);
665     set_values(input_prim, { 1.5f, -0.5f, 2.5f, -1.0f, 0.5f, 3.0f, 0.5f, 0.0f, -8.0f });
666     network.set_input_data("input_prim", input_prim);
667
668     auto outputs = network.execute();
669     EXPECT_EQ(outputs.size(), size_t(1));
670     EXPECT_EQ(outputs.begin()->first, "pool_prim");
671
672     auto output_prim = outputs.begin()->second.get_memory();
673     EXPECT_EQ((int)output_prim.get_layout().size.count(), 4);
674
675     auto output_ptr = output_prim.pointer<float>();
676     EXPECT_EQ(0.375f,  output_ptr[0]);
677     EXPECT_EQ(0.5f,    output_ptr[1]);
678     EXPECT_EQ(-0.125f, output_ptr[2]);
679     EXPECT_EQ(-1.125f, output_ptr[3]);
680 }
681
682 TEST(pooling_forward_gpu, offsets_avg_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i2x2x1x1_outpad2) {
683     //  Brief test description.
684     //
685     //  Pool window: 2x2
686     //  Pool stride: 2x2
687     //  Pool mode: avg
688     //  Padding: 2x2
689     //
690     //  Input offset : -1x-1
691     //  Input data:
692     //  [ padd, padd, padd, padd]
693     //  [ padd,  1.5, -0.5, padd]
694     //  [ padd, -1.0,  0.5, padd]
695     //  [ padd, padd, padd, padd]
696     //
697     //  Expected output:
698     //  [0, 0, 0, 0, 0, 0]
699     //  [0, 0, 0, 0, 0, 0]
700     //  [ 0, 0, 0.375, -0.125, 0, 0]
701     //  [ 0, 0, -0.25,  0.125, 0, 0]
702     //  [0, 0, 0, 0, 0, 0]
703     //  [0, 0, 0, 0, 0, 0]
704
705     const auto& engine = get_test_engine();
706     std::vector<format> formats_to_test = { format::yxfb , format::bfyx };
707
708     for (std::vector<format>::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it)
709     {
710         std::cout << "Testing format: " << format::order(*it) << std::endl;
711
712         tensor input_tensor( 1, 1, 2, 2 );
713         auto input_prim = memory::allocate(engine, { data_types::f32, *it, input_tensor });
714
715         topology topology;
716         topology.add(input_layout("input_prim", input_prim.get_layout()));
717         topology.add(pooling("pool_prim", "input_prim", pooling_mode::average, { 1,1,2,2 }, { 1,1,2,2 }, { 0,0,-1,-1 }, padding{ { 0,0,2,2 }, 0 }));
718
719         network network(engine, topology);
720         set_values(input_prim, { 1.5f, -0.5f, -1.0f, 0.5f });
721         network.set_input_data("input_prim", input_prim);
722
723         std::vector<float> expected = {
724             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
725             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
726             0.0f, 0.0f, 0.375f, -0.125f, 0.0f, 0.0f,
727             0.0f, 0.0f, -0.25f, 0.125f, 0.0f, 0.0f,
728             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
729             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
730         };
731
732         auto outputs = network.execute();
733         EXPECT_EQ(outputs.size(), size_t(1));
734         EXPECT_EQ(outputs.begin()->first, "pool_prim");
735
736         auto output_prim = outputs.begin()->second.get_memory();
737         auto output_ptr = output_prim.pointer<float>();
738         for (size_t i = 0; i < expected.size(); ++i) {
739             EXPECT_EQ(expected[i], output_ptr[i]);
740         }
741     }
742 }
743
744 TEST(pooling_forward_gpu, offsets_max_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i3x3x1x1_outpad2) {
745     //  Brief test description.
746     //
747     //  Pool window: 2x2
748     //  Pool stride: 2x2
749     //  Pool mode: max
750     //  Padding: 2x2
751     //
752     //  Input offset : -1x-1
753     //  Input data:
754     //  [ padd, padd, padd, padd, padd]
755     //  [ padd,  1.5, -1.0, -0.5, padd]
756     //  [ padd,  1.0, -1.0, -1.0, padd]
757     //  [ padd, -1.0, -1.0, -0.5, padd]
758     //  [ padd, padd, padd, padd, padd]
759     //
760     //  Expected output:
761     //  [0, 0, 0, 0, 0]
762     //  [0, 1.5, -0.5, 0, 0]
763     //  [0, 1, -0.5, 0, 0]
764     //  [0, 0, 0, 0, 0]
765
766     const auto& engine = get_test_engine();
767     std::vector<format> formats_to_test = { format::yxfb , format::bfyx };
768
769     for (std::vector<format>::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it)
770     {
771         std::cout << "Testing format: " << format::order(*it) << std::endl;
772
773         tensor input_tensor( 1, 1, 3, 3 );
774         auto input_prim = memory::allocate(engine, { data_types::f32, *it, input_tensor });
775
776         topology topology;
777         topology.add(input_layout("input_prim", input_prim.get_layout()));
778         topology.add(pooling("pool_prim", "input_prim", pooling_mode::max, { 1,1,2,2 }, { 1,1,2,2 }, { 0,0,-1,-1 }, padding{ { 0,0,1,1 }, 0 }));
779
780         network network(engine, topology);
781
782         set_values(input_prim, {
783             1.50f, -1.00f, -0.50f,
784             1.00f, -1.00f, -1.00f,
785             -1.00f, -1.00f, -0.50f
786         });
787
788         network.set_input_data("input_prim", input_prim);
789
790         std::vector<float> expected = {
791             0.0f, 0.0f, 0.0f, 0.0f,
792             0.0f, 1.5f,-0.5f, 0.0f,
793             0.0f, 1.f, -0.5f, 0.0f,
794             0.0f, 0.0f, 0.0f, 0.0f,
795         };
796
797         auto outputs = network.execute();
798         EXPECT_EQ(outputs.size(), size_t(1));
799         EXPECT_EQ(outputs.begin()->first, "pool_prim");
800
801         auto output_prim = outputs.begin()->second.get_memory();
802         EXPECT_EQ((int)output_prim.get_layout().size.count(), 4);
803         EXPECT_EQ((int)output_prim.get_layout().get_buffer_size().count(), 16);
804
805         auto output_ptr = output_prim.pointer<float>();
806         for (size_t i = 0; i < expected.size(); ++i) {
807             EXPECT_EQ(expected[i], output_ptr[i]);
808         }
809     }
810 }
811
812 TEST(pooling_forward_gpu, offsets_avg_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i2x2x1x1_inpad2x1_outpad2) {
813     //  Brief test description.
814     //
815     //  Pool window: 2x2
816     //  Pool stride: 2x2
817     //  Pool mode: avg
818     //  Out Padding: 2x2
819     //  Input Padding: 2x1 (yx format) out of the reorder layer
820     //
821     //  Input offset : -1x-1
822     //  Input data:
823     //  [ padd, padd, padd, padd]
824     //  [ padd,  1.5, -0.5, padd]
825     //  [ padd, -1.0,  0.5, padd]
826     //  [ padd, padd, padd, padd]
827     //
828     //  Expected output:
829     //  [0, 0, 0, 0, 0, 0]
830     //  [0, 0, 0, 0, 0, 0]
831     //  [ 0, 0, 0.375, -0.125, 0, 0]
832     //  [ 0, 0, -0.25,  0.125, 0, 0]
833     //  [0, 0, 0, 0, 0, 0]
834     //  [0, 0, 0, 0, 0, 0]
835
836     const auto& engine = get_test_engine();
837     std::vector<format> formats_to_test = { format::yxfb , format::bfyx };
838
839     for (std::vector<format>::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it)
840     {
841         std::cout << "Testing format: " << format::order(*it) << std::endl;
842
843         tensor input_tensor( 1, 1, 2, 2 );
844         auto input_prim = memory::allocate(engine, { data_types::f32, *it, input_tensor });
845
846         topology topology;
847         topology.add(input_layout("input_prim", input_prim.get_layout()));
848         topology.add(reorder("reorder", "input_prim", input_prim.get_layout().with_padding({ {0,0,1,2}, 0 })));
849         topology.add(pooling("pool_prim", "reorder", pooling_mode::average, { 1,1,2,2 }, { 1,1,2,2 }, { 0,0,-1,-1 }, padding{ { 0,0,2,2 }, 0 }));
850
851         network network(engine, topology);
852         set_values(input_prim, { 1.5f, -0.5f, -1.0f, 0.5f });
853         network.set_input_data("input_prim", input_prim);
854
855         std::vector<float> expected = {
856             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
857             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
858             0.0f, 0.0f, 0.375f, -0.125f, 0.0f, 0.0f,
859             0.0f, 0.0f, -0.25f, 0.125f, 0.0f, 0.0f,
860             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
861             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
862         };
863
864         auto outputs = network.execute();
865         EXPECT_EQ(outputs.size(), size_t(1));
866         EXPECT_EQ(outputs.begin()->first, "pool_prim");
867
868         auto output_prim = outputs.begin()->second.get_memory();
869         auto output_ptr = output_prim.pointer<float>();
870         for (size_t i = 0; i < expected.size(); ++i) {
871             EXPECT_EQ(expected[i], output_ptr[i]);
872         }
873     }
874 }
875
876 TEST(pooling_forward_gpu, offsets_max_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i3x3x1x1_inpad2x1_outpad2) {
877     //  Brief test description.
878     //
879     //  Pool window: 2x2
880     //  Pool stride: 2x2
881     //  Pool mode: max
882     //  Padding: 2x2
883     //  Input Padding: 2x1 (yx format) out of the reorder layer
884     //
885     //  Input offset : -1x-1
886     //  Input data:
887     //  [ padd, padd, padd, padd, padd]
888     //  [ padd,  1.5, -1.0, -0.5, padd]
889     //  [ padd,  1.0, -1.0, -1.0, padd]
890     //  [ padd, -1.0, -1.0, -0.5, padd]
891     //  [ padd, padd, padd, padd, padd]
892     //
893     //  Expected output:
894     //  [0, 0, 0, 0, 0]
895     //  [0, 1.5, -0.5, 0]
896     //  [0, 1, -0.5, 0]
897     //  [0, 0, 0, 0, 0]
898
899     const auto& engine = get_test_engine();
900     std::vector<format> formats_to_test = { format::yxfb , format::bfyx };
901
902     for (std::vector<format>::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it)
903     {
904         std::cout << "Testing format: " << format::order(*it) << std::endl;
905
906         tensor input_tensor( 1, 1, 3, 3 );
907         auto input_prim = memory::allocate(engine, { data_types::f32, *it, input_tensor });
908
909         topology topology;
910         topology.add(input_layout("input_prim", input_prim.get_layout()));
911         topology.add(reorder("reorder", "input_prim", input_prim.get_layout().with_padding({ { 0, 0, 1, 2 }, 0 })));
912         topology.add(pooling("pool_prim", "reorder", pooling_mode::max, { 1,1,2,2 }, { 1,1,2,2 }, { 0,0,-1,-1 }, padding{ { 0,0,1,1 }, 0 }));
913
914         network network(engine, topology);
915
916         set_values(input_prim, {
917             1.50f, -1.00f, -0.50f,
918             1.00f, -1.00f, -1.00f,
919             -1.00f, -1.00f, -0.50f
920         });
921
922         network.set_input_data("input_prim", input_prim);
923
924         std::vector<float> expected = {
925             0.0f, 0.0f, 0.0f, 0.0f,
926             0.0f, 1.5f, -0.5f, 0.0f,
927             0.0f, 1.f, -0.5f, 0.0f,
928             0.0f, 0.0f, 0.0f, 0.0f,
929         };
930
931         auto outputs = network.execute();
932         EXPECT_EQ(outputs.size(), size_t(1));
933         EXPECT_EQ(outputs.begin()->first, "pool_prim");
934
935         auto output_prim = outputs.begin()->second.get_memory();
936         EXPECT_EQ((int)output_prim.get_layout().size.count(), 4);
937         EXPECT_EQ((int)output_prim.get_layout().get_buffer_size().count(), 16);
938
939         auto output_ptr = output_prim.pointer<float>();
940         for (size_t i = 0; i < expected.size(); ++i) {
941             EXPECT_EQ(expected[i], output_ptr[i]);
942         }
943     }
944 }
945
946 TEST(pooling_forward_gpu, avg_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i2x2x1x1_inpad2x1_outpad2) {
947     //  Brief test description.
948     //
949     //  Pool window: 2x2
950     //  Pool stride: 2x2
951     //  Pool mode: avg
952     //  Out Padding: 2x2
953     //  Input Padding: 2x1 (yx format) out of the reorder layer
954     //
955     //  Input offset : 0x0
956     //  Input data:
957     //  [ 1, 2, 3, 4]
958     //  [ 5,  1.5, -0.5, 6]
959     //  [ 7, -1.0,  0.5, 8]
960     //  [ 9, 10, 11, 12]
961     //
962     //  Expected output:
963     //  [0, 0, 0, 0, 0, 0]
964     //  [0, 0, 0, 0, 0, 0]
965     //  [ 0, 0, 2.375, 3.125, 0, 0]
966     //  [ 0, 0, 6.25,  7.875, 0, 0]
967     //  [0, 0, 0, 0, 0, 0]
968     //  [0, 0, 0, 0, 0, 0]
969
970     const auto& engine = get_test_engine();
971     std::vector<format> formats_to_test = { format::yxfb , format::bfyx };
972
973     for (std::vector<format>::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it)
974     {
975         std::cout << "Testing format: " << format::order(*it) << std::endl;
976
977         tensor input_tensor( 1, 1, 4, 4 );
978         auto input_prim = memory::allocate(engine, { data_types::f32, *it, input_tensor });
979
980         topology topology;
981         topology.add(input_layout("input_prim", input_prim.get_layout()));
982         topology.add(reorder("reorder", "input_prim", input_prim.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 })));
983         topology.add(pooling("pool_prim", "reorder", pooling_mode::average, { 1,1,2,2 }, { 1,1,2,2 }, { 0,0,0,0 }, padding{ { 0,0,2,2 }, 0 }));
984
985         network network(engine, topology);
986         set_values(input_prim, {
987             1.f, 2.f, 3.f, 4.f,
988             5.f, 1.5f, -0.5f, 6.f,
989             7.f, -1.0f, 0.5f, 8.f,
990             9.f, 10.f, 11.f, 12.f});
991         network.set_input_data("input_prim", input_prim);
992
993         std::vector<float> expected = {
994             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
995             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
996             0.0f, 0.0f, 2.375f, 3.125f, 0.0f, 0.0f,
997             0.0f, 0.0f, 6.25f, 7.875f, 0.0f, 0.0f,
998             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
999             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
1000         };
1001
1002         auto outputs = network.execute();
1003         EXPECT_EQ(outputs.size(), size_t(1));
1004         EXPECT_EQ(outputs.begin()->first, "pool_prim");
1005
1006         auto output_prim = outputs.begin()->second.get_memory();
1007         auto output_ptr = output_prim.pointer<float>();
1008         for (size_t i = 0; i < expected.size(); ++i) {
1009             EXPECT_EQ(expected[i], output_ptr[i]);
1010         }
1011     }
1012 }
1013
1014 TEST(pooling_forward_gpu, max_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i3x3x1x1_inpad2x1_outpad2) {
1015     //  Brief test description.
1016     //
1017     //  Pool window: 2x2
1018     //  Pool stride: 2x2
1019     //  Pool mode: max
1020     //  Padding: 2x2
1021     //  Input Padding: 2x1 (yx format) out of the reorder layer
1022     //
1023     //  Input offset : 0x0
1024     //  Input data:
1025     //  [ 1, 2, 3, 4, 5]
1026     //  [ 6,  1.5, -1.0, -0.5, 7]
1027     //  [ 8,  1.0, -1.0, -1.0, 9]
1028     //  [ 10, -1.0, -1.0, -0.5, 11]
1029     //  [ 12, 13, 14, 15, 16]
1030     //
1031     //  Expected output:
1032     //  [0, 0, 0, 0, 0]
1033     //  [0, 1, 3, 5, 0]
1034     //  [0, 8, 1.5, 9, 0]
1035     //  [0, 12, 14, 16, 0]
1036     //  [0, 0, 0, 0, 0]
1037
1038     const auto& engine = get_test_engine();
1039     std::vector<format> formats_to_test = { format::yxfb , format::bfyx };
1040
1041     for (std::vector<format>::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it)
1042     {
1043         std::cout << "Testing format: " << format::order(*it) << std::endl;
1044
1045         tensor input_tensor( 1, 1, 5, 5 );
1046         auto input_prim = memory::allocate(engine, { data_types::f32, *it, input_tensor });
1047
1048         topology topology;
1049         topology.add(input_layout("input_prim", input_prim.get_layout()));
1050         topology.add(reorder("reorder", "input_prim", input_prim.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 })));
1051         topology.add(pooling("pool_prim", "reorder", pooling_mode::max, { 1,1,2,2 }, { 1,1,2,2 }, { 0,0,-1,-1 }, padding{ { 0,0,1,1 }, 0 }));
1052
1053         network network(engine, topology);
1054
1055         set_values(input_prim, {
1056             1.f, 2.f, 3.f, 4.f, 5.f,
1057             6.f, 1.50f, -1.00f, -0.50f, 7.f,
1058             8.f, 1.00f, -1.00f, -1.00f, 9.f,
1059             10.f, -1.00f, -1.00f, -0.50f, 11.f,
1060             12.f, 13.f, 14.f, 15.f, 16.f
1061         });
1062
1063         network.set_input_data("input_prim", input_prim);
1064
1065         std::vector<float> expected = {
1066             0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
1067             0.0f, 1.f, 3.f, 5.f, 0.0f,
1068             0.0f, 8.f, 1.5f, 9.f, 0.0f,
1069             0.0f, 12.f, 14.f, 16.0f, 0.0f,
1070             0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
1071         };
1072
1073         auto outputs = network.execute();
1074         EXPECT_EQ(outputs.size(), size_t(1));
1075         EXPECT_EQ(outputs.begin()->first, "pool_prim");
1076
1077         auto output_prim = outputs.begin()->second.get_memory();
1078         EXPECT_EQ((int)output_prim.get_layout().size.count(), 9);
1079         EXPECT_EQ((int)output_prim.get_layout().get_buffer_size().count(), 25);
1080
1081         auto output_ptr = output_prim.pointer<float>();
1082         for (size_t i = 0; i < expected.size(); ++i) {
1083             EXPECT_EQ(expected[i], output_ptr[i]);
1084         }
1085     }
1086 }
1087
1088 TEST(pooling_forward_gpu, basic_in2x2x3x2_max_with_argmax) {
1089     //  Input  : 2x2x3x2
1090     //  Argmax : 2x2x2x1
1091     //  Output : 2x2x2x2
1092
1093     //  Forward Max Pooling Input:
1094     //  f0: b0:  1    2  -10   b1:   0    0     -11
1095     //  f0: b0:  3    4  -14   b1:   0.5 -0.5   -15
1096     //  f1: b0:  5    6  -12   b1:   1.5  5.2   -13
1097     //  f1: b0:  7    8   16    b1:   12   9     17
1098     //
1099     //  Output:
1100     //  f0: b0:  4    4   b1:   0.5    0
1101     //  f1: b0:  8   16   b1:   12    17
1102     //
1103     //  Argmax:
1104     //  f0: b0:  4    4   b1:   15    13
1105     //  f1: b0:  10  11   b1:   21    23
1106
1107
1108     const auto& engine = get_test_engine();
1109
1110     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } });
1111     auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } });
1112
1113     set_values(input, {
1114         1.0f, 2.0f, -10.f,
1115         3.0f, 4.0f, -14.f,
1116         5.0f, 6.0f, -12.f,
1117         7.0f, 8.0f, 16.0f,
1118         0.f, 0.f, -11.f,
1119         0.5f, -0.5f, -15.f,
1120         1.5f, 5.2f, -13.f,
1121         12.f, 9.f, 17.f
1122     });
1123
1124     topology topology;
1125     topology.add(input_layout("input", input.get_layout()));
1126     topology.add(mutable_data("arg_max", arg_max));
1127     topology.add(pooling("pooling", "input", "arg_max", pooling_mode::max_with_argmax, { 1, 1, 2, 2 }, { 1, 1, 1, 1 }));
1128
1129     network network(engine, topology);
1130
1131     network.set_input_data("input", input);
1132
1133     auto outputs = network.execute();
1134
1135     auto output = outputs.at("pooling").get_memory();
1136     auto output_ptr = output.pointer<float>();
1137     auto output_layout = output.get_layout();
1138     auto argmax_ptr = arg_max.pointer<float>();
1139
1140     EXPECT_EQ(output_layout.format, format::bfyx);
1141     EXPECT_EQ(output_layout.size.spatial[1], 1);
1142     EXPECT_EQ(output_layout.size.spatial[0], 2);
1143     EXPECT_EQ(output_layout.size.feature[0], 2);
1144     EXPECT_EQ(output_layout.size.batch[0], 2);
1145
1146     std::vector<float> expected_argmax_vec = {
1147         4.0f, 4.0f,
1148         10.0f, 11.0f,
1149         15.0f, 13.0f,
1150         21.0f, 23.0f
1151     };
1152
1153     std::vector<float> expected_output_vec = {
1154         4.0f, 4.0f,
1155         8.0f, 16.0f,
1156         0.5f, 0.0f,
1157         12.0f, 17.0f
1158     };
1159
1160     for (size_t i = 0; i < expected_output_vec.size(); ++i) {
1161         EXPECT_EQ(expected_output_vec[i], output_ptr[i]);
1162         EXPECT_EQ(expected_argmax_vec[i], argmax_ptr[i]);
1163     }
1164 }
1165
1166 TEST(pooling_forward_gpu, basic_in2x2x3x2_max_with_argmax_input_padding) {
1167     //  Input  : 2x2x3x2
1168     //  Argmax : 2x2x2x1
1169     //  Output : 2x2x2x2
1170     //  Input Padding : 2x2
1171
1172     //  Forward Max Pooling Input:
1173     //  f0: b0:  1    2  -10   b1:   0    0     -11
1174     //  f0: b0:  3    4  -14   b1:   0.5 -0.5   -15
1175     //  f1: b0:  5    6  -12   b1:   1.5  5.2   -13
1176     //  f1: b0:  7    8   16    b1:   12   9     17
1177     //
1178     //  Output:
1179     //  f0: b0:  4    4   b1:   0.5    0
1180     //  f1: b0:  8   16   b1:   12    17
1181     //
1182     //  Argmax:
1183     //  f0: b0:  4    4   b1:   15    13
1184     //  f1: b0:  10  11   b1:   21    23
1185
1186
1187     const auto& engine = get_test_engine();
1188
1189     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } });
1190     auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } });
1191
1192     set_values(input, {
1193         1.0f, 2.0f, -10.f,
1194         3.0f, 4.0f, -14.f,
1195         5.0f, 6.0f, -12.f,
1196         7.0f, 8.0f, 16.0f,
1197         0.f, 0.f, -11.f,
1198         0.5f, -0.5f, -15.f,
1199         1.5f, 5.2f, -13.f,
1200         12.f, 9.f, 17.f
1201     });
1202
1203     topology topology;
1204     topology.add(input_layout("input", input.get_layout()));
1205     topology.add(reorder("reorder", "input", input.get_layout().with_padding({ { 0, 0, 2, 2 }, 0 })));
1206     topology.add(mutable_data("arg_max", arg_max));
1207     topology.add(pooling("pooling", "reorder", "arg_max", pooling_mode::max_with_argmax, { 1, 1, 2, 2 }, { 1, 1, 1, 1 }));
1208
1209     network network(engine, topology);
1210
1211     network.set_input_data("input", input);
1212
1213     auto outputs = network.execute();
1214
1215     auto output = outputs.at("pooling").get_memory();
1216     auto output_ptr = output.pointer<float>();
1217     auto output_layout = output.get_layout();
1218     auto argmax_ptr = arg_max.pointer<float>();
1219
1220     EXPECT_EQ(output_layout.format, format::bfyx);
1221     EXPECT_EQ(output_layout.size.spatial[1], 1);
1222     EXPECT_EQ(output_layout.size.spatial[0], 2);
1223     EXPECT_EQ(output_layout.size.feature[0], 2);
1224     EXPECT_EQ(output_layout.size.batch[0], 2);
1225
1226     std::vector<float> expected_argmax_vec = {
1227         4.0f, 4.0f,
1228         10.0f, 11.0f,
1229         15.0f, 13.0f,
1230         21.0f, 23.0f
1231     };
1232
1233     std::vector<float> expected_output_vec = {
1234         4.0f, 4.0f,
1235         8.0f, 16.0f,
1236         0.5f, 0.0f,
1237         12.0f, 17.0f
1238     };
1239
1240     for (size_t i = 0; i < expected_output_vec.size(); ++i) {
1241         EXPECT_EQ(expected_output_vec[i], output_ptr[i]);
1242         EXPECT_EQ(expected_argmax_vec[i], argmax_ptr[i]);
1243     }
1244 }
1245
1246 TEST(pooling_forward_gpu, basic_in2x2x3x2_max_with_argmax_output_padding) {
1247     //  Input  : 2x2x3x2
1248     //  Argmax : 2x2x2x1
1249     //  Output : 2x2x2x2
1250     //  Output Padding : 2x2
1251
1252     //  Forward Max Pooling Input:
1253     //  f0: b0:  1    2  -10   b1:   0    0     -11
1254     //  f0: b0:  3    4  -14   b1:   0.5 -0.5   -15
1255     //  f1: b0:  5    6  -12   b1:   1.5  5.2   -13
1256     //  f1: b0:  7    8   16    b1:   12   9     17
1257     //
1258     //  Output:
1259     //  f0: b0:  4    4   b1:   0.5    0
1260     //  f1: b0:  8   16   b1:   12    17
1261     //
1262     //  Argmax:
1263     //  f0: b0:  4    4   b1:   15    13
1264     //  f1: b0:  10  11   b1:   21    23
1265
1266
1267     const auto& engine = get_test_engine();
1268
1269     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } });
1270     auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } });
1271
1272     set_values(input, {
1273         1.0f, 2.0f, -10.f,
1274         3.0f, 4.0f, -14.f,
1275         5.0f, 6.0f, -12.f,
1276         7.0f, 8.0f, 16.0f,
1277         0.f, 0.f, -11.f,
1278         0.5f, -0.5f, -15.f,
1279         1.5f, 5.2f, -13.f,
1280         12.f, 9.f, 17.f
1281     });
1282
1283     topology topology;
1284     topology.add(input_layout("input", input.get_layout()));
1285     topology.add(reorder("reorder", "input", input.get_layout().with_padding({ { 0, 0, 2, 2 }, 0 })));
1286     topology.add(mutable_data("arg_max", arg_max));
1287     topology.add(pooling("pooling", "reorder", "arg_max", pooling_mode::max_with_argmax, { 1, 1, 2, 2 }, { 1, 1, 1, 1 }, { 0, 0, 0, 0 }, padding({ 0, 0, 1, 1 }, 0)));
1288
1289     network network(engine, topology);
1290
1291     network.set_input_data("input", input);
1292
1293     auto outputs = network.execute();
1294
1295     auto output = outputs.at("pooling").get_memory();
1296     auto output_ptr = output.pointer<float>();
1297     auto output_layout = output.get_layout();
1298     auto argmax_ptr = arg_max.pointer<float>();
1299
1300     EXPECT_EQ(output_layout.format, format::bfyx);
1301     EXPECT_EQ(output_layout.size.spatial[1], 1);
1302     EXPECT_EQ(output_layout.size.spatial[0], 2);
1303     EXPECT_EQ(output_layout.size.feature[0], 2);
1304     EXPECT_EQ(output_layout.size.batch[0], 2);
1305
1306     std::vector<float> expected_argmax_vec = {
1307         4.0f, 4.0f,
1308         10.0f, 11.0f,
1309         15.0f, 13.0f,
1310         21.0f, 23.0f
1311     };
1312
1313     std::vector<float> expected_output_vec = {
1314         0.0f, 0.0f, 0.0f, 0.0f,
1315         0.0f, 4.0f, 4.0f, 0.0f,
1316         0.0f, 0.0f, 0.0f, 0.0f,
1317         0.0f, 0.0f, 0.0f, 0.0f,
1318         0.0f, 8.0f, 16.0f,0.0f,
1319         0.0f, 0.0f, 0.0f, 0.0f,
1320         0.0f, 0.0f, 0.0f, 0.0f,
1321         0.0f, 0.5f, 0.0f, 0.0f,
1322         0.0f, 0.0f, 0.0f, 0.0f,
1323         0.0f, 0.0f, 0.0f, 0.0f,
1324         0.0f, 12.0f, 17.0f, 0.0f,
1325         0.0f, 0.0f, 0.0f, 0.0f,
1326     };
1327
1328     for (size_t i = 0; i < expected_output_vec.size(); ++i) {
1329         EXPECT_EQ(expected_output_vec[i], output_ptr[i]);
1330     }
1331
1332     for (size_t i = 0; i < expected_argmax_vec.size(); ++i) {
1333         EXPECT_EQ(expected_argmax_vec[i], argmax_ptr[i]);
1334     }
1335 }
1336
1337 TEST(pooling_forward_gpu, basic_in2x2x3x2_max_with_argmax_with_output_size) {
1338     //  Input  : 2x2x3x2
1339     //  Argmax : 2x2x2x1
1340     //  Output : 2x2x2x2
1341
1342     //  Forward Max Pooling Input:
1343     //  f0: b0:  1    2  -10   b1:   0    0     -11
1344     //  f0: b0:  3    4  -14   b1:   0.5 -0.5   -15
1345     //  f1: b0:  5    6  -12   b1:   1.5  5.2   -13
1346     //  f1: b0:  7    8   16    b1:   12   9     17
1347     //
1348     //  Output:
1349     //  f0: b0:  4    4   b1:   0.5    0
1350     //  f1: b0:  8   16   b1:   12    17
1351     //
1352     //  Argmax:
1353     //  f0: b0:  4    4   b1:   15    13
1354     //  f1: b0:  10  11   b1:   21    23
1355
1356
1357     const auto& engine = get_test_engine();
1358
1359     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } });
1360     auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } });
1361
1362     set_values(input, {
1363         1.0f, 2.0f, -10.f,
1364         3.0f, 4.0f, -14.f,
1365         5.0f, 6.0f, -12.f,
1366         7.0f, 8.0f, 16.0f,
1367         0.f, 0.f, -11.f,
1368         0.5f, -0.5f, -15.f,
1369         1.5f, 5.2f, -13.f,
1370         12.f, 9.f, 17.f
1371     });
1372
1373     topology topology;
1374     topology.add(input_layout("input", input.get_layout()));
1375     topology.add(mutable_data("arg_max", arg_max));
1376     topology.add(pooling("pooling", "input", "arg_max", pooling_mode::max_with_argmax, { 1, 1, 2, 2 }, { 1, 1, 1, 1 }, { 0, 0, 0, 0 }, { 2, 2, 2, 1 }));
1377
1378     network network(engine, topology);
1379
1380     network.set_input_data("input", input);
1381
1382     auto outputs = network.execute();
1383
1384     auto output = outputs.at("pooling").get_memory();
1385     auto output_ptr = output.pointer<float>();
1386     auto output_layout = output.get_layout();
1387     auto argmax_ptr = arg_max.pointer<float>();
1388
1389     EXPECT_EQ(output_layout.format, format::bfyx);
1390     EXPECT_EQ(output_layout.size.spatial[1], 1);
1391     EXPECT_EQ(output_layout.size.spatial[0], 2);
1392     EXPECT_EQ(output_layout.size.feature[0], 2);
1393     EXPECT_EQ(output_layout.size.batch[0], 2);
1394
1395     std::vector<float> expected_argmax_vec = {
1396         4.0f, 4.0f,
1397         10.0f, 11.0f,
1398         15.0f, 13.0f,
1399         21.0f, 23.0f
1400     };
1401
1402     std::vector<float> expected_output_vec = {
1403         4.0f, 4.0f,
1404         8.0f, 16.0f,
1405         0.5f, 0.0f,
1406         12.0f, 17.0f
1407     };
1408
1409     for (size_t i = 0; i < expected_output_vec.size(); ++i) {
1410         EXPECT_EQ(expected_output_vec[i], output_ptr[i]);
1411         EXPECT_EQ(expected_argmax_vec[i], argmax_ptr[i]);
1412     }
1413 }
1414
1415 template <class DataType>
1416 static void generic_average_wo_padding_test(format fmt, tensor output, tensor input, tensor window, tensor stride, tensor offset)
1417 {
1418     constexpr auto dt = std::is_same<DataType, float>::value ? data_types::f32 : data_types::f16;
1419
1420     engine eng;
1421
1422     if (!eng.get_info().supports_fp16)
1423     {
1424         if(dt == data_types::f16)
1425         {
1426             return;
1427         }
1428     }
1429
1430     auto input_mem = memory::allocate(eng, layout{ dt, fmt, input });
1431     set_values(input_mem, std::vector<DataType>(input.count(), DataType(1)));
1432     std::vector<DataType> expected_output(output.count(), DataType(1));
1433
1434     topology tpl;
1435     tpl.add(input_layout("in", input_mem.get_layout()));
1436
1437     auto pool_in = "in";
1438     if (offset != tensor())
1439     {
1440         tpl.add(reorder("reorder", "in", input_mem.get_layout().with_padding(offset.negate().sizes())));
1441         pool_in = "reorder";
1442     }
1443     tpl.add(pooling("pool", pool_in, pooling_mode::average_no_padding, window, stride, offset));
1444
1445     network net(eng, tpl);
1446     net.set_input_data("in", input_mem);
1447     auto output_mem = net.execute().at("pool").get_memory();
1448
1449     ASSERT_TRUE(output_mem.count() == expected_output.size());
1450     EXPECT_TRUE(output_mem.get_layout().size == output);
1451     auto out_ptr = output_mem.pointer<DataType>();
1452
1453     for (size_t i = 0; i < expected_output.size(); ++i)
1454         EXPECT_FLOAT_EQ(out_ptr[i], expected_output[i]);
1455 }
1456
//bfyx fp32
// Smoke tests for pooling_mode::average_no_padding on bfyx inputs. The helper
// pools an all-ones input, so every output element is expected to be exactly 1
// regardless of how the window overlaps the padded border.
// Argument order: format, expected output size, input size, window, stride,
// input offset (a non-zero offset makes the helper insert a padding reorder).
TEST(pooling_forward_gpu, bfyx_average_without_padding_i3x3_w2x2_s2x2)
{
    generic_average_wo_padding_test<float>(format::bfyx, spatial(2, 2), spatial(3, 3), spatial(2, 2), tensor{ 0,0,2,2 }, tensor{});
}

TEST(pooling_forward_gpu, bfyx_average_without_padding_i3x3_w2x2_s2x2_o1x1)
{
    generic_average_wo_padding_test<float>(format::bfyx, spatial(2, 2), spatial(3, 3), spatial(2, 2), tensor{ 0,0,2,2 }, tensor{ 0,0,-1,-1 });
}

TEST(pooling_forward_gpu, bfyx_average_without_padding_i3x3_w2x2_s3x3_o1x1)
{
    generic_average_wo_padding_test<float>(format::bfyx, spatial(2, 2), spatial(3, 3), spatial(3, 3), tensor{ 0,0,2,2 }, tensor{ 0,0,-1,-1 });
}

TEST(pooling_forward_gpu, bfyx_average_without_padding_i1x1_w3x3_s1x1_o1x1)
{
    // Degenerate case: 1x1 input with a 3x3 window.
    generic_average_wo_padding_test<float>(format::bfyx, spatial(1, 1), spatial(1, 1), spatial(3, 3), tensor{ 0,0,1,1 }, tensor{ 0,0,-1,-1 });
}

//bfyx fp16
// Same cases as above in half precision; the helper skips them on devices
// without fp16 support.
TEST(pooling_forward_gpu, bfyx_average_without_padding_i3x3_w2x2_s2x2_fp16)
{
    generic_average_wo_padding_test<FLOAT16>(format::bfyx, spatial(2, 2), spatial(3, 3), spatial(2, 2), tensor{ 0,0,2,2 }, tensor{});
}

TEST(pooling_forward_gpu, bfyx_average_without_padding_i3x3_w2x2_s2x2_o1x1_fp16)
{
    generic_average_wo_padding_test<FLOAT16>(format::bfyx, spatial(2, 2), spatial(3, 3), spatial(2, 2), tensor{ 0,0,2,2 }, tensor{ 0,0,-1,-1 });
}

TEST(pooling_forward_gpu, bfyx_average_without_padding_i3x3_w2x2_s3x3_o1x1_fp16)
{
    generic_average_wo_padding_test<FLOAT16>(format::bfyx, spatial(2, 2), spatial(3, 3), spatial(3, 3), tensor{ 0,0,2,2 }, tensor{ 0,0,-1,-1 });
}

TEST(pooling_forward_gpu, bfyx_average_without_padding_i1x1_w3x3_s1x1_o1x1_fp16)
{
    generic_average_wo_padding_test<FLOAT16>(format::bfyx, spatial(1, 1), spatial(1, 1), spatial(3, 3), tensor{ 0,0,1,1 }, tensor{ 0,0,-1,-1 });
}
1498
//yxfb fp32
// Same average_no_padding smoke tests as the bfyx group above, exercised on
// the yxfb layout. All-ones input, so every output element is expected to be 1.
// Argument order: format, expected output size, input size, window, stride,
// input offset.
TEST(pooling_forward_gpu, yxfb_average_without_padding_i3x3_w2x2_s2x2)
{
    generic_average_wo_padding_test<float>(format::yxfb, spatial(2, 2), spatial(3, 3), spatial(2, 2), tensor{ 0,0,2,2 }, tensor{});
}

TEST(pooling_forward_gpu, yxfb_average_without_padding_i3x3_w2x2_s2x2_o1x1)
{
    generic_average_wo_padding_test<float>(format::yxfb, spatial(2, 2), spatial(3, 3), spatial(2, 2), tensor{ 0,0,2,2 }, tensor{ 0,0,-1,-1 });
}

TEST(pooling_forward_gpu, yxfb_average_without_padding_i3x3_w2x2_s3x3_o1x1)
{
    generic_average_wo_padding_test<float>(format::yxfb, spatial(2, 2), spatial(3, 3), spatial(3, 3), tensor{ 0,0,2,2 }, tensor{ 0,0,-1,-1 });
}

TEST(pooling_forward_gpu, yxfb_average_without_padding_i1x1_w3x3_s1x1_o1x1)
{
    // Degenerate case: 1x1 input with a 3x3 window.
    generic_average_wo_padding_test<float>(format::yxfb, spatial(1, 1), spatial(1, 1), spatial(3, 3), tensor{ 0,0,1,1 }, tensor{ 0,0,-1,-1 });
}

//yxfb fp16
// Same cases in half precision; skipped by the helper when the device lacks
// fp16 support.
TEST(pooling_forward_gpu, yxfb_average_without_padding_i3x3_w2x2_s2x2_fp16)
{
    generic_average_wo_padding_test<FLOAT16>(format::yxfb, spatial(2, 2), spatial(3, 3), spatial(2, 2), tensor{ 0,0,2,2 }, tensor{});
}

TEST(pooling_forward_gpu, yxfb_average_without_padding_i3x3_w2x2_s2x2_o1x1_fp16)
{
    generic_average_wo_padding_test<FLOAT16>(format::yxfb, spatial(2, 2), spatial(3, 3), spatial(2, 2), tensor{ 0,0,2,2 }, tensor{ 0,0,-1,-1 });
}

TEST(pooling_forward_gpu, yxfb_average_without_padding_i3x3_w2x2_s3x3_o1x1_fp16)
{
    generic_average_wo_padding_test<FLOAT16>(format::yxfb, spatial(2, 2), spatial(3, 3), spatial(3, 3), tensor{ 0,0,2,2 }, tensor{ 0,0,-1,-1 });
}

TEST(pooling_forward_gpu, yxfb_average_without_padding_i1x1_w3x3_s1x1_o1x1_fp16)
{
    generic_average_wo_padding_test<FLOAT16>(format::yxfb, spatial(1, 1), spatial(1, 1), spatial(3, 3), tensor{ 0,0,1,1 }, tensor{ 0,0,-1,-1 });
}
1540
TEST(pooling_forward_gpu, b_fs_yx_fsv4)
{
    // Cross-checks i8 max pooling executed through the b_fs_yx_fsv4
    // (IMAD-friendly) layout against the reference bfyx path, for two
    // batch/feature/size/stride configurations. Both networks consume the
    // same input bytes and their outputs must match element for element.
    int B_array[] = {  16,    4, 0 };  // Batch
    int F_array[] = {  64, 2048, 0 };  // Features
    int I_array[] = { 112,    7, 0 };  // Input MxM data sizes
    int S_array[] = {   1,    2, 0 };  // Strides
    int W_array[] = {   7,    3, 0 };  // Filter (a.k.a. weights) sizes
    // The trailing zero in F_array terminates the configuration loop below.
    for (int j = 0; F_array[j]; j++) {
        int in_B = B_array[j];

        int in_F = F_array[j];

        int in_X = I_array[j],
            in_Y = in_X;

        int W_X = W_array[j],
            W_Y = W_X;

        int S_X = S_array[j],
            S_Y = S_X;

        // Input data init: deterministic byte pattern (wraps at 256).
        std::vector<char> Data(in_B * in_F * in_X * in_Y);
        for (size_t i = 0; i < Data.size(); i++)
            Data[i] = static_cast<char>(i);
        // Separate copy for the reference run, since set_values() below
        // consumes each vector via std::move.
        std::vector<char> DataGold(Data);

        // Expected "gold" output and IMAD output.
        std::vector<char>  vGoldOutput;
        std::vector<char>  vTestOutput;

        engine   engine;

        // "Golden" Pooling: plain bfyx i8 max pooling as reference
        {
            // Mem initialization
            // This is user data, no kernels here
            auto input = memory::allocate(engine,
                                          { data_types::i8,
                                              format::bfyx,
                                              { in_B, in_F, in_X, in_Y } });
            set_values(input, std::move(DataGold));

            auto pool = pooling("pool_GOLD",
                                 "input",
                                 pooling_mode::max,
                                 { 1, 1, W_X, W_Y },  // kernel_size
                                 { 1, 1, S_X, S_Y }); // stride

            // Create a topology with a single pooling layer
            topology topology(input_layout("input", input.get_layout()),
                              pool);

            // Network processing
            network network(engine, topology);
            network.set_input_data("input", input);
            //network_exe(network, vGoldOutput, "pool_GOLD");
            auto outputs = network.execute();
            auto searchC = outputs.find("pool_GOLD");
            ASSERT_FALSE(searchC == outputs.end());
            // NOTE(review): reads outputs.begin() rather than searchC; the two
            // coincide here only because the network has a single output.
            auto output = outputs.begin()->second.get_memory();
            auto output_ptr = output.pointer<char>();
            vGoldOutput.reserve(output_ptr.size());
            for (size_t i = 0; i < output_ptr.size(); i++)
                vGoldOutput.push_back(output_ptr[i]);
        }

        //
        // IMAD Pooling: same pooling, but routed through b_fs_yx_fsv4
        //
        {
            topology topology;

            // Mem initialization
            // This is user data, no kernels here
            auto input = memory::allocate(engine,
                                          { data_types::i8,
                                              format::bfyx,
                                              { in_B, in_F, in_X, in_Y } });
            set_values(input, std::move(Data));

            // Add input to topology
            topology.add(
                input_layout("input", input.get_layout()));

            // Reorder (a.k.a. swizzling) input to the MMAD/IMAD pooling format
            topology.add(reorder("reorder_Swizzelled",
                         "input",
                         layout(data_types::i8,
                                format::b_fs_yx_fsv4,
                                { in_B, in_F, in_X, in_Y })));

            // Add pooling to topology
            topology.add(pooling("pool_IMAD",
                                 "reorder_Swizzelled",
                                 pooling_mode::max,
                                 { 1, 1, W_X, W_Y },  // kernel_size
                                 { 1, 1, S_X, S_Y })); // stride

            // Back reordering (a.k.a. unswizzling) output from MMAD/IMAD pooling
            topology.add(reorder("reorder_UnSwizzelled",
                                 "pool_IMAD",
                                 layout(data_types::i8,
                                        format::bfyx,
                                        { in_B, in_F, in_X, in_Y })));

            network network(engine, topology);
            network.set_input_data("input", input);
            //network_exe(network, vTestOutput, "reorder_UnSwizzelled");
            auto outputs = network.execute();
            auto searchC = outputs.find("reorder_UnSwizzelled");
            ASSERT_FALSE(searchC == outputs.end());
            // NOTE(review): same outputs.begin() shortcut as the gold branch.
            auto output = outputs.begin()->second.get_memory();
            auto output_ptr = output.pointer<char>();
            vTestOutput.reserve(output_ptr.size());
            for (size_t i = 0; i < output_ptr.size(); i++)
                vTestOutput.push_back(output_ptr[i]);
        }

        // Result validation: IMAD output must match the reference byte-for-byte.
        ASSERT_TRUE(vGoldOutput.size() == vTestOutput.size());
        for (size_t i = 0; i < vGoldOutput.size(); i++)
            ASSERT_TRUE(vTestOutput[i] == vGoldOutput[i]);

    } // for (int j = 0; F_array[j]; j++)
}
1667
1668
1669 class pooling_test : public tests::generic_test
1670 {
1671
1672 public:
1673
1674     static void TearDownTestCase()
1675     {
1676         for (auto generic_params : all_generic_params)
1677         {
1678             delete generic_params;
1679         }
1680
1681         for (auto layer_params : all_layer_params)
1682         {
1683             delete layer_params;
1684         }
1685     }
1686
1687     static tensor generate_input_offset(int x, int y, const tensor& window_size)
1688     {
1689         return tensor(0, 0, -std::min(x, window_size.spatial[0] - 1), -std::min(y, window_size.spatial[1] - 1));
1690     }
1691
1692     static std::vector<cldnn::primitive*> generate_specific_test_params()
1693     {
1694         std::vector<pooling_mode> pooling_modes = { pooling_mode::max, pooling_mode::average, pooling_mode::average_no_padding };
1695
1696         std::vector<tensor> sizes = { tensor(1, 1, 2, 2 ), tensor(1, 1, 3, 3), tensor(1, 1, 7, 4) };
1697
1698         std::vector<tensor> strides = { tensor(1, 1, 1, 1), tensor(1, 1, 2, 2), tensor(1, 1, 4, 3) };
1699
1700         for (auto pooling_mode : pooling_modes)
1701         {
1702             for (auto size : sizes)
1703             {
1704                 for (auto stride : strides)
1705                 {
1706                     // No padding
1707                     all_layer_params.push_back(new pooling("pooling", "input0", pooling_mode, size, stride));
1708                     all_layer_params.push_back(new pooling("pooling", "input0", pooling_mode, size, stride, generate_input_offset(4, 3, size)));
1709
1710                     // Input padding
1711                     all_layer_params.push_back(new pooling("pooling", "reorder0", pooling_mode, size, stride));
1712
1713                     // Output padding
1714                     all_layer_params.push_back(new pooling("pooling", "input0", pooling_mode, size, stride, generate_input_offset(2, 3, size), { { 0, 0, 1, 5 },{ 0, 0, 19, 4 } }));
1715
1716                     // Input + output padding
1717                     all_layer_params.push_back(new pooling("pooling", "reorder0", pooling_mode, size, stride, generate_input_offset(2, 3, size), { { 0, 0, 2, 1 },{ 0, 0, 3, 4 } }));
1718                 }
1719             }
1720         }
1721
1722         // This case tests the pooling_gpu_bfyx_average_opt kernel.
1723         all_layer_params.push_back(new pooling("pooling", "input0", pooling_mode::average, tensor(1, 1, 3, 3), tensor(1, 1, 1, 1), generate_input_offset(1, 1, tensor(1, 1, 3, 3))));
1724
1725         return all_layer_params;
1726     }
1727
1728     static std::vector<tests::test_params*> generate_generic_test_params()
1729     {
1730         return generic_test::generate_generic_test_params(all_generic_params);
1731     }
1732
1733     virtual bool is_format_supported(cldnn::format format)
1734     {
1735         if ((format == cldnn_format_type::cldnn_format_yxfb) || (format == cldnn_format_type::cldnn_format_bfyx) || (format == cldnn_format_type::cldnn_format_byxf))
1736         {
1737             return true;
1738         }
1739         return false;
1740     }
1741
1742     virtual void prepare_input_for_test(std::vector<cldnn::memory>& inputs)
1743     {
1744         if (generic_params->data_type == data_types::f32)
1745         {
1746             prepare_input_for_test_typed<float>(inputs);
1747         }
1748         else
1749         {
1750             prepare_input_for_test_typed<FLOAT16>(inputs);
1751         }
1752     }
1753
1754     template<typename Type>
1755     void prepare_input_for_test_typed(std::vector<cldnn::memory>& inputs)
1756     {
1757         int k = (generic_params->data_type == data_types::f32) ? 8 : 4;
1758         auto input = inputs[0];
1759         auto input_size = inputs[0].get_layout().size;
1760         VVVVF<Type> input_rnd = generate_random_4d<Type>(input_size.batch[0], input_size.feature[0], input_size.spatial[1], input_size.spatial[0], -10, 10, k);
1761         VF<Type> input_rnd_vec = flatten_4d<Type>(input.get_layout().format, input_rnd);
1762         set_values(input, input_rnd_vec);
1763     }
1764
1765     virtual cldnn::tensor get_expected_output_tensor()
1766     {
1767         const cldnn::pooling* pooling = (cldnn::pooling*)layer_params;
1768
1769         int batch = generic_params->input_layouts[0].size.batch[0];
1770         int feature = generic_params->input_layouts[0].size.feature[0];
1771         int height = generic_params->input_layouts[0].size.spatial[1];
1772         int width = generic_params->input_layouts[0].size.spatial[0];
1773
1774         int input_offset_height = pooling->input_offset.spatial[1];
1775         int input_offset_width = pooling->input_offset.spatial[0];
1776
1777         int kernel_height = pooling->size.spatial[1];
1778         int kernel_width = pooling->size.spatial[0];
1779
1780         int stride_height = pooling->stride.spatial[1];
1781         int stride_width = pooling->stride.spatial[0];
1782
1783         int pooled_height = (int)(ceil((float)std::max(height - 2 * input_offset_height - kernel_height, 0) / stride_height)) + 1;
1784         int pooled_width = (int)(ceil((float)std::max(width - 2 * input_offset_width - kernel_width, 0) / stride_width)) + 1;
1785         
1786         // Make sure that the last pooling starts strictly inside the image.
1787         while ((pooled_height - 1) * stride_height >= height - input_offset_height) 
1788         {
1789             --pooled_height;
1790         }
1791         while ((pooled_width - 1) * stride_width >= width - input_offset_width) 
1792         {
1793             --pooled_width;
1794         }
1795
1796         return cldnn::tensor(batch, feature, pooled_width, pooled_height);
1797     }
1798
1799     template<typename Type>
1800     memory generate_reference_typed(const std::vector<cldnn::memory>& inputs)
1801     {
1802         const cldnn::pooling* pooling = (cldnn::pooling*)layer_params;
1803
1804         int batch = inputs[0].get_layout().size.batch[0];
1805         int feature = inputs[0].get_layout().size.feature[0];
1806         int height = inputs[0].get_layout().size.spatial[1];
1807         int width = inputs[0].get_layout().size.spatial[0];
1808
1809
1810
1811         cldnn::pooling_mode pooling_mode = pooling->mode;
1812
1813         int input_offset_width = pooling->input_offset.spatial[0];
1814         int input_offset_height = pooling->input_offset.spatial[1];
1815         
1816         int kernel_width = pooling->size.spatial[0];
1817         int kernel_height = pooling->size.spatial[1];
1818         
1819         int stride_width = pooling->stride.spatial[0];
1820         int stride_height = pooling->stride.spatial[1];
1821         
1822         auto output_tensor = get_expected_output_tensor();
1823
1824         int pooled_width = output_tensor.spatial[0];
1825         int pooled_height = output_tensor.spatial[1];
1826          
1827         //Output is bfyx
1828         auto output = memory::allocate(engine, cldnn::layout(inputs[0].get_layout().data_type, cldnn::format::bfyx, output_tensor, pooling->output_padding));
1829
1830         auto input_mem = inputs[0].pointer<Type>();
1831         auto output_mem = output.pointer<Type>();
1832
1833         int output_width = output.get_layout().get_buffer_size().spatial[0];
1834         int output_height = output.get_layout().get_buffer_size().spatial[1];
1835
1836         const auto input_desc = get_linear_memory_desc(inputs[0].get_layout());
1837         const auto output_desc = get_linear_memory_desc(output.get_layout());
1838
1839         switch (pooling_mode)
1840         {
1841             case cldnn::pooling_mode::max:
1842             {
1843                 for (int i = 0; i < (int)output.get_layout().get_buffer_size().count(); i++)
1844                 {
1845                     output_mem[i] = (generic_params->data_type == data_types::f32) ? -FLT_MAX : -65504;
1846                 }
1847                 for (int b = 0; b < batch; b++) 
1848                 {
1849                     for (int f = 0; f < feature; f++) 
1850                     {
1851                         for (int h = 0; h < pooled_height; h++) 
1852                         {
1853                             for (int w = 0; w < pooled_width; w++) 
1854                             {
1855                                 int input_offset_x_start = w * stride_width + input_offset_width;
1856                                 int input_offset_x_end = std::min(input_offset_x_start + kernel_width, width);
1857                                 input_offset_x_start = std::max(input_offset_x_start, 0);
1858
1859                                 int input_offset_y_start = h * stride_height + input_offset_height;
1860                                 int input_offset_y_end = std::min(input_offset_y_start + kernel_height, height);
1861                                 input_offset_y_start = std::max(input_offset_y_start, 0);
1862
1863                                 const size_t output_index = get_linear_index(output.get_layout(), b, f, h, w, output_desc);
1864
1865                                 for (int y = input_offset_y_start; y < input_offset_y_end; y++) 
1866                                 {
1867                                     for (int x = input_offset_x_start; x < input_offset_x_end; x++) 
1868                                     {
1869                                         const size_t input_index = get_linear_index(inputs[0].get_layout(), b, f, y, x, input_desc);
1870                                         
1871                                         if (input_mem[input_index] > output_mem[output_index])
1872                                         {
1873                                             output_mem[output_index] = input_mem[input_index];
1874                                         }
1875                                     }
1876                                 }
1877                             }
1878                         }
1879                     }
1880                 }
1881                 break;
1882             }
1883             case cldnn::pooling_mode::average:
1884             case cldnn::pooling_mode::average_no_padding:
1885             {
1886                 auto kernel_size =  kernel_width * kernel_height;
1887                 auto dynamic_mode = (((output_tensor.spatial[0] - 1) * stride_width) + pooling->size.spatial[0]) > -2 * input_offset_width + width ||
1888                     (((output_tensor.spatial[1] - 1) * stride_height) + pooling->size.spatial[1]) > -2 * input_offset_width + height;
1889
1890                 auto divider = [=](int actual_x, int actual_y) {
1891                     auto x = kernel_width;
1892                     auto y = kernel_height;
1893                     if (dynamic_mode)
1894                     {
1895                         if (actual_x + kernel_width > width + std::abs(input_offset_width))
1896                         {
1897                             x = (width + std::abs(input_offset_width)) - actual_x;
1898                         }
1899                         if (actual_y + kernel_height > height + std::abs(input_offset_height))
1900                         {
1901                             y = (height + std::abs(input_offset_height)) - actual_y;
1902                         }
1903                     }
1904                     return y*x;
1905                 };
1906
1907                 for (int i = 0; i < (int)output.get_layout().get_buffer_size().count(); i++)
1908                 {
1909                     output_mem[i] = 0;
1910                 }
1911                 for (int b = 0; b < batch; b++) 
1912                 {
1913                     for (int f = 0; f < feature; f++) 
1914                     {
1915                         for (int h = 0; h < pooled_height; h++) 
1916                         {
1917                             for (int w = 0; w < pooled_width; w++) 
1918                             {   
1919                                 int input_offset_x_start = w * stride_width + input_offset_width;
1920                                 int input_offset_x_end = std::min(input_offset_x_start + kernel_width, width);
1921                                 input_offset_x_start = std::max(input_offset_x_start, 0);
1922
1923                                 int input_offset_y_start = h * stride_height + input_offset_height;
1924                                 int input_offset_y_end = std::min(input_offset_y_start + kernel_height, height);
1925                                 input_offset_y_start = std::max(input_offset_y_start, 0);       
1926
1927                                 int output_index = (b * feature + f) * output_height * output_width;
1928                                 tensor lower_padding = pooling->output_padding.lower_size();
1929                                 output_index += (lower_padding.spatial[1] + h) * output_width + lower_padding.spatial[0] + w;
1930
1931                                 int num_of_elements = 0;
1932                                 for (int y = input_offset_y_start; y < input_offset_y_end; y++) 
1933                                 {
1934                                     for (int x = input_offset_x_start; x < input_offset_x_end; x++) 
1935                                     {
1936                                         const size_t input_index = get_linear_index(inputs[0].get_layout(), b, f, y, x, input_desc);
1937                                         output_mem[output_index] += input_mem[input_index];
1938                                         if (!dynamic_mode || pooling_mode == cldnn::pooling_mode::average_no_padding)
1939                                         {
1940                                             num_of_elements++;
1941                                         }
1942                                     }
1943                                 }
1944                                 if (pooling_mode == cldnn::pooling_mode::average)
1945                                 {
1946                                         num_of_elements = divider(input_offset_x_start, input_offset_y_start);
1947                                 }
1948                                 if (num_of_elements == 0)
1949                                 {
1950                                     assert(0);
1951                                     return output;
1952                                 }
1953                                 output_mem[output_index] /= (Type)num_of_elements;
1954  
1955                             }
1956                         }
1957                     }
1958                 }
1959                 break;
1960             }
1961             default:
1962             {
1963                 assert(0);
1964             }
1965         }
1966
1967         return output;
1968     }
1969
1970     virtual memory generate_reference(const std::vector<cldnn::memory>& inputs)
1971     {
1972         if (generic_params->data_type == data_types::f32)
1973         {
1974             return generate_reference_typed<float>(inputs);
1975         }
1976         else
1977         {
1978             return generate_reference_typed<FLOAT16>(inputs);
1979         }
1980     }
1981
1982 private:
1983
1984     static std::vector<tests::test_params*> all_generic_params;
1985     static std::vector<cldnn::primitive*> all_layer_params;
1986
1987 };
1988
1989 std::vector<cldnn::primitive*> pooling_test::all_layer_params = {};
1990 std::vector<tests::test_params*> pooling_test::all_generic_params = {};
1991
// Instantiated once per (generic_params, layer_params) combination;
// run_single_test() (from tests::generic_test) builds the network, executes it,
// and compares the GPU output against generate_reference().
TEST_P(pooling_test, POOLING)
{
    run_single_test();
}
1996
// The DISABLED_ prefix excludes this exhaustive parameter sweep from the default
// test run; execute it explicitly with --gtest_also_run_disabled_tests.
INSTANTIATE_TEST_CASE_P(DISABLED_POOLING,
                        pooling_test,
                        ::testing::Combine(::testing::ValuesIn(pooling_test::generate_generic_test_params()),
                                           ::testing::ValuesIn(pooling_test::generate_specific_test_params())),
                        tests::generic_test::custom_param_name_functor());