/*******************************************************************************
* Copyright 2016-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <chrono>
#include <iostream>
#include <numeric>
#include <string>

#include "mkldnn.hpp"

using namespace mkldnn;

using namespace std;

void simple_net(int times = 100) {

    auto cpu_engine = engine(engine::cpu, 0);

    /* Create vectors of primitives to hold the network. For efficiency,
     * weights reorders are kept in a separate net so they are executed
     * only once, before the timed runs. */
    std::vector<primitive> net;
    std::vector<primitive> net_weights;

    const int batch = 1;

    /* AlexNet: conv1
     * {batch, 3, 227, 227} (x) {96, 3, 11, 11} -> {batch, 96, 55, 55}
     * strides: {4, 4}
     */
    memory::dims conv1_src_tz = { batch, 3, 227, 227 };
    memory::dims conv1_weights_tz = { 96, 3, 11, 11 };
    memory::dims conv1_bias_tz = { 96 };
    memory::dims conv1_dst_tz = { batch, 96, 55, 55 };
    memory::dims conv1_strides = { 4, 4 };
    memory::dims conv1_padding = { 0, 0 };
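    /* Sanity check on the output size: (227 - 11 + 2 * 0) / 4 + 1 = 55. */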

    /* Allocate input and output buffers for user data */
    std::vector<float> user_src(batch * 3 * 227 * 227);
    std::vector<float> user_dst(batch * 1000);

    /* Allocate and fill buffers for weights and bias */
    std::vector<float> conv1_weights(std::accumulate(
            conv1_weights_tz.begin(), conv1_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv1_bias(std::accumulate(conv1_bias_tz.begin(),
            conv1_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto user_src_memory
            = memory({ { { conv1_src_tz }, memory::data_type::f32,
                               memory::format::nchw },
                             cpu_engine },
                    user_src.data());
    auto user_weights_memory
            = memory({ { { conv1_weights_tz }, memory::data_type::f32,
                               memory::format::oihw },
                             cpu_engine },
                    conv1_weights.data());
    auto user_bias_memory = memory(
            { { { conv1_bias_tz }, memory::data_type::f32, memory::format::x },
                    cpu_engine },
            conv1_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv1_src_md = memory::desc(
            { conv1_src_tz }, memory::data_type::f32, memory::format::any);
    auto conv1_bias_md = memory::desc(
            { conv1_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv1_weights_md = memory::desc(
            { conv1_weights_tz }, memory::data_type::f32, memory::format::any);
    auto conv1_dst_md = memory::desc(
            { conv1_dst_tz }, memory::data_type::f32, memory::format::any);
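    /* Note: memory::format::any lets the convolution primitive pick whatever
     * (possibly blocked) layout it considers fastest; the primitive descriptor
     * created below reports the chosen formats, and reorders are inserted
     * whenever they differ from the user's plain nchw/oihw buffers. */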

    /* create a convolution */
    auto conv1_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct, conv1_src_md,
            conv1_weights_md, conv1_bias_md, conv1_dst_md, conv1_strides,
            conv1_padding, conv1_padding, padding_kind::zero);
    auto conv1_prim_desc
            = convolution_forward::primitive_desc(conv1_desc, cpu_engine);

    /* create reorders for data and weights if layout requested by
     * convolution is different from NCHW/OIHW */
    auto conv1_src_memory = user_src_memory;
    if (memory::primitive_desc(conv1_prim_desc.src_primitive_desc())
            != user_src_memory.get_primitive_desc()) {
        conv1_src_memory = memory(conv1_prim_desc.src_primitive_desc());
        net.push_back(reorder(user_src_memory, conv1_src_memory));
    }

    auto conv1_weights_memory = user_weights_memory;
    if (memory::primitive_desc(conv1_prim_desc.weights_primitive_desc())
            != user_weights_memory.get_primitive_desc()) {
        conv1_weights_memory
                = memory(conv1_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(user_weights_memory, conv1_weights_memory));
    }

    auto conv1_dst_memory = memory(conv1_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv1_prim_desc, conv1_src_memory,
            conv1_weights_memory, user_bias_memory,
            conv1_dst_memory));

    /* AlexNet: relu1
     * {batch, 96, 55, 55} -> {batch, 96, 55, 55}
     */
    const float negative1_slope = 1.0f;
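    /* Note: a negative slope of 1.0f makes eltwise_relu pass negative values
     * through unchanged (effectively an identity); a conventional ReLU would
     * use a slope of 0.0f. */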

    /* create relu primitive and add it to net */
    auto relu1_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv1_dst_memory.get_primitive_desc().desc(), negative1_slope);
    auto relu1_prim_desc
            = eltwise_forward::primitive_desc(relu1_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu1_prim_desc, conv1_dst_memory, conv1_dst_memory));

    /* AlexNet: lrn1
     * {batch, 96, 55, 55} -> {batch, 96, 55, 55}
     * local size: 5
     * alpha1: 0.0001
     * beta1: 0.75
     */
    const uint32_t local1_size = 5;
    const float alpha1 = 0.0001f;
    const float beta1 = 0.75f;
    const float k1 = 1.0f;
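    /* Across-channel LRN computes, roughly,
     *   dst(c) = src(c) / (k + alpha / local_size * sum_{c'} src(c')^2)^beta,
     * with the sum taken over a window of local_size channels around c. */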

    /* create lrn primitive and add it to net */
    auto lrn1_desc = lrn_forward::desc(prop_kind::forward_inference,
            lrn_across_channels,
            conv1_dst_memory.get_primitive_desc().desc(), local1_size,
            alpha1, beta1, k1);
    auto lrn1_prim_desc
            = lrn_forward::primitive_desc(lrn1_desc, cpu_engine);
    auto lrn1_dst_memory = memory(lrn1_prim_desc.dst_primitive_desc());

    net.push_back(
            lrn_forward(lrn1_prim_desc, conv1_dst_memory, lrn1_dst_memory));

    /* AlexNet: pool1
     * {batch, 96, 55, 55} -> {batch, 96, 27, 27}
     * kernel: {3, 3}
     * strides: {2, 2}
     */

    memory::dims pool1_dst_tz = { batch, 96, 27, 27 };
    memory::dims pool1_kernel = { 3, 3 };
    memory::dims pool1_strides = { 2, 2 };
    memory::dims pool_padding = { 0, 0 };

    auto pool1_dst_md = memory::desc(
            { pool1_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a pooling */
    auto pool1_desc = pooling_forward::desc(prop_kind::forward_inference,
            pooling_max, lrn1_dst_memory.get_primitive_desc().desc(),
            pool1_dst_md, pool1_strides, pool1_kernel, pool_padding,
            pool_padding, padding_kind::zero);
    auto pool1_pd = pooling_forward::primitive_desc(pool1_desc, cpu_engine);
    auto pool1_dst_memory = memory(pool1_pd.dst_primitive_desc());

    /* create pooling primitive and add it to net */
    net.push_back(
            pooling_forward(pool1_pd, lrn1_dst_memory, pool1_dst_memory));

    /* AlexNet: conv2
     * {batch, 96, 27, 27} (x) {2, 128, 48, 5, 5} -> {batch, 256, 27, 27}
     * strides: {1, 1}
     */
    memory::dims conv2_src_tz = { batch, 96, 27, 27 };
    memory::dims conv2_weights_tz = { 2, 128, 48, 5, 5 };
    memory::dims conv2_bias_tz = { 256 };
    memory::dims conv2_dst_tz = { batch, 256, 27, 27 };
    memory::dims conv2_strides = { 1, 1 };
    memory::dims conv2_padding = { 2, 2 };
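    /* conv2 is a grouped convolution with 2 groups, so the weights dims follow
     * the {groups, oc/groups, ic/groups, kh, kw} convention that matches
     * memory::format::goihw: 2 groups of 128 output and 48 input channels. */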

    std::vector<float> conv2_weights(std::accumulate(
            conv2_weights_tz.begin(), conv2_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv2_bias(std::accumulate(conv2_bias_tz.begin(),
            conv2_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto conv2_user_weights_memory
            = memory({ { { conv2_weights_tz }, memory::data_type::f32,
                               memory::format::goihw },
                             cpu_engine },
                    conv2_weights.data());
    auto conv2_user_bias_memory
            = memory({ { { conv2_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    conv2_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv2_src_md = memory::desc(
            { conv2_src_tz }, memory::data_type::f32, memory::format::any);
    auto conv2_bias_md = memory::desc(
            { conv2_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv2_weights_md = memory::desc({ conv2_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto conv2_dst_md = memory::desc(
            { conv2_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a convolution */
    auto conv2_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct, conv2_src_md,
            conv2_weights_md, conv2_bias_md, conv2_dst_md, conv2_strides,
            conv2_padding, conv2_padding, padding_kind::zero);
    auto conv2_prim_desc
            = convolution_forward::primitive_desc(conv2_desc, cpu_engine);

    auto conv2_src_memory = pool1_dst_memory;
    if (memory::primitive_desc(conv2_prim_desc.src_primitive_desc())
            != conv2_src_memory.get_primitive_desc()) {
        conv2_src_memory = memory(conv2_prim_desc.src_primitive_desc());
        net.push_back(reorder(pool1_dst_memory, conv2_src_memory));
    }

    auto conv2_weights_memory = conv2_user_weights_memory;
    if (memory::primitive_desc(conv2_prim_desc.weights_primitive_desc())
            != conv2_user_weights_memory.get_primitive_desc()) {
        conv2_weights_memory
                = memory(conv2_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(conv2_user_weights_memory, conv2_weights_memory));
    }

    auto conv2_dst_memory = memory(conv2_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv2_prim_desc, conv2_src_memory,
            conv2_weights_memory, conv2_user_bias_memory,
            conv2_dst_memory));

    /* AlexNet: relu2
     * {batch, 256, 27, 27} -> {batch, 256, 27, 27}
     */
    const float negative2_slope = 1.0f;

    /* create relu primitive and add it to net */
    auto relu2_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv2_dst_memory.get_primitive_desc().desc(), negative2_slope);
    auto relu2_prim_desc
            = eltwise_forward::primitive_desc(relu2_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu2_prim_desc, conv2_dst_memory, conv2_dst_memory));

    /* AlexNet: lrn2
     * {batch, 256, 27, 27} -> {batch, 256, 27, 27}
     * local size: 5
     * alpha2: 0.0001
     * beta2: 0.75
     */
    const uint32_t local2_size = 5;
    const float alpha2 = 0.0001f;
    const float beta2 = 0.75f;
    const float k2 = 1.0f;

    /* create lrn primitive and add it to net */
    auto lrn2_desc = lrn_forward::desc(prop_kind::forward_inference,
            lrn_across_channels,
            conv2_prim_desc.dst_primitive_desc().desc(), local2_size,
            alpha2, beta2, k2);
    auto lrn2_prim_desc
            = lrn_forward::primitive_desc(lrn2_desc, cpu_engine);
    auto lrn2_dst_memory = memory(lrn2_prim_desc.dst_primitive_desc());

    net.push_back(
            lrn_forward(lrn2_prim_desc, conv2_dst_memory, lrn2_dst_memory));

    /* AlexNet: pool2
     * {batch, 256, 27, 27} -> {batch, 256, 13, 13}
     * kernel: {3, 3}
     * strides: {2, 2}
     */

    memory::dims pool2_dst_tz = { batch, 256, 13, 13 };
    memory::dims pool2_kernel = { 3, 3 };
    memory::dims pool2_strides = { 2, 2 };
    memory::dims pool2_padding = { 0, 0 };

    auto pool2_dst_md = memory::desc(
            { pool2_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a pooling */
    auto pool2_desc = pooling_forward::desc(prop_kind::forward_inference,
            pooling_max, lrn2_dst_memory.get_primitive_desc().desc(),
            pool2_dst_md, pool2_strides, pool2_kernel, pool2_padding,
            pool2_padding, padding_kind::zero);
    auto pool2_pd = pooling_forward::primitive_desc(pool2_desc, cpu_engine);

    auto pool2_dst_memory = memory(pool2_pd.dst_primitive_desc());

    /* create pooling primitive and add it to net */
    net.push_back(
            pooling_forward(pool2_pd, lrn2_dst_memory, pool2_dst_memory));

    // -------
    /* AlexNet: conv3
     * {batch, 256, 13, 13} (x) {384, 256, 3, 3} -> {batch, 384, 13, 13}
     * strides: {1, 1}
     */
    memory::dims conv3_src_tz = { batch, 256, 13, 13 };
    memory::dims conv3_weights_tz = { 384, 256, 3, 3 };
    memory::dims conv3_bias_tz = { 384 };
    memory::dims conv3_dst_tz = { batch, 384, 13, 13 };
    memory::dims conv3_strides = { 1, 1 };
    memory::dims conv3_padding = { 1, 1 };

    std::vector<float> conv3_weights(std::accumulate(
            conv3_weights_tz.begin(), conv3_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv3_bias(std::accumulate(conv3_bias_tz.begin(),
            conv3_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto conv3_user_weights_memory
            = memory({ { { conv3_weights_tz }, memory::data_type::f32,
                               memory::format::oihw },
                             cpu_engine },
                    conv3_weights.data());
    auto conv3_user_bias_memory
            = memory({ { { conv3_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    conv3_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv3_src_md = memory::desc(
            { conv3_src_tz }, memory::data_type::f32, memory::format::any);
    auto conv3_bias_md = memory::desc(
            { conv3_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv3_weights_md = memory::desc({ conv3_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto conv3_dst_md = memory::desc(
            { conv3_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a convolution */
    auto conv3_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct, conv3_src_md,
            conv3_weights_md, conv3_bias_md, conv3_dst_md, conv3_strides,
            conv3_padding, conv3_padding, padding_kind::zero);
    auto conv3_prim_desc
            = convolution_forward::primitive_desc(conv3_desc, cpu_engine);

    auto conv3_src_memory = pool2_dst_memory;
    if (memory::primitive_desc(conv3_prim_desc.src_primitive_desc())
            != conv3_src_memory.get_primitive_desc()) {
        conv3_src_memory = memory(conv3_prim_desc.src_primitive_desc());
        net.push_back(reorder(pool2_dst_memory, conv3_src_memory));
    }

    auto conv3_weights_memory = conv3_user_weights_memory;
    if (memory::primitive_desc(conv3_prim_desc.weights_primitive_desc())
            != conv3_user_weights_memory.get_primitive_desc()) {
        conv3_weights_memory
                = memory(conv3_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(conv3_user_weights_memory, conv3_weights_memory));
    }

    auto conv3_dst_memory = memory(conv3_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv3_prim_desc, conv3_src_memory,
            conv3_weights_memory, conv3_user_bias_memory,
            conv3_dst_memory));

    /* AlexNet: relu3
     * {batch, 384, 13, 13} -> {batch, 384, 13, 13}
     */
    const float negative3_slope = 1.0f;

    /* create relu primitive and add it to net */
    auto relu3_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv3_dst_memory.get_primitive_desc().desc(), negative3_slope);
    auto relu3_prim_desc
            = eltwise_forward::primitive_desc(relu3_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu3_prim_desc, conv3_dst_memory, conv3_dst_memory));

    /* AlexNet: conv4
     * {batch, 384, 13, 13} (x) {2, 192, 192, 3, 3} -> {batch, 384, 13, 13}
     * strides: {1, 1}
     */
    memory::dims conv4_src_tz = { batch, 384, 13, 13 };
    memory::dims conv4_weights_tz = { 2, 192, 192, 3, 3 };
    memory::dims conv4_bias_tz = { 384 };
    memory::dims conv4_dst_tz = { batch, 384, 13, 13 };
    memory::dims conv4_strides = { 1, 1 };
    memory::dims conv4_padding = { 1, 1 };

    std::vector<float> conv4_weights(std::accumulate(
            conv4_weights_tz.begin(), conv4_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv4_bias(std::accumulate(conv4_bias_tz.begin(),
            conv4_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto conv4_user_weights_memory
            = memory({ { { conv4_weights_tz }, memory::data_type::f32,
                               memory::format::goihw },
                             cpu_engine },
                    conv4_weights.data());
    auto conv4_user_bias_memory
            = memory({ { { conv4_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    conv4_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv4_src_md = memory::desc(
            { conv4_src_tz }, memory::data_type::f32, memory::format::any);
    auto conv4_bias_md = memory::desc(
            { conv4_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv4_weights_md = memory::desc({ conv4_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto conv4_dst_md = memory::desc(
            { conv4_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a convolution */
    auto conv4_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct, conv4_src_md,
            conv4_weights_md, conv4_bias_md, conv4_dst_md, conv4_strides,
            conv4_padding, conv4_padding, padding_kind::zero);
    auto conv4_prim_desc
            = convolution_forward::primitive_desc(conv4_desc, cpu_engine);

    auto conv4_src_memory = conv3_dst_memory;
    if (memory::primitive_desc(conv4_prim_desc.src_primitive_desc())
            != conv4_src_memory.get_primitive_desc()) {
        conv4_src_memory = memory(conv4_prim_desc.src_primitive_desc());
        net.push_back(reorder(conv3_dst_memory, conv4_src_memory));
    }

    auto conv4_weights_memory = conv4_user_weights_memory;
    if (memory::primitive_desc(conv4_prim_desc.weights_primitive_desc())
            != conv4_user_weights_memory.get_primitive_desc()) {
        conv4_weights_memory
                = memory(conv4_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(conv4_user_weights_memory, conv4_weights_memory));
    }

    auto conv4_dst_memory = memory(conv4_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv4_prim_desc, conv4_src_memory,
            conv4_weights_memory, conv4_user_bias_memory,
            conv4_dst_memory));

    /* AlexNet: relu4
     * {batch, 384, 13, 13} -> {batch, 384, 13, 13}
     */
    const float negative4_slope = 1.0f;

    /* create relu primitive and add it to net */
    auto relu4_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv4_dst_memory.get_primitive_desc().desc(), negative4_slope);
    auto relu4_prim_desc
            = eltwise_forward::primitive_desc(relu4_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu4_prim_desc, conv4_dst_memory, conv4_dst_memory));

    /* AlexNet: conv5
     * {batch, 384, 13, 13} (x) {2, 128, 192, 3, 3} -> {batch, 256, 13, 13}
     * strides: {1, 1}
     */
    memory::dims conv5_weights_tz = { 2, 128, 192, 3, 3 };
    memory::dims conv5_bias_tz = { 256 };
    memory::dims conv5_dst_tz = { batch, 256, 13, 13 };
    memory::dims conv5_strides = { 1, 1 };
    memory::dims conv5_padding = { 1, 1 };

    std::vector<float> conv5_weights(std::accumulate(
            conv5_weights_tz.begin(), conv5_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv5_bias(std::accumulate(conv5_bias_tz.begin(),
            conv5_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto conv5_user_weights_memory
            = memory({ { { conv5_weights_tz }, memory::data_type::f32,
                               memory::format::goihw },
                             cpu_engine },
                    conv5_weights.data());
    auto conv5_user_bias_memory
            = memory({ { { conv5_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    conv5_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv5_bias_md = memory::desc(
            { conv5_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv5_weights_md = memory::desc({ conv5_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto conv5_dst_md = memory::desc(
            { conv5_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a convolution */
    auto conv5_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct,
            conv4_dst_memory.get_primitive_desc().desc(), conv5_weights_md,
            conv5_bias_md, conv5_dst_md, conv5_strides, conv5_padding,
            conv5_padding, padding_kind::zero);
    auto conv5_prim_desc
            = convolution_forward::primitive_desc(conv5_desc, cpu_engine);

    auto conv5_src_memory = conv4_dst_memory;
    if (memory::primitive_desc(conv5_prim_desc.src_primitive_desc())
            != conv5_src_memory.get_primitive_desc()) {
        conv5_src_memory = memory(conv5_prim_desc.src_primitive_desc());
        net.push_back(reorder(conv4_dst_memory, conv5_src_memory));
    }

    auto conv5_weights_memory = conv5_user_weights_memory;
    if (memory::primitive_desc(conv5_prim_desc.weights_primitive_desc())
            != conv5_user_weights_memory.get_primitive_desc()) {
        conv5_weights_memory
                = memory(conv5_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(conv5_user_weights_memory, conv5_weights_memory));
    }

    auto conv5_dst_memory = memory(conv5_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv5_prim_desc, conv5_src_memory,
            conv5_weights_memory, conv5_user_bias_memory,
            conv5_dst_memory));

    /* AlexNet: relu5
     * {batch, 256, 13, 13} -> {batch, 256, 13, 13}
     */
    const float negative5_slope = 1.0f;

    /* create relu primitive and add it to net */
    auto relu5_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv5_dst_memory.get_primitive_desc().desc(), negative5_slope);
    auto relu5_prim_desc
            = eltwise_forward::primitive_desc(relu5_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu5_prim_desc, conv5_dst_memory, conv5_dst_memory));

    /* AlexNet: pool5
     * {batch, 256, 13, 13} -> {batch, 256, 6, 6}
     * kernel: {3, 3}
     * strides: {2, 2}
     */

    memory::dims pool5_dst_tz = { batch, 256, 6, 6 };
    memory::dims pool5_kernel = { 3, 3 };
    memory::dims pool5_strides = { 2, 2 };
    memory::dims pool5_padding = { 0, 0 };

    std::vector<float> pool5_dst(std::accumulate(pool5_dst_tz.begin(),
            pool5_dst_tz.end(), 1, std::multiplies<uint32_t>()));

    auto pool5_dst_md = memory::desc(
            { pool5_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a pooling */
    auto pool5_desc = pooling_forward::desc(prop_kind::forward_inference,
            pooling_max, conv5_dst_memory.get_primitive_desc().desc(),
            pool5_dst_md, pool5_strides, pool5_kernel, pool5_padding,
            pool5_padding, padding_kind::zero);
    auto pool5_pd = pooling_forward::primitive_desc(pool5_desc, cpu_engine);

    auto pool5_dst_memory = memory(pool5_pd.dst_primitive_desc());

    /* create pooling primitive and add it to net */
    net.push_back(
            pooling_forward(pool5_pd, conv5_dst_memory, pool5_dst_memory));

    /**
     * fc6 inner product {batch, 256, 6, 6} (x) {4096, 256, 6, 6} ->
     * {batch, 4096}
     */
    memory::dims fc6_src_tz = { batch, 256, 6, 6 };
    memory::dims fc6_weights_tz = { 4096, 256, 6, 6 };
    memory::dims fc6_bias_tz = { 4096 };
    memory::dims fc6_dst_tz = { batch, 4096 };
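    /* The inner product treats the 4D {batch, 256, 6, 6} source as a
     * flattened {batch, 256 * 6 * 6} activation, so the weights keep the
     * matching {4096, 256, 6, 6} shape. */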

    std::vector<float> fc6_weights(std::accumulate(fc6_weights_tz.begin(),
            fc6_weights_tz.end(), 1, std::multiplies<uint32_t>()));
    std::vector<float> fc6_bias(std::accumulate(fc6_bias_tz.begin(),
            fc6_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto fc6_user_weights_memory
            = memory({ { { fc6_weights_tz }, memory::data_type::f32,
                               memory::format::oihw },
                             cpu_engine },
                    fc6_weights.data());

    auto fc6_user_bias_memory
            = memory({ { { fc6_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    fc6_bias.data());

    /* create memory descriptors for inner product data w/ no specified
     * format */
    auto fc6_src_md = memory::desc(
            { fc6_src_tz }, memory::data_type::f32, memory::format::any);
    auto fc6_bias_md = memory::desc(
            { fc6_bias_tz }, memory::data_type::f32, memory::format::any);
    auto fc6_weights_md = memory::desc({ fc6_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto fc6_dst_md = memory::desc(
            { fc6_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create an inner product */
    auto fc6_desc
            = inner_product_forward::desc(prop_kind::forward_inference,
                    fc6_src_md, fc6_weights_md, fc6_bias_md, fc6_dst_md);
    auto fc6_prim_desc
            = inner_product_forward::primitive_desc(fc6_desc, cpu_engine);

    auto fc6_src_memory = pool5_dst_memory;
    if (memory::primitive_desc(fc6_prim_desc.src_primitive_desc())
            != fc6_src_memory.get_primitive_desc()) {
        fc6_src_memory = memory(fc6_prim_desc.src_primitive_desc());
        net.push_back(reorder(pool5_dst_memory, fc6_src_memory));
    }

    auto fc6_weights_memory = fc6_user_weights_memory;
    if (memory::primitive_desc(fc6_prim_desc.weights_primitive_desc())
            != fc6_user_weights_memory.get_primitive_desc()) {
        fc6_weights_memory = memory(fc6_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(fc6_user_weights_memory, fc6_weights_memory));
    }

    auto fc6_dst_memory = memory(fc6_prim_desc.dst_primitive_desc());

    /* create inner product primitive and add it to net */
    net.push_back(inner_product_forward(fc6_prim_desc, fc6_src_memory,
            fc6_weights_memory, fc6_user_bias_memory, fc6_dst_memory));

    /**
     * fc7 inner product {batch, 4096} (x) {4096, 4096} -> {batch, 4096}
     */
    memory::dims fc7_weights_tz = { 4096, 4096 };
    memory::dims fc7_bias_tz = { 4096 };
    memory::dims fc7_dst_tz = { batch, 4096 };

    std::vector<float> fc7_weights(std::accumulate(fc7_weights_tz.begin(),
            fc7_weights_tz.end(), 1, std::multiplies<uint32_t>()));
    std::vector<float> fc7_bias(std::accumulate(fc7_bias_tz.begin(),
            fc7_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto fc7_user_weights_memory
            = memory({ { { fc7_weights_tz }, memory::data_type::f32,
                               memory::format::nc },
                             cpu_engine },
                    fc7_weights.data());

    auto fc7_user_bias_memory
            = memory({ { { fc7_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    fc7_bias.data());

    /* create memory descriptors for inner product data w/ no specified
     * format */
    auto fc7_bias_md = memory::desc(
            { fc7_bias_tz }, memory::data_type::f32, memory::format::any);
    auto fc7_weights_md = memory::desc({ fc7_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto fc7_dst_md = memory::desc(
            { fc7_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create an inner product */
    auto fc7_desc
            = inner_product_forward::desc(prop_kind::forward_inference,
                    fc6_dst_memory.get_primitive_desc().desc(),
                    fc7_weights_md, fc7_bias_md, fc7_dst_md);
    auto fc7_prim_desc
            = inner_product_forward::primitive_desc(fc7_desc, cpu_engine);

    /* weights reorders belong in net_weights so they run only once */
    auto fc7_weights_memory = fc7_user_weights_memory;
    if (memory::primitive_desc(fc7_prim_desc.weights_primitive_desc())
            != fc7_user_weights_memory.get_primitive_desc()) {
        fc7_weights_memory = memory(fc7_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(fc7_user_weights_memory, fc7_weights_memory));
    }

    auto fc7_dst_memory = memory(fc7_prim_desc.dst_primitive_desc());

    /* create inner product primitive and add it to net */
    net.push_back(inner_product_forward(fc7_prim_desc, fc6_dst_memory,
            fc7_weights_memory, fc7_user_bias_memory, fc7_dst_memory));

    /**
     * fc8 inner product {batch, 4096} (x) {1000, 4096} -> {batch, 1000}
     */
    memory::dims fc8_weights_tz = { 1000, 4096 };
    memory::dims fc8_bias_tz = { 1000 };
    memory::dims fc8_dst_tz = { batch, 1000 };

    std::vector<float> fc8_weights(std::accumulate(fc8_weights_tz.begin(),
            fc8_weights_tz.end(), 1, std::multiplies<uint32_t>()));
    std::vector<float> fc8_bias(std::accumulate(fc8_bias_tz.begin(),
            fc8_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto fc8_user_weights_memory
            = memory({ { { fc8_weights_tz }, memory::data_type::f32,
                               memory::format::nc },
                             cpu_engine },
                    fc8_weights.data());

    auto fc8_user_bias_memory
            = memory({ { { fc8_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    fc8_bias.data());

    auto user_dst_memory = memory({ { { fc8_dst_tz }, memory::data_type::f32,
                                           memory::format::nc },
                                         cpu_engine },
            user_dst.data());

    /* create memory descriptors for inner product data w/ no specified
     * format */
    auto fc8_bias_md = memory::desc(
            { fc8_bias_tz }, memory::data_type::f32, memory::format::any);
    auto fc8_weights_md = memory::desc({ fc8_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto fc8_dst_md = memory::desc(
            { fc8_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create an inner product */
    auto fc8_desc
            = inner_product_forward::desc(prop_kind::forward_inference,
                    fc7_dst_memory.get_primitive_desc().desc(),
                    fc8_weights_md, fc8_bias_md, fc8_dst_md);
    auto fc8_prim_desc
            = inner_product_forward::primitive_desc(fc8_desc, cpu_engine);

    auto fc8_weights_memory = fc8_user_weights_memory;
    if (memory::primitive_desc(fc8_prim_desc.weights_primitive_desc())
            != fc8_user_weights_memory.get_primitive_desc()) {
        fc8_weights_memory = memory(fc8_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(fc8_user_weights_memory, fc8_weights_memory));
    }

    auto fc8_dst_memory = memory(fc8_prim_desc.dst_primitive_desc());

    /* create inner product primitive and add it to net */
    net.push_back(inner_product_forward(fc8_prim_desc, fc7_dst_memory,
            fc8_weights_memory, fc8_user_bias_memory, fc8_dst_memory));

    /* create reorder between internal and user data if it is needed and
     * add it to net after the last inner product */
    if (fc8_dst_memory != user_dst_memory) {
        net.push_back(reorder(fc8_dst_memory, user_dst_memory));
    }

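    /* Execute the weight reorders once, then run the main network `times`
     * times. An eager stream executes the submitted primitives immediately,
     * and wait() blocks until they have finished. */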
    stream(stream::kind::eager).submit(net_weights).wait();
    for (int j = 0; j < times; ++j) {
        stream(stream::kind::eager).submit(net).wait();
    }
}

int main(int argc, char **argv) {
    try {
        auto begin = chrono::duration_cast<chrono::milliseconds>(
                             chrono::steady_clock::now().time_since_epoch())
                             .count();
        int times = 1000;
        simple_net(times);
        auto end = chrono::duration_cast<chrono::milliseconds>(
                           chrono::steady_clock::now().time_since_epoch())
                           .count();
        cout << "Average time per iteration: "
             << (end - begin) / (times + 0.0) << " ms\n";
    } catch (error &e) {
        std::cerr << "status: " << e.status << std::endl;
        std::cerr << "message: " << e.message << std::endl;
    }
    return 0;
}