/*******************************************************************************
* Copyright 2016-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <chrono>
#include <iostream>
#include <numeric>
#include <string>

#include "mkldnn.hpp"

using namespace mkldnn;

using namespace std;

void simple_net(int times = 100) {

    auto cpu_engine = engine(engine::cpu, 0);

    /* Create vectors of primitives to hold the network. For efficiency,
     * weights reorders are kept in a separate net so they are executed
     * only once, before the timed runs. */
    std::vector<primitive> net;
    std::vector<primitive> net_weights;

    const int batch = 1;

    /* AlexNet: conv1
     * {batch, 3, 227, 227} (x) {96, 3, 11, 11} -> {batch, 96, 55, 55}
     * strides: {4, 4}
     */
    memory::dims conv1_src_tz = { batch, 3, 227, 227 };
    memory::dims conv1_weights_tz = { 96, 3, 11, 11 };
    memory::dims conv1_bias_tz = { 96 };
    memory::dims conv1_dst_tz = { batch, 96, 55, 55 };
    memory::dims conv1_strides = { 4, 4 };
    memory::dims conv1_padding = { 0, 0 };
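    /* Sanity check on the output size: (227 - 11 + 2 * 0) / 4 + 1 = 55. */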

    /* Allocate input and output buffers for user data */
    std::vector<float> user_src(batch * 3 * 227 * 227);
    std::vector<float> user_dst(batch * 1000);

    /* Allocate and fill buffers for weights and bias */
    std::vector<float> conv1_weights(std::accumulate(
            conv1_weights_tz.begin(), conv1_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv1_bias(std::accumulate(conv1_bias_tz.begin(),
            conv1_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto user_src_memory
            = memory({ { { conv1_src_tz }, memory::data_type::f32,
                               memory::format::nchw },
                             cpu_engine },
                    user_src.data());
    auto user_weights_memory
            = memory({ { { conv1_weights_tz }, memory::data_type::f32,
                               memory::format::oihw },
                             cpu_engine },
                    conv1_weights.data());
    auto user_bias_memory = memory(
            { { { conv1_bias_tz }, memory::data_type::f32, memory::format::x },
                    cpu_engine },
            conv1_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv1_src_md = memory::desc(
            { conv1_src_tz }, memory::data_type::f32, memory::format::any);
    auto conv1_bias_md = memory::desc(
            { conv1_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv1_weights_md = memory::desc(
            { conv1_weights_tz }, memory::data_type::f32, memory::format::any);
    auto conv1_dst_md = memory::desc(
            { conv1_dst_tz }, memory::data_type::f32, memory::format::any);
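    /* Note: memory::format::any lets the convolution primitive pick whatever
     * (possibly blocked) layout it considers fastest; the primitive descriptor
     * created below reports the chosen formats, and reorders are inserted
     * whenever they differ from the user's plain nchw/oihw buffers. */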

    /* create a convolution */
    auto conv1_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct, conv1_src_md,
            conv1_weights_md, conv1_bias_md, conv1_dst_md, conv1_strides,
            conv1_padding, conv1_padding, padding_kind::zero);
    auto conv1_prim_desc
            = convolution_forward::primitive_desc(conv1_desc, cpu_engine);

    /* create reorders for data and weights if layout requested by
     * convolution is different from NCHW/OIHW */
    auto conv1_src_memory = user_src_memory;
    if (memory::primitive_desc(conv1_prim_desc.src_primitive_desc())
            != user_src_memory.get_primitive_desc()) {
        conv1_src_memory = memory(conv1_prim_desc.src_primitive_desc());
        net.push_back(reorder(user_src_memory, conv1_src_memory));
    }

    auto conv1_weights_memory = user_weights_memory;
    if (memory::primitive_desc(conv1_prim_desc.weights_primitive_desc())
            != user_weights_memory.get_primitive_desc()) {
        conv1_weights_memory
                = memory(conv1_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(user_weights_memory, conv1_weights_memory));
    }

    auto conv1_dst_memory = memory(conv1_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv1_prim_desc, conv1_src_memory,
            conv1_weights_memory, user_bias_memory,
            conv1_dst_memory));

    /* AlexNet: relu1
     * {batch, 96, 55, 55} -> {batch, 96, 55, 55}
     */
    const float negative1_slope = 1.0f;
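    /* Note: a negative slope of 1.0f makes eltwise_relu pass negative values
     * through unchanged (effectively an identity); a conventional ReLU would
     * use a slope of 0.0f. */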

    /* create relu primitive and add it to net */
    auto relu1_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv1_dst_memory.get_primitive_desc().desc(), negative1_slope);
    auto relu1_prim_desc
            = eltwise_forward::primitive_desc(relu1_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu1_prim_desc, conv1_dst_memory, conv1_dst_memory));

    /* AlexNet: lrn1
     * {batch, 96, 55, 55} -> {batch, 96, 55, 55}
     * local size: 5
     * alpha1: 0.0001
     * beta1: 0.75
     */
    const uint32_t local1_size = 5;
    const float alpha1 = 0.0001f;
    const float beta1 = 0.75f;
    const float k1 = 1.0f;
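    /* Across-channel LRN computes, roughly,
     *   dst(c) = src(c) / (k + alpha / local_size * sum_{c'} src(c')^2)^beta,
     * with the sum taken over a window of local_size channels around c. */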

    /* create lrn primitive and add it to net */
    auto lrn1_desc = lrn_forward::desc(prop_kind::forward_inference,
            lrn_across_channels,
            conv1_dst_memory.get_primitive_desc().desc(), local1_size,
            alpha1, beta1, k1);
    auto lrn1_prim_desc
            = lrn_forward::primitive_desc(lrn1_desc, cpu_engine);
    auto lrn1_dst_memory = memory(lrn1_prim_desc.dst_primitive_desc());

    net.push_back(
            lrn_forward(lrn1_prim_desc, conv1_dst_memory, lrn1_dst_memory));

    /* AlexNet: pool1
     * {batch, 96, 55, 55} -> {batch, 96, 27, 27}
     * kernel: {3, 3}
     * strides: {2, 2}
     */

    memory::dims pool1_dst_tz = { batch, 96, 27, 27 };
    memory::dims pool1_kernel = { 3, 3 };
    memory::dims pool1_strides = { 2, 2 };
    memory::dims pool_padding = { 0, 0 };

    auto pool1_dst_md = memory::desc(
            { pool1_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a pooling */
    auto pool1_desc = pooling_forward::desc(prop_kind::forward_inference,
            pooling_max, lrn1_dst_memory.get_primitive_desc().desc(),
            pool1_dst_md, pool1_strides, pool1_kernel, pool_padding,
            pool_padding, padding_kind::zero);
    auto pool1_pd = pooling_forward::primitive_desc(pool1_desc, cpu_engine);
    auto pool1_dst_memory = memory(pool1_pd.dst_primitive_desc());

    /* create pooling primitive and add it to net */
    net.push_back(
            pooling_forward(pool1_pd, lrn1_dst_memory, pool1_dst_memory));

    /* AlexNet: conv2
     * {batch, 96, 27, 27} (x) {2, 128, 48, 5, 5} -> {batch, 256, 27, 27}
     * strides: {1, 1}
     */
    memory::dims conv2_src_tz = { batch, 96, 27, 27 };
    memory::dims conv2_weights_tz = { 2, 128, 48, 5, 5 };
    memory::dims conv2_bias_tz = { 256 };
    memory::dims conv2_dst_tz = { batch, 256, 27, 27 };
    memory::dims conv2_strides = { 1, 1 };
    memory::dims conv2_padding = { 2, 2 };
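    /* conv2 is a grouped convolution with 2 groups, so the weights dims follow
     * the {groups, oc/groups, ic/groups, kh, kw} convention that matches
     * memory::format::goihw: 2 groups of 128 output and 48 input channels. */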

    std::vector<float> conv2_weights(std::accumulate(
            conv2_weights_tz.begin(), conv2_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv2_bias(std::accumulate(conv2_bias_tz.begin(),
            conv2_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto conv2_user_weights_memory
            = memory({ { { conv2_weights_tz }, memory::data_type::f32,
                               memory::format::goihw },
                             cpu_engine },
                    conv2_weights.data());
    auto conv2_user_bias_memory
            = memory({ { { conv2_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    conv2_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv2_src_md = memory::desc(
            { conv2_src_tz }, memory::data_type::f32, memory::format::any);
    auto conv2_bias_md = memory::desc(
            { conv2_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv2_weights_md = memory::desc({ conv2_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto conv2_dst_md = memory::desc(
            { conv2_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a convolution */
    auto conv2_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct, conv2_src_md,
            conv2_weights_md, conv2_bias_md, conv2_dst_md, conv2_strides,
            conv2_padding, conv2_padding, padding_kind::zero);
    auto conv2_prim_desc
            = convolution_forward::primitive_desc(conv2_desc, cpu_engine);

    auto conv2_src_memory = pool1_dst_memory;
    if (memory::primitive_desc(conv2_prim_desc.src_primitive_desc())
            != conv2_src_memory.get_primitive_desc()) {
        conv2_src_memory = memory(conv2_prim_desc.src_primitive_desc());
        net.push_back(reorder(pool1_dst_memory, conv2_src_memory));
    }

    auto conv2_weights_memory = conv2_user_weights_memory;
    if (memory::primitive_desc(conv2_prim_desc.weights_primitive_desc())
            != conv2_user_weights_memory.get_primitive_desc()) {
        conv2_weights_memory
                = memory(conv2_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(conv2_user_weights_memory, conv2_weights_memory));
    }

    auto conv2_dst_memory = memory(conv2_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv2_prim_desc, conv2_src_memory,
            conv2_weights_memory, conv2_user_bias_memory,
            conv2_dst_memory));

    /* AlexNet: relu2
     * {batch, 256, 27, 27} -> {batch, 256, 27, 27}
     */
    const float negative2_slope = 1.0f;

    /* create relu primitive and add it to net */
    auto relu2_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv2_dst_memory.get_primitive_desc().desc(), negative2_slope);
    auto relu2_prim_desc
            = eltwise_forward::primitive_desc(relu2_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu2_prim_desc, conv2_dst_memory, conv2_dst_memory));

    /* AlexNet: lrn2
     * {batch, 256, 27, 27} -> {batch, 256, 27, 27}
     * local size: 5
     * alpha2: 0.0001
     * beta2: 0.75
     */
    const uint32_t local2_size = 5;
    const float alpha2 = 0.0001f;
    const float beta2 = 0.75f;
    const float k2 = 1.0f;

    /* create lrn primitive and add it to net */
    auto lrn2_desc = lrn_forward::desc(prop_kind::forward_inference,
            lrn_across_channels,
            conv2_prim_desc.dst_primitive_desc().desc(), local2_size,
            alpha2, beta2, k2);
    auto lrn2_prim_desc
            = lrn_forward::primitive_desc(lrn2_desc, cpu_engine);
    auto lrn2_dst_memory = memory(lrn2_prim_desc.dst_primitive_desc());

    net.push_back(
            lrn_forward(lrn2_prim_desc, conv2_dst_memory, lrn2_dst_memory));

    /* AlexNet: pool2
     * {batch, 256, 27, 27} -> {batch, 256, 13, 13}
     * kernel: {3, 3}
     * strides: {2, 2}
     */

    memory::dims pool2_dst_tz = { batch, 256, 13, 13 };
    memory::dims pool2_kernel = { 3, 3 };
    memory::dims pool2_strides = { 2, 2 };
    memory::dims pool2_padding = { 0, 0 };

    auto pool2_dst_md = memory::desc(
            { pool2_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a pooling */
    auto pool2_desc = pooling_forward::desc(prop_kind::forward_inference,
            pooling_max, lrn2_dst_memory.get_primitive_desc().desc(),
            pool2_dst_md, pool2_strides, pool2_kernel, pool2_padding,
            pool2_padding, padding_kind::zero);
    auto pool2_pd = pooling_forward::primitive_desc(pool2_desc, cpu_engine);

    auto pool2_dst_memory = memory(pool2_pd.dst_primitive_desc());

    /* create pooling primitive and add it to net */
    net.push_back(
            pooling_forward(pool2_pd, lrn2_dst_memory, pool2_dst_memory));

    // -------
    /* AlexNet: conv3
     * {batch, 256, 13, 13} (x) {384, 256, 3, 3} -> {batch, 384, 13, 13}
     * strides: {1, 1}
     */
    memory::dims conv3_src_tz = { batch, 256, 13, 13 };
    memory::dims conv3_weights_tz = { 384, 256, 3, 3 };
    memory::dims conv3_bias_tz = { 384 };
    memory::dims conv3_dst_tz = { batch, 384, 13, 13 };
    memory::dims conv3_strides = { 1, 1 };
    memory::dims conv3_padding = { 1, 1 };

    std::vector<float> conv3_weights(std::accumulate(
            conv3_weights_tz.begin(), conv3_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv3_bias(std::accumulate(conv3_bias_tz.begin(),
            conv3_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto conv3_user_weights_memory
            = memory({ { { conv3_weights_tz }, memory::data_type::f32,
                               memory::format::oihw },
                             cpu_engine },
                    conv3_weights.data());
    auto conv3_user_bias_memory
            = memory({ { { conv3_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    conv3_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv3_src_md = memory::desc(
            { conv3_src_tz }, memory::data_type::f32, memory::format::any);
    auto conv3_bias_md = memory::desc(
            { conv3_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv3_weights_md = memory::desc({ conv3_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto conv3_dst_md = memory::desc(
            { conv3_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a convolution */
    auto conv3_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct, conv3_src_md,
            conv3_weights_md, conv3_bias_md, conv3_dst_md, conv3_strides,
            conv3_padding, conv3_padding, padding_kind::zero);
    auto conv3_prim_desc
            = convolution_forward::primitive_desc(conv3_desc, cpu_engine);

    auto conv3_src_memory = pool2_dst_memory;
    if (memory::primitive_desc(conv3_prim_desc.src_primitive_desc())
            != conv3_src_memory.get_primitive_desc()) {
        conv3_src_memory = memory(conv3_prim_desc.src_primitive_desc());
        net.push_back(reorder(pool2_dst_memory, conv3_src_memory));
    }

    auto conv3_weights_memory = conv3_user_weights_memory;
    if (memory::primitive_desc(conv3_prim_desc.weights_primitive_desc())
            != conv3_user_weights_memory.get_primitive_desc()) {
        conv3_weights_memory
                = memory(conv3_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(conv3_user_weights_memory, conv3_weights_memory));
    }

    auto conv3_dst_memory = memory(conv3_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv3_prim_desc, conv3_src_memory,
            conv3_weights_memory, conv3_user_bias_memory,
            conv3_dst_memory));

    /* AlexNet: relu3
     * {batch, 384, 13, 13} -> {batch, 384, 13, 13}
     */
    const float negative3_slope = 1.0f;

    /* create relu primitive and add it to net */
    auto relu3_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv3_dst_memory.get_primitive_desc().desc(), negative3_slope);
    auto relu3_prim_desc
            = eltwise_forward::primitive_desc(relu3_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu3_prim_desc, conv3_dst_memory, conv3_dst_memory));

    /* AlexNet: conv4
     * {batch, 384, 13, 13} (x) {2, 192, 192, 3, 3} -> {batch, 384, 13, 13}
     * strides: {1, 1}
     */
    memory::dims conv4_src_tz = { batch, 384, 13, 13 };
    memory::dims conv4_weights_tz = { 2, 192, 192, 3, 3 };
    memory::dims conv4_bias_tz = { 384 };
    memory::dims conv4_dst_tz = { batch, 384, 13, 13 };
    memory::dims conv4_strides = { 1, 1 };
    memory::dims conv4_padding = { 1, 1 };

    std::vector<float> conv4_weights(std::accumulate(
            conv4_weights_tz.begin(), conv4_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv4_bias(std::accumulate(conv4_bias_tz.begin(),
            conv4_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto conv4_user_weights_memory
            = memory({ { { conv4_weights_tz }, memory::data_type::f32,
                               memory::format::goihw },
                             cpu_engine },
                    conv4_weights.data());
    auto conv4_user_bias_memory
            = memory({ { { conv4_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    conv4_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv4_src_md = memory::desc(
            { conv4_src_tz }, memory::data_type::f32, memory::format::any);
    auto conv4_bias_md = memory::desc(
            { conv4_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv4_weights_md = memory::desc({ conv4_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto conv4_dst_md = memory::desc(
            { conv4_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a convolution */
    auto conv4_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct, conv4_src_md,
            conv4_weights_md, conv4_bias_md, conv4_dst_md, conv4_strides,
            conv4_padding, conv4_padding, padding_kind::zero);
    auto conv4_prim_desc
            = convolution_forward::primitive_desc(conv4_desc, cpu_engine);

    auto conv4_src_memory = conv3_dst_memory;
    if (memory::primitive_desc(conv4_prim_desc.src_primitive_desc())
            != conv4_src_memory.get_primitive_desc()) {
        conv4_src_memory = memory(conv4_prim_desc.src_primitive_desc());
        net.push_back(reorder(conv3_dst_memory, conv4_src_memory));
    }

    auto conv4_weights_memory = conv4_user_weights_memory;
    if (memory::primitive_desc(conv4_prim_desc.weights_primitive_desc())
            != conv4_user_weights_memory.get_primitive_desc()) {
        conv4_weights_memory
                = memory(conv4_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(conv4_user_weights_memory, conv4_weights_memory));
    }

    auto conv4_dst_memory = memory(conv4_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv4_prim_desc, conv4_src_memory,
            conv4_weights_memory, conv4_user_bias_memory,
            conv4_dst_memory));

    /* AlexNet: relu4
     * {batch, 384, 13, 13} -> {batch, 384, 13, 13}
     */
    const float negative4_slope = 1.0f;

    /* create relu primitive and add it to net */
    auto relu4_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv4_dst_memory.get_primitive_desc().desc(), negative4_slope);
    auto relu4_prim_desc
            = eltwise_forward::primitive_desc(relu4_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu4_prim_desc, conv4_dst_memory, conv4_dst_memory));

    /* AlexNet: conv5
     * {batch, 384, 13, 13} (x) {2, 128, 192, 3, 3} -> {batch, 256, 13, 13}
     * strides: {1, 1}
     */
    memory::dims conv5_weights_tz = { 2, 128, 192, 3, 3 };
    memory::dims conv5_bias_tz = { 256 };
    memory::dims conv5_dst_tz = { batch, 256, 13, 13 };
    memory::dims conv5_strides = { 1, 1 };
    memory::dims conv5_padding = { 1, 1 };

    std::vector<float> conv5_weights(std::accumulate(
            conv5_weights_tz.begin(), conv5_weights_tz.end(), 1,
            std::multiplies<uint32_t>()));
    std::vector<float> conv5_bias(std::accumulate(conv5_bias_tz.begin(),
            conv5_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto conv5_user_weights_memory
            = memory({ { { conv5_weights_tz }, memory::data_type::f32,
                               memory::format::goihw },
                             cpu_engine },
                    conv5_weights.data());
    auto conv5_user_bias_memory
            = memory({ { { conv5_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    conv5_bias.data());

    /* create memory descriptors for convolution data w/ no specified format
     */
    auto conv5_bias_md = memory::desc(
            { conv5_bias_tz }, memory::data_type::f32, memory::format::any);
    auto conv5_weights_md = memory::desc({ conv5_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto conv5_dst_md = memory::desc(
            { conv5_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a convolution */
    auto conv5_desc = convolution_forward::desc(
            prop_kind::forward_inference, convolution_direct,
            conv4_dst_memory.get_primitive_desc().desc(), conv5_weights_md,
            conv5_bias_md, conv5_dst_md, conv5_strides, conv5_padding,
            conv5_padding, padding_kind::zero);
    auto conv5_prim_desc
            = convolution_forward::primitive_desc(conv5_desc, cpu_engine);

    auto conv5_src_memory = conv4_dst_memory;
    if (memory::primitive_desc(conv5_prim_desc.src_primitive_desc())
            != conv5_src_memory.get_primitive_desc()) {
        conv5_src_memory = memory(conv5_prim_desc.src_primitive_desc());
        net.push_back(reorder(conv4_dst_memory, conv5_src_memory));
    }

    auto conv5_weights_memory = conv5_user_weights_memory;
    if (memory::primitive_desc(conv5_prim_desc.weights_primitive_desc())
            != conv5_user_weights_memory.get_primitive_desc()) {
        conv5_weights_memory
                = memory(conv5_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(conv5_user_weights_memory, conv5_weights_memory));
    }

    auto conv5_dst_memory = memory(conv5_prim_desc.dst_primitive_desc());

    /* create convolution primitive and add it to net */
    net.push_back(convolution_forward(conv5_prim_desc, conv5_src_memory,
            conv5_weights_memory, conv5_user_bias_memory,
            conv5_dst_memory));

    /* AlexNet: relu5
     * {batch, 256, 13, 13} -> {batch, 256, 13, 13}
     */
    const float negative5_slope = 1.0f;

    /* create relu primitive and add it to net */
    auto relu5_desc = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu,
            conv5_dst_memory.get_primitive_desc().desc(), negative5_slope);
    auto relu5_prim_desc
            = eltwise_forward::primitive_desc(relu5_desc, cpu_engine);

    net.push_back(eltwise_forward(
            relu5_prim_desc, conv5_dst_memory, conv5_dst_memory));

    /* AlexNet: pool5
     * {batch, 256, 13, 13} -> {batch, 256, 6, 6}
     * kernel: {3, 3}
     * strides: {2, 2}
     */

    memory::dims pool5_dst_tz = { batch, 256, 6, 6 };
    memory::dims pool5_kernel = { 3, 3 };
    memory::dims pool5_strides = { 2, 2 };
    memory::dims pool5_padding = { 0, 0 };

    std::vector<float> pool5_dst(std::accumulate(pool5_dst_tz.begin(),
            pool5_dst_tz.end(), 1, std::multiplies<uint32_t>()));

    auto pool5_dst_md = memory::desc(
            { pool5_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create a pooling */
    auto pool5_desc = pooling_forward::desc(prop_kind::forward_inference,
            pooling_max, conv5_dst_memory.get_primitive_desc().desc(),
            pool5_dst_md, pool5_strides, pool5_kernel, pool5_padding,
            pool5_padding, padding_kind::zero);
    auto pool5_pd = pooling_forward::primitive_desc(pool5_desc, cpu_engine);

    auto pool5_dst_memory = memory(pool5_pd.dst_primitive_desc());

    /* create pooling primitive and add it to net */
    net.push_back(
            pooling_forward(pool5_pd, conv5_dst_memory, pool5_dst_memory));

    /**
     * fc6 inner product {batch, 256, 6, 6} (x) {4096, 256, 6, 6} ->
     * {batch, 4096}
     */
    memory::dims fc6_src_tz = { batch, 256, 6, 6 };
    memory::dims fc6_weights_tz = { 4096, 256, 6, 6 };
    memory::dims fc6_bias_tz = { 4096 };
    memory::dims fc6_dst_tz = { batch, 4096 };
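    /* The inner product treats the 4D {batch, 256, 6, 6} source as a
     * flattened {batch, 256 * 6 * 6} activation, so the weights keep the
     * matching {4096, 256, 6, 6} shape. */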

    std::vector<float> fc6_weights(std::accumulate(fc6_weights_tz.begin(),
            fc6_weights_tz.end(), 1, std::multiplies<uint32_t>()));
    std::vector<float> fc6_bias(std::accumulate(fc6_bias_tz.begin(),
            fc6_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto fc6_user_weights_memory
            = memory({ { { fc6_weights_tz }, memory::data_type::f32,
                               memory::format::oihw },
                             cpu_engine },
                    fc6_weights.data());

    auto fc6_user_bias_memory
            = memory({ { { fc6_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    fc6_bias.data());

    /* create memory descriptors for inner product data w/ no specified
     * format */
    auto fc6_src_md = memory::desc(
            { fc6_src_tz }, memory::data_type::f32, memory::format::any);
    auto fc6_bias_md = memory::desc(
            { fc6_bias_tz }, memory::data_type::f32, memory::format::any);
    auto fc6_weights_md = memory::desc({ fc6_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto fc6_dst_md = memory::desc(
            { fc6_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create an inner product */
    auto fc6_desc
            = inner_product_forward::desc(prop_kind::forward_inference,
                    fc6_src_md, fc6_weights_md, fc6_bias_md, fc6_dst_md);
    auto fc6_prim_desc
            = inner_product_forward::primitive_desc(fc6_desc, cpu_engine);

    auto fc6_src_memory = pool5_dst_memory;
    if (memory::primitive_desc(fc6_prim_desc.src_primitive_desc())
            != fc6_src_memory.get_primitive_desc()) {
        fc6_src_memory = memory(fc6_prim_desc.src_primitive_desc());
        net.push_back(reorder(pool5_dst_memory, fc6_src_memory));
    }

    auto fc6_weights_memory = fc6_user_weights_memory;
    if (memory::primitive_desc(fc6_prim_desc.weights_primitive_desc())
            != fc6_user_weights_memory.get_primitive_desc()) {
        fc6_weights_memory = memory(fc6_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(fc6_user_weights_memory, fc6_weights_memory));
    }

    auto fc6_dst_memory = memory(fc6_prim_desc.dst_primitive_desc());

    /* create inner product primitive and add it to net */
    net.push_back(inner_product_forward(fc6_prim_desc, fc6_src_memory,
            fc6_weights_memory, fc6_user_bias_memory, fc6_dst_memory));

    /**
     * fc7 inner product {batch, 4096} (x) {4096, 4096} -> {batch, 4096}
     */
    memory::dims fc7_weights_tz = { 4096, 4096 };
    memory::dims fc7_bias_tz = { 4096 };
    memory::dims fc7_dst_tz = { batch, 4096 };

    std::vector<float> fc7_weights(std::accumulate(fc7_weights_tz.begin(),
            fc7_weights_tz.end(), 1, std::multiplies<uint32_t>()));
    std::vector<float> fc7_bias(std::accumulate(fc7_bias_tz.begin(),
            fc7_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto fc7_user_weights_memory
            = memory({ { { fc7_weights_tz }, memory::data_type::f32,
                               memory::format::nc },
                             cpu_engine },
                    fc7_weights.data());

    auto fc7_user_bias_memory
            = memory({ { { fc7_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    fc7_bias.data());

    /* create memory descriptors for inner product data w/ no specified
     * format */
    auto fc7_bias_md = memory::desc(
            { fc7_bias_tz }, memory::data_type::f32, memory::format::any);
    auto fc7_weights_md = memory::desc({ fc7_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto fc7_dst_md = memory::desc(
            { fc7_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create an inner product */
    auto fc7_desc
            = inner_product_forward::desc(prop_kind::forward_inference,
                    fc6_dst_memory.get_primitive_desc().desc(),
                    fc7_weights_md, fc7_bias_md, fc7_dst_md);
    auto fc7_prim_desc
            = inner_product_forward::primitive_desc(fc7_desc, cpu_engine);

    /* weights reorders belong in net_weights so they run only once */
    auto fc7_weights_memory = fc7_user_weights_memory;
    if (memory::primitive_desc(fc7_prim_desc.weights_primitive_desc())
            != fc7_user_weights_memory.get_primitive_desc()) {
        fc7_weights_memory = memory(fc7_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(fc7_user_weights_memory, fc7_weights_memory));
    }

    auto fc7_dst_memory = memory(fc7_prim_desc.dst_primitive_desc());

    /* create inner product primitive and add it to net */
    net.push_back(inner_product_forward(fc7_prim_desc, fc6_dst_memory,
            fc7_weights_memory, fc7_user_bias_memory, fc7_dst_memory));

    /**
     * fc8 inner product {batch, 4096} (x) {1000, 4096} -> {batch, 1000}
     */
    memory::dims fc8_weights_tz = { 1000, 4096 };
    memory::dims fc8_bias_tz = { 1000 };
    memory::dims fc8_dst_tz = { batch, 1000 };

    std::vector<float> fc8_weights(std::accumulate(fc8_weights_tz.begin(),
            fc8_weights_tz.end(), 1, std::multiplies<uint32_t>()));
    std::vector<float> fc8_bias(std::accumulate(fc8_bias_tz.begin(),
            fc8_bias_tz.end(), 1, std::multiplies<uint32_t>()));

    /* create memory for user data */
    auto fc8_user_weights_memory
            = memory({ { { fc8_weights_tz }, memory::data_type::f32,
                               memory::format::nc },
                             cpu_engine },
                    fc8_weights.data());

    auto fc8_user_bias_memory
            = memory({ { { fc8_bias_tz }, memory::data_type::f32,
                               memory::format::x },
                             cpu_engine },
                    fc8_bias.data());

    auto user_dst_memory = memory({ { { fc8_dst_tz }, memory::data_type::f32,
                                           memory::format::nc },
                                         cpu_engine },
            user_dst.data());

    /* create memory descriptors for inner product data w/ no specified
     * format */
    auto fc8_bias_md = memory::desc(
            { fc8_bias_tz }, memory::data_type::f32, memory::format::any);
    auto fc8_weights_md = memory::desc({ fc8_weights_tz },
            memory::data_type::f32, memory::format::any);
    auto fc8_dst_md = memory::desc(
            { fc8_dst_tz }, memory::data_type::f32, memory::format::any);

    /* create an inner product */
    auto fc8_desc
            = inner_product_forward::desc(prop_kind::forward_inference,
                    fc7_dst_memory.get_primitive_desc().desc(),
                    fc8_weights_md, fc8_bias_md, fc8_dst_md);
    auto fc8_prim_desc
            = inner_product_forward::primitive_desc(fc8_desc, cpu_engine);

    auto fc8_weights_memory = fc8_user_weights_memory;
    if (memory::primitive_desc(fc8_prim_desc.weights_primitive_desc())
            != fc8_user_weights_memory.get_primitive_desc()) {
        fc8_weights_memory = memory(fc8_prim_desc.weights_primitive_desc());
        net_weights.push_back(
                reorder(fc8_user_weights_memory, fc8_weights_memory));
    }

    auto fc8_dst_memory = memory(fc8_prim_desc.dst_primitive_desc());

    /* create inner product primitive and add it to net */
    net.push_back(inner_product_forward(fc8_prim_desc, fc7_dst_memory,
            fc8_weights_memory, fc8_user_bias_memory, fc8_dst_memory));

    /* create reorder between internal and user data if it is needed and
     * add it to net after the last inner product */
    if (fc8_dst_memory != user_dst_memory) {
        net.push_back(reorder(fc8_dst_memory, user_dst_memory));
    }

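    /* Execute the weight reorders once, then run the main network `times`
     * times. An eager stream executes the submitted primitives immediately,
     * and wait() blocks until they have finished. */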
    stream(stream::kind::eager).submit(net_weights).wait();
    for (int j = 0; j < times; ++j) {
        stream(stream::kind::eager).submit(net).wait();
    }
}

int main(int argc, char **argv) {
    try {
        auto begin = chrono::duration_cast<chrono::milliseconds>(
                             chrono::steady_clock::now().time_since_epoch())
                             .count();
        int times = 1000;
        simple_net(times);
        auto end = chrono::duration_cast<chrono::milliseconds>(
                           chrono::steady_clock::now().time_since_epoch())
                           .count();
        cout << "Average time per iteration: "
             << (end - begin) / (times + 0.0) << " ms\n";
    } catch (error &e) {
        std::cerr << "status: " << e.status << std::endl;
        std::cerr << "message: " << e.message << std::endl;
    }
    return 0;
}