1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
20 #include "c_types_map.hpp"
21 #include "mkldnn_thread.hpp"
22 #include "type_helpers.hpp"
23 #include "format_traits.hpp"
25 #include "ref_shuffle.hpp"
31 using namespace memory_format;
/*
 * Reference shuffle execution, specialized at compile time for memory
 * format `fmt`.
 *
 * Copies elements from input_memory(0) to memory(0); the index along the
 * shuffle axis is replaced on the input side by its rev_transposed_[] entry,
 * i.e. output[.., a, ..] = input[.., rev_transposed_[a], ..].
 *
 * Fast paths exist only for axis == 1 and are selected by `fmt`:
 *   - channel-blocked formats (nChw16c, nChw8c, nChw4c, nCdhw16c, nCdhw8c, ...)
 *   - channels-last formats (nhwc, ndhwc)
 *   - plain formats (nchw, ncdhw)
 * The last section addresses elements through logical offsets translated by
 * data_d.off_l().
 *
 * Template parameters:
 *   data_type_size - size parameter of ref_shuffle_t (this file instantiates
 *                    it with 4 and 1).
 *   fmt            - memory format; picks the branch and the block size.
 *
 * NOTE(review): in this listing some continuation lines, closing braces and
 * preprocessor closers are not visible. Comments worded with "presumably"
 * describe things that cannot be confirmed from the visible lines.
 */
33 template <int data_type_size>
34 template <mkldnn_memory_format_t fmt>
35 void ref_shuffle_t<data_type_size>::execute_() const {
36 using namespace prop_kind;
37 using namespace utils;
// Wrapper over the data memory descriptor; used for ndims, strides and off_l().
39 const memory_desc_wrapper data_d(pd()->data_pd());
// Source and destination buffers viewed as arrays of data_t.
41 auto input = reinterpret_cast<const data_t*>(this->input_memory(0));
42 auto output = reinterpret_cast<data_t*>(this->memory(0));
// Shuffle axis and the extent along it, as reported by the primitive descriptor.
44 const int axis = pd()->axis();
45 const int axis_size = pd()->axis_size();
47 const int MB = pd()->MB();
48 const int C = pd()->C();
// Spatial extents default to 1. SP is the per-channel spatial count used by
// every offset below. NOTE(review): the code that updates these defaults when
// has_spatial is true is not visible here -- presumably SP = D * H * W; confirm.
49 int H = 1, W = 1, D = 1, HW = 1, SP = 1;
50 const bool has_spatial = utils::one_of(data_d.ndims(), 3, 4, 5);
// Stride of dimension 0; multiplied by the minibatch index mb in all fast paths.
59 const size_t stride_mb = data_d.blocking_desc().strides[0][0];
// Compile-time channel block size of the selected format.
60 constexpr int blksize = format_traits<fmt>::blk_size;
// --- Fast path 1: channel-blocked formats, shuffle over channels ---
62 if (axis == 1 && one_of(fmt, nChw16c, nChw8c, nChw4c, nCdhw16c, nCdhw8c,
// OpenMP build: the three loops (mb, channel block, spatial point) are
// collapsed into one statically scheduled parallel loop.
64 #if MKLDNN_THR == MKLDNN_THR_OMP
65 # pragma omp parallel for collapse(3) schedule(static)
66 for (int mb = 0; mb < MB; ++mb)
67 for (int cb = 0; cb < C; cb += blksize)
68 for (int sp = 0; sp < SP; ++sp) {
// off: start of spatial point sp inside a channel block of image mb;
// output_off adds the offset of the channel block that starts at channel cb.
69 const size_t off = mb * stride_mb + sp * blksize;
70 const size_t output_off = off + cb * SP;
// min(blksize, C - cb) limits the loop to real channels when the last
// block is only partially filled.
72 for (int cc = 0; cc < nstl::min(blksize, C - cb); ++cc)
// input_c is the source channel for output channel cb + cc. Its block is
// located via input_c / blksize; the continuation of the next expression is
// not visible -- presumably it adds the in-block position input_c % blksize.
74 int input_c = rev_transposed_[cb + cc];
75 const size_t input_off = off + input_c / blksize * SP * blksize
77 output[output_off + cc] = input[input_off];
// Same computation expressed with parallel_nd over (mb, channel block, sp);
// presumably the non-OpenMP alternative of the #if above (#else not visible).
81 parallel_nd(MB, utils::div_up(C, blksize), SP, [&](int mb, int c,
83 const size_t off = mb * stride_mb + sp * blksize;
// c counts channel blocks here; cb is the first channel of that block.
84 const int cb = c * blksize;
85 const size_t output_off = off + cb * SP;
86 for (int cc = 0; cc < nstl::min(blksize, C - cb); ++cc)
88 int input_c = rev_transposed_[cb + cc];
89 const size_t input_off = off + input_c / blksize * SP * blksize
91 output[output_off + cc] = input[input_off];
// --- Fast path 2: channels-last formats ---
// All C channels of one spatial point are adjacent, so each (mb, sp) pair
// permutes a run of C elements starting at the same offset in both buffers.
95 } else if (axis == 1 && one_of(fmt, nhwc, ndhwc)) {
96 parallel_nd(MB, SP, [&](int mb, int sp) {
97 const size_t off = mb * stride_mb + sp * C;
99 for (int c = 0; c < C; ++c)
100 output[off + c] = input[off + rev_transposed_[c]];
// --- Fast path 3: plain formats ---
// Each channel is a run of SP elements; output channel c receives the whole
// run of input channel rev_transposed_[c].
102 } else if (axis == 1 && one_of(fmt, nchw, ncdhw)) {
103 parallel_nd(MB, C, [&](int mb, int c) {
104 const size_t output_off = mb * stride_mb + c * SP;
105 const size_t input_off = mb * stride_mb + rev_transposed_[c] * SP;
107 for (int sp = 0; sp < SP; ++sp) {
108 output[output_off + sp] = input[input_off + sp];
// --- Generic path (presumably the final else; the branch line is not visible) ---
// The tensor is viewed as outer_size x axis_size x inner_size around the
// shuffle axis. Presumably outer_size is the product of the dims before the
// axis and inner_size the product of the dims after it -- the second argument
// of the inner_size call is not visible; confirm.
112 auto dims = pd()->desc()->data_desc.dims;
113 auto ndims = pd()->desc()->data_desc.ndims;
114 const size_t outer_size = utils::array_product(dims, axis);
115 const size_t inner_size = utils::array_product(dims + axis + 1,
// Number of logical elements covered by one outer index.
117 const size_t dim = axis_size * inner_size;
119 parallel_nd(outer_size, axis_size, inner_size, [&](size_t ou, int a,
// off is the logical offset of (ou, axis index 0, in). Adding a * inner_size
// selects the position along the axis, and off_l() turns each logical offset
// into an index into the buffer.
122 const size_t off = ou * dim + in;
123 auto &o = output[data_d.off_l(off + a * inner_size)];
124 o = input[data_d.off_l(off + rev_transposed_[a] * inner_size)];
// Explicit instantiations of execute_() for ref_shuffle_t<4>, one per memory
// format. `any` does not appear in the format checks visible in execute_(),
// so it presumably always takes the generic off_l() path -- confirm.
129 template void ref_shuffle_t<4>::execute_<nCdhw16c>() const;
130 template void ref_shuffle_t<4>::execute_<nChw16c>() const;
131 template void ref_shuffle_t<4>::execute_<nCdhw8c>() const;
132 template void ref_shuffle_t<4>::execute_<nChw8c>() const;
133 template void ref_shuffle_t<4>::execute_<nCdhw4c>() const;
134 template void ref_shuffle_t<4>::execute_<nChw4c>() const;
135 template void ref_shuffle_t<4>::execute_<ncdhw>() const;
136 template void ref_shuffle_t<4>::execute_<nchw>() const;
137 template void ref_shuffle_t<4>::execute_<ndhwc>() const;
138 template void ref_shuffle_t<4>::execute_<nhwc>() const;
139 template void ref_shuffle_t<4>::execute_<any>() const;
// The same set of formats for ref_shuffle_t<1>.
141 template void ref_shuffle_t<1>::execute_<nCdhw16c>() const;
142 template void ref_shuffle_t<1>::execute_<nChw16c>() const;
143 template void ref_shuffle_t<1>::execute_<nCdhw8c>() const;
144 template void ref_shuffle_t<1>::execute_<nChw8c>() const;
145 template void ref_shuffle_t<1>::execute_<nCdhw4c>() const;
146 template void ref_shuffle_t<1>::execute_<nChw4c>() const;
147 template void ref_shuffle_t<1>::execute_<ncdhw>() const;
148 template void ref_shuffle_t<1>::execute_<nchw>() const;
149 template void ref_shuffle_t<1>::execute_<ndhwc>() const;
150 template void ref_shuffle_t<1>::execute_<nhwc>() const;
151 template void ref_shuffle_t<1>::execute_<any>() const;
157 // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s