1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
5 #include "cpu_detector.hpp"
6 #include "blob_transform.hpp"
8 #include "blob_transform_sse42.hpp"
14 //----------------------------------------------------------------------
16 namespace InferenceEngine {
18 template <InferenceEngine::Precision::ePrecision PRC>
19 static void blob_copy_4d_t(Blob::Ptr src, Blob::Ptr dst) {
20 using data_t = typename InferenceEngine::PrecisionTrait<PRC>::value_type;
22 auto *src_ptr = src->buffer().as<data_t*>();
23 auto *dst_ptr = dst->buffer().as<data_t*>();
25 SizeVector dims = src->getTensorDesc().getDims();
32 const Layout src_l = src->layout();
33 const auto &src_blk_dsc = src->getTensorDesc().getBlockingDesc();
34 const auto &src_strides = src_blk_dsc.getStrides();
35 const auto N_src_stride = src_strides[0];
36 const auto C_src_stride = src_l == NHWC ? src_strides[3] : src_strides[1];
37 const auto H_src_stride = src_l == NHWC ? src_strides[1] : src_strides[2];
38 const auto W_src_stride = src_l == NHWC ? src_strides[2] : src_strides[3];
39 src_ptr += src_blk_dsc.getOffsetPadding();
41 const Layout dst_l = dst->layout();
42 const auto &dst_blk_desc = dst->getTensorDesc().getBlockingDesc();
43 const auto &dst_strides = dst_blk_desc.getStrides();
44 const auto N_dst_stride = dst_strides[0];
45 const auto C_dst_stride = dst_l == NHWC ? dst_strides[3] : dst_strides[1];
46 const auto H_dst_stride = dst_l == NHWC ? dst_strides[1] : dst_strides[2];
47 const auto W_dst_stride = dst_l == NHWC ? dst_strides[2] : dst_strides[3];
49 src_ptr += dst_blk_desc.getOffsetPadding();
52 if (src->layout() == NHWC && dst->layout() == NCHW && C == 3
53 && C_src_stride == 1 && W_src_stride == 3 && W_dst_stride == 1 &&
54 with_cpu_x86_sse42()) {
55 if (PRC == Precision::U8) {
56 blob_copy_4d_split_u8c3(reinterpret_cast<const uint8_t*>(src_ptr),
57 reinterpret_cast< uint8_t*>(dst_ptr),
58 N_src_stride, H_src_stride,
59 N_dst_stride, H_dst_stride, C_dst_stride,
60 static_cast<int>(N), static_cast<int>(H),
65 if (PRC == Precision::FP32) {
66 blob_copy_4d_split_f32c3(reinterpret_cast<const float*>(src_ptr),
67 reinterpret_cast< float*>(dst_ptr),
68 N_src_stride, H_src_stride,
69 N_dst_stride, H_dst_stride, C_dst_stride,
70 static_cast<int>(N), static_cast<int>(H),
76 if (src->layout() == NCHW && dst->layout() == NHWC && C == 3 &&
77 C_dst_stride == 1 && W_dst_stride == 3 && W_src_stride == 1 &&
78 with_cpu_x86_sse42()) {
79 if (PRC == Precision::U8) {
80 blob_copy_4d_merge_u8c3(reinterpret_cast<const uint8_t*>(src_ptr),
81 reinterpret_cast< uint8_t*>(dst_ptr),
82 N_src_stride, H_src_stride, C_src_stride,
83 N_dst_stride, H_dst_stride,
84 static_cast<int>(N), static_cast<int>(H),
89 if (PRC == Precision::FP32) {
90 blob_copy_4d_merge_f32c3(reinterpret_cast<const float*>(src_ptr),
91 reinterpret_cast< float*>(dst_ptr),
92 N_src_stride, H_src_stride, C_src_stride,
93 N_dst_stride, H_dst_stride,
94 static_cast<int>(N), static_cast<int>(H),
101 if (src->layout() == NHWC && dst->layout() == NCHW) {
102 for (int n = 0; n < N; n++) {
103 for (int c = 0; c < C; c++) {
104 data_t *dst_ptr_l = dst_ptr + n * N_dst_stride + c * C_dst_stride;
105 data_t *src_ptr_l = src_ptr + n * N_src_stride + c * C_src_stride;
106 for (int h = 0; h < H; h++) {
107 data_t *src_ptr_l_l = src_ptr_l + h*H_src_stride;
108 for (int w = 0; w < W; w++) {
109 *dst_ptr_l = *src_ptr_l_l;
110 src_ptr_l_l += W_src_stride;
116 } else if (src->layout() == NCHW && dst->layout() == NHWC) {
117 for (int n = 0; n < N; n++) {
118 for (int c = 0; c < C; c++) {
119 data_t *src_ptr_l = src_ptr + n * N_src_stride + c * C_src_stride;
120 data_t *dst_ptr_l = dst_ptr + n * N_dst_stride + c;
121 for (int h = 0; h < H; h++) {
122 data_t *src_ptr_l_l = src_ptr_l + h*H_src_stride;
123 for (int w = 0; w < W; w++) {
124 *dst_ptr_l = *src_ptr_l_l;
125 dst_ptr_l += W_dst_stride;
132 for (int i = 0; i < N*C*H*W; i++) {
133 dst_ptr[i] = src_ptr[i];
138 static inline void blob_copy_4d(Blob::Ptr src, Blob::Ptr dst) {
139 switch (src->precision()) {
140 case Precision::FP32:
142 blob_copy_4d_t<Precision::FP32>(src, dst);
145 case Precision::FP16:
148 blob_copy_4d_t<Precision::U16>(src, dst);
153 blob_copy_4d_t<Precision::U8>(src, dst);
157 THROW_IE_EXCEPTION << "Unsupported blob transformation for precision " << src->precision();
161 void blob_copy(Blob::Ptr src, Blob::Ptr dst) {
162 if (src->buffer() == nullptr)
163 THROW_IE_EXCEPTION << "Cannot copy blob data. Source is not allocated.";
165 if (dst->buffer() == nullptr)
166 THROW_IE_EXCEPTION << "Cannot copy blob data. Destination is not allocated.";
168 if (src->precision() != dst->precision())
169 THROW_IE_EXCEPTION << "Unimplemented blob transformation from precision "
170 << src->precision() << " to " << src->precision();
172 if (src->dims() != dst->dims())
173 THROW_IE_EXCEPTION << "Unimplemented blob transformation from different shapes ";
175 if (src->dims().size() == 4)
176 blob_copy_4d(src, dst);
178 THROW_IE_EXCEPTION << "Unimplemented blob transformation. Only 4d supported.";
181 } // namespace InferenceEngine