/*
 * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 #ifndef LUCI_INTERPRETER_PAL_ADD_COMMON_H
19 #define LUCI_INTERPRETER_PAL_ADD_COMMON_H
23 #include "ProcessBroadcastShapes.h"
25 namespace luci_interpreter_pal
28 // TODO: check if there real activation value
30 inline void Add(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data,
31 const T *input2_data, T *output_data)
33 T activation_min, activation_max;
34 getActivationParams(params, &activation_min, &activation_max);
36 for (int i = 0; i < flat_size; ++i)
38 std::min(std::max(input1_data[i] + input2_data[i], activation_min), activation_max);
43 BroadcastAdd4DSlow(const ArithmeticParams ¶ms,
44 const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
45 const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
46 const luci_interpreter::RuntimeShape &output_shape, T *output_data)
50 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
51 const luci_interpreter::RuntimeShape extended_output_shape =
52 luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
54 T activation_min, activation_max;
55 getActivationParams(params, &activation_min, &activation_max);
57 // In Tensorflow, the dimensions are canonically named (batch_number, row,
58 // col, channel), with extents (batches, height, width, depth), with the
59 // trailing dimension changing most rapidly (channels has the smallest stride,
60 // typically 1 element).
62 // In generated C code, we store arrays with the dimensions reversed. The
63 // first dimension has smallest stride.
65 // We name our variables by their Tensorflow convention, but generate C code
66 // nesting loops such that the innermost loop has the smallest stride for the
67 // best cache behavior.
68 for (int b = 0; b < extended_output_shape.dims(0); ++b)
70 for (int y = 0; y < extended_output_shape.dims(1); ++y)
72 for (int x = 0; x < extended_output_shape.dims(2); ++x)
74 for (int c = 0; c < extended_output_shape.dims(3); ++c)
76 const int output_data_offset =
77 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
78 extended_output_shape.dims(3) +
81 output_data[output_data_offset] =
82 std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] +
83 input2_data[subscriptToIndex(desc2, b, y, x, c)],
92 } // namespace luci_interpreter_pal
94 #endif // LUCI_INTERPRETER_PAL_ADD_COMMON_H