axis_name = axis_names[axis];
}
- auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+ auto out = genTensor(op, op.getOutputShape(0));
auto prefix = out->name() + "_concatenate_layer";
auto inputs_var = _constrBlock->var("std::vector<arm_compute::ICLTensor*>", prefix + "_inputs");
auto inputs = inputs_var->use();
// CLPermute does not support all kinds of permutations now.
// rank can be more than 2 in our models, so we can not use CLTranspose.
// This means we can support tensors with no more then one axis > 1.
- int axis = op.getAxis() < 0 ? rank + op.getAxis() : op.getAxis();
+ int axis = op.getAxis();
assert(axis == rank - 1);
int nof_long_axes = 0;
throw AclCppException("Unsupported Softmax operation with several dimensions greater than 1");
// Create the output tensor.
- Shape in_out_shape(op.getOutputShape(0));
- Shape sm_shape(in_out_shape);
-
- if (axis != 0) {
- int sm_dim = sm_shape.dim(axis);
- sm_shape.dim(axis) = sm_shape.dim(0);
- sm_shape.dim(0) = sm_dim;
- }
-
- Shape transposed_out_shape;
-
- switch (in_out_shape.rank()) {
- case 4:
- transposed_out_shape = transposeShape<3, 2, 1, 0>(in_out_shape);
- break;
- case 2:
- transposed_out_shape = transposeShape<1, 0>(in_out_shape);
- break;
- default:
- throw AclCppException("Unsupported number of dimensions in softmax");
- }
+ const Shape& in_out_shape = op.getOutputShape(0);
- auto out = genTensor(op, transposed_out_shape);
- auto prefix = out->name();
+ shared_ptr<ArtifactId> output = genTensor(op, in_out_shape);
+ auto layer_name_prefix = output->name();
if (axis == 0) {
// Simple version: do not need pre and post reshapes.
- // Apply the softmax operaion.
- auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer",
- {AF::ref(in), AF::ref(out)});
- allocate(out);
+ // Apply the softmax operation.
+ auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
+ {AF::ref(in), AF::ref(output)});
+ allocate(output);
genLayerExecution(sm);
} else {
+ // TODO: refactor this code; it currently supports only batch size 1
+
// Need to reshape before the Softmax application and after it.
// Then we need two tensors for intermediate results. This is because we do a couple of auxiliary
// reshapes: one to transform the input tensor to a unidimensional tensor and the second to
// transorm the result of the softmax operation back to the original form.
- auto tmp = genTensor(prefix + "_tmp", sm_shape);
- auto tmp2 = genTensor(prefix + "_tmp2", sm_shape);
+ Shape sm_shape(in_out_shape);
+
+ std::swap(sm_shape.dim(axis), sm_shape.dim(-1));
+
+ auto tmp = genTensor(layer_name_prefix + "_tmp", sm_shape);
+ auto tmp2 = genTensor(layer_name_prefix + "_tmp2", sm_shape);
// Do the input permutation.
- auto transp1 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer1",
+ auto transp1 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer1",
{AF::ref(in), AF::ref(tmp)});
allocate(tmp);
genLayerExecution(transp1);
// Apply the softmax operaion.
- auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer",
+ auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
{AF::ref(tmp), AF::ref(tmp2)});
allocate(tmp2);
genLayerExecution(sm);
// Reshape the output to the original form.
- auto transp2 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer2",
- {AF::ref(tmp2), AF::ref(out)});
- allocate(out);
+ auto transp2 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer2",
+ {AF::ref(tmp2), AF::ref(output)});
+ allocate(output);
genLayerExecution(transp2);
}
}
_constrBlock->var("arm_compute::CLTensor&", output->name(), {}, {input});
return output;
}
- Shape transposed_shape = transposeShape<2, 1, 3, 0>(input_shape);
+ Shape transposed_shape = transposeShape<0, 3, 1, 2>(input_shape);
shared_ptr<ArtifactId> transposed_id =
genTensor(name, transposed_shape, false);
genTranspose(input, transposed_id, {0, 3, 1, 2});
_constrBlock->var("arm_compute::CLTensor&", output->name(), {}, {input});
return output;
}
- Shape transposed_shape = transposeShape<1, 3, 2, 0>(input_shape);
+ Shape transposed_shape = transposeShape<0, 2, 3, 1>(input_shape);
shared_ptr<ArtifactId> transposed_id =
genTensor(name, transposed_shape, false);
genTranspose(input, transposed_id, {0, 2, 3, 1});
// Generate auxiliary tensor to hold transposed output of pool in NCHW format
Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0));
shared_ptr<ArtifactId> transposed_output =
- genTensor(layer_name + "_out_transpose", transposeShape<3, 2, 1, 0>(transposed_output_shape));
+ genTensor(layer_name + "_out_transpose", transposed_output_shape);
// Actual layer creation
shared_ptr<ArtifactId> layer = genLayer("arm_compute::CLPoolingLayer", layer_name,
}
void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
- const TensorVariant& ir_weights = op.getWeights();
+ const TensorVariant ir_weights = transposeTensor<1, 0>(op.getWeights());
const Shape& ir_weights_shape = ir_weights.getShape();
auto& prev_nodes = op.getPrevNodes();
const Shape& out_shape = op.getOutputShape(0);
if (out_shape.rank() != 2)
throw AclCppException("Unsupported number of dimensions in fc layer");
- auto out = genTensor(op, transposeShape<1, 0>(out_shape));
+ auto out = genTensor(op, out_shape);
string operation_name = out->name() + "_fully_connected_layer";
// Create the weights tensor in the DOM and use its id.
transposed_input = genTransposeMIRtoACL(transposed_input_name, op.getInputShape(0), input);
transposed_output =
- genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+ genTensor(transposed_output_name, transposed_output_shape);
break;
}
case 2:
- transposed_output_shape = out_shape;
- transposed_input = input;
- transposed_output = genTensor(tensorName(&op), transposeShape<1, 0>(transposed_output_shape));
- break;
case 1:
transposed_output_shape = out_shape;
transposed_input = input;
const auto ir_input_shape = op.getInputShape(0);
Shape ir_biases_shape(ir_input_shape.rank());
- // TODO remove this if after batch axis is restored in all operations in Model IR
- if (op.getPrevNodes()[0].op->getType() == Operation::Type::fullyConnected) {
- // Fully connected layer restores batch axis in result, so need to copy shape with redundant 1
- // Shape transpose is needed to generate axises in reverse order
- ir_biases_shape = transposeShape<1, 0>(op.getInputShape(0));
- } else {
- // ACL CLArithmeticAddition supports input tensors broadcasting.
- for (int i = 0; i < ir_input_shape.rank(); ++i)
- ir_biases_shape.dim(i) = 1;
+ // ACL CLArithmeticAddition supports input tensors broadcasting.
+ for (int i = 0; i < ir_input_shape.rank(); ++i)
+ ir_biases_shape.dim(i) = 1;
- ir_biases_shape.dim(2) = ir_biases.getShape().dim(0);
- }
+ ir_biases_shape.dim(1) = ir_biases.getShape().dim(0);
auto biases = genTensor(layer_name + "_biases", ir_biases_shape);
// Instantiate the CLArithmeticAddition object.
void AclCppOpGenerator::visit(ops::VariableOp& op) {
shared_ptr<ArtifactId> tensor;
if (cli::debugTranspose) {
- if (op.getOutputShape(0).rank() == 2)
- tensor = genTensor(op, transposeShape<1, 0>(op.getOutputShape(0)));
- else
- tensor = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+ tensor = genTensor(op, op.getOutputShape(0));
} else {
- if (op.getOutputShape(0).rank() == 2)
- tensor = genTensor(op, transposeShape<1, 0>(op.getOutputShape(0)));
+ if (op.getOutputShape(0).rank() == 4)
+ tensor = genTensor(op, transposeShape<0, 3, 1, 2>(op.getOutputShape(0)));
else
- tensor = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+ tensor = genTensor(op, op.getOutputShape(0));
}
allocate(tensor);
}
Shape out_shape = op.getOutputShape(0);
TensorVariant data = op.getValue();
- Shape transposed_shape;
- // FIXME This is temporary solution,
- // need to move this shape transposes into genTensor function and
- // implement transpose operation to support ranks greater than 2
- switch (out_shape.rank()) {
- case 2:
- transposed_shape = transposeShape<1, 0>(out_shape);
- break;
- case 1:
- transposed_shape = out_shape;
- break;
- default:
- throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank()));
- }
-
- shared_ptr<ArtifactId> out = genTensor(op, transposed_shape);
+ shared_ptr<ArtifactId> out = genTensor(op, out_shape);
allocate(out);
// Serialize the weights tensor and generate the function to deserialize it in the artifact.
}
}
- Shape transposed_shape;
- switch (out_shape.rank()) {
- case 2:
- transposed_shape = transposeShape<1, 0>(out_shape);
- break;
- case 1:
- transposed_shape = out_shape;
- break;
- default:
- throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank()));
- }
- shared_ptr<ArtifactId> out = genTensor(op, transposed_shape);
+ shared_ptr<ArtifactId> out = genTensor(op, out_shape);
// Create an instance of the CLReshapeLayer class as a member of the artifact class.
auto layer = genLayer("arm_compute::CLReshapeLayer", out->name() + "_reshape_layer",
transposed_output_shape = transposeShape<0, 3, 1, 2>(out_shape);
break;
case 2:
- transposed_output_shape = transposeShape<1, 0>(out_shape);
- break;
case 1:
transposed_output_shape = out_shape;
break;
const string transposed_output_name = output_tensor_name + "_transposed_output";
shared_ptr<ArtifactId> transposed_output =
- genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+ genTensor(transposed_output_name, transposed_output_shape);
auto operation_name = transposed_output->name() + "_scale_layer";
const auto& ir_scales = op.getWeights();
// Reshape the IR scales tensor and generate the corresponding DOM tensor.
- const Shape ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0));
+ const Shape ir_input_shape = transposeShape<0, 3, 1, 2>(op.getInputShape(0));
Shape ir_scales_shape(ir_input_shape.rank());
// ACL CLArithmeticDivision supports input tensors broadcasting.
for (int i = 0; i < ir_input_shape.rank(); ++i)
ir_scales_shape.dim(i) = 1;
- ir_scales_shape.dim(2) = ir_scales.getShape().dim(0);
+ ir_scales_shape.dim(1) = ir_scales.getShape().dim(0);
auto scales = genTensor(operation_name + "_scales", ir_scales_shape);
// We do not use the genMultiplication() function here because the input needs broadcasting.
void AclCppOpGenerator::visit(ops::ElementwiseOp& op) {
// Create the output tensor in the DOM and obtain its identifier.
- auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+ auto out = genTensor(op, op.getOutputShape(0));
auto& prev_nodes = op.getPrevNodes();
assert(prev_nodes.size() >= 2);
template <typename Op>
void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, const string& suffix) {
- auto ir_weights = transposeTensor<1, 0, 2, 3>(op.getKernel());
+ auto ir_weights = transposeTensor<3, 2, 0, 1>(op.getKernel());
const auto& ir_weights_shape = ir_weights.getShape();
- assert(ir_weights_shape.rank() == 4);
- Shape ir_biases_shape({ir_weights_shape.dim(-1)});
auto& prev_nodes = op.getPrevNodes();
assert(prev_nodes.size() == 1);
const string transposed_output_name = output_tensor_name + "_transposed_output";
Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0));
shared_ptr<ArtifactId> transposed_output =
- genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+ genTensor(transposed_output_name, transposed_output_shape);
string operation_name = output_tensor_name + suffix;
// Create the output tensor in the DOM and return its id.
shared_ptr<ArtifactId> output;
if (cli::debugTranspose)
- output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+ output = genTensor(op, op.getOutputShape(0));
else
- output = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+ output = genTensor(op, transposeShape<0, 3, 1, 2>(op.getOutputShape(0)));
auto prefix = output->name() + "_activation_layer";
// create vector of initializers from Shape
shape_vectorized.reserve(ir_shape.rank());
for (int i = 0; i < ir_shape.rank(); ++i)
- shape_vectorized.push_back(ir_shape.dim(i));
+ shape_vectorized.push_back(ir_shape.dim(-i - 1));
const char* type_name = "arm_compute::TensorShape";
shared_ptr<ArtifactId> shape =
dimensions.at(i) = shape.dim(i);
}
- for (;;) {
- float v;
- memcpy(&v, tensor.at(coords), tensor.getElementSize());
- _parOut.write(tensor.at(coords), tensor.getElementSize());
- bool stop = true;
- int i;
-
- for (i = 0; i < shape.rank(); ++i) {
- if(coords.at(i) < dimensions.at(i) - 1) {
- ++coords.at(i);
- stop = false;
- break;
- }
- }
-
- if (stop) {
- break;
- } else {
- for (int j = 0; j < i; ++j)
- coords.at(j) = 0;
- }
- }
+ size_t data_size = tensor.getElementSize() * tensor.getShape().numElements();
+ _parOut.write(tensor.atOffset(0), data_size);
}
void AclCppOpGenerator::genDeserializations() {
if (op.getOutputShape(0).rank() != 4)
throw AclCppException("Unsupported number of dimensions in transpose operation");
// TODO replace transpose shape
- shared_ptr<ArtifactId> output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+ shared_ptr<ArtifactId> output = genTensor(op, op.getOutputShape(0));
// Actual generation of operation and related stuff
genTranspose(input, output, mir_axis_order);