From 5cb9a4a4642de09414925f110485a247d10c0e4f Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EC=98=A4=ED=98=95=EC=84=9D/On-Device=20Lab=28SR=29/Staff?=
 =?utf8?q?=20Engineer/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?=
 <hseok82.oh@samsung.com>
Date: Wed, 27 Mar 2019 18:53:14 +0900
Subject: [PATCH] Fix cl codes' format (#4820)

Fix cl codes' format using c++ format

Signed-off-by: Hyeongseok Oh <hseok82.oh@samsung.com>
---
 .../src/core/CL/cl_kernels/activation_layer_ex.cl  |  99 ++++---
 .../src/core/CL/cl_kernels/arg_operation.cl        | 135 +++++----
 .../core/CL/cl_kernels/arithmetic_op_quantized.cl  | 193 ++++++++-----
 .../src/core/CL/cl_kernels/binary_logical_op.cl    | 122 ++++----
 libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl   | 238 +++++++++------
 .../src/core/CL/cl_kernels/depth_to_space.cl       | 202 +++++++------
 .../src/core/CL/cl_kernels/embedding_lookup.cl     | 123 +++++---
 libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl    |  63 ++--
 libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl | 156 ++++++----
 .../src/core/CL/cl_kernels/hashtable_lookup.cl     | 131 +++++----
 .../src/core/CL/cl_kernels/neg_tensor.cl           |  37 ++-
 libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl    | 127 ++++----
 .../src/core/CL/cl_kernels/permute_ex.cl           |  97 ++++---
 .../core/CL/cl_kernels/pixelwise_mul_quantized.cl  | 146 ++++++----
 libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl  | 100 ++++---
 .../src/core/CL/cl_kernels/prelu_quantized.cl      | 124 ++++----
 .../src/core/CL/cl_kernels/reduce_operation.cl     | 240 ++++++++-------
 .../src/core/CL/cl_kernels/space_to_batch.cl       | 321 +++++++++++++--------
 .../src/core/CL/cl_kernels/space_to_depth.cl       | 202 +++++++------
 .../src/core/CL/cl_kernels/squared_difference.cl   | 102 ++++---
 libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl |  61 ++--
 .../src/core/CL/cl_kernels/topkv2_quicksort.cl     | 125 ++++----
 .../src/core/CL/cl_kernels/topkv2_radixsort.cl     | 182 ++++++------
 scripts/command/format-check                       |   2 +-
 24 files changed, 1947 insertions(+), 1381 deletions(-)

diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
index f54c7bd..5f73e41 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
@@ -23,10 +23,7 @@
 #define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a)))
 
 // Inverse Square-root Activation
-inline TYPE rsqrt_op(TYPE x)
-{
-    return RSQRT_OP(x);
-}
+inline TYPE rsqrt_op(TYPE x) { return RSQRT_OP(x); }
 
 #define ACTIVATION_OP2(op, x) op##_op(x)
 #define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
@@ -35,55 +32,75 @@ inline TYPE rsqrt_op(TYPE x)
 
 /** This performs an activation function floating point inputs.
  *
- * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ * @note In order to perform the activation function "in-place",
+ *       the pre-processor -DIN_PLACE must be passed at compile time
  *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type.
+ *       e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size.
+ *       e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name.
+ *       e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set
+ *       using -DA_VAL= and -DB_VAL= respectively.
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image.
+ *                                                  Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements
+ *                                                  along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements
+ *                                                  along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension
+ *                                                  (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements
+ *                                                  along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element
+ *                                                  in the source image
+ * @param[out] output_ptr                           Pointer to the destination image.
+ *                                                  Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image
+ *                                                  in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements
+ *                                                  along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image
+ *                                                  in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements
+ *                                                  along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements
+ *                                                  along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element
+ *                                                  in the destination image
  */
-__kernel void activation_layer_ex(
-    TENSOR3D_DECLARATION(input)
+__kernel void activation_layer_ex(TENSOR3D_DECLARATION(input)
 #ifndef IN_PLACE
-    ,
-    TENSOR3D_DECLARATION(output)
+                                      ,
+                                  TENSOR3D_DECLARATION(output)
 #endif /* not IN_PLACE */
-)
+                                      )
 {
-    // Get pixels pointer
-    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  // Get pixels pointer
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
 #ifdef IN_PLACE
-    Tensor3D output = input;
+  Tensor3D output = input;
 #else  /* IN_PLACE */
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 #endif /* IN_PLACE */
 
-    // Load data
-    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+  // Load data
+  TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
 
-    // Perform activation
-    data = ACTIVATION_OP(ACT, data);
+  // Perform activation
+  data = ACTIVATION_OP(ACT, data);
 
-    // Store result
-    VSTORE(VEC_SIZE)
-    (data, 0, (__global DATA_TYPE *)output.ptr);
+  // Store result
+  VSTORE(VEC_SIZE)
+  (data, 0, (__global DATA_TYPE *)output.ptr);
 }
 
 #endif /* defined(ACT) */
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
index a7b3550..2a6dfc9 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
@@ -19,76 +19,95 @@
 #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
 /** Perform arg_max/arg_min
  *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type.
+ *       e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ *            e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as
+ *            preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: U32
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ * types:
+ *                                                  U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension
+ *                                                  (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element
+ *                                                  in the source image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension
+ *                                                  (in bytes)
+ * @param[in]  input_step_w                         output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image.
+ *                                                  Supported data types: U32
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  * @param[in]  axis                                 Axis through which reduction occurs
  * @param[in]  dim                                  Dimension across the axis to be reduced.
  */
 
-__kernel void arg_op(TENSOR4D_DECLARATION(input),
-                     TENSOR4D_DECLARATION(output),
-		     const int axis,
-		     const int dim)
+__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis,
+                     const int dim)
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
 
-    int indices[4] =
-    {
-        get_global_id(0),
-        get_global_id(1),
-        get_global_id(2) % DEPTH_OUT,
-        get_global_id(2) / DEPTH_OUT,
-    };
+  int indices[4] = {
+      get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+      get_global_id(2) / DEPTH_OUT,
+  };
 
-    DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
-    DATA_TYPE tval = value;
-    int idx = 0;
-    for(int i = 1; i < dim; ++i)
-    {
-      indices[axis] = i;
+  DATA_TYPE value =
+      *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+  DATA_TYPE tval = value;
+  int idx = 0;
+  for (int i = 1; i < dim; ++i)
+  {
+    indices[axis] = i;
 
-      #if OP_CODE == 1 // ArgMax
-        value = max(value, *((__global DATA_TYPE *)
-		             tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
-      #elif OP_CODE == 2 //ArgMin
-        value = min(value, *((__global DATA_TYPE *)
-		    	     tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
-      #else
-	return;
+#if OP_CODE == 1 // ArgMax
+    value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+                                                               indices[2], indices[3])));
+#elif OP_CODE == 2 // ArgMin
+    value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+                                                               indices[2], indices[3])));
+#else
+    return;
 
-      #endif
+#endif
 
-      if(tval!=value)
-      {
-        idx = indices[axis];
-        tval = value;
-      }
+    if (tval != value)
+    {
+      idx = indices[axis];
+      tval = value;
     }
+  }
 
-    *((__global uint *)out.ptr) = idx;
+  *((__global uint *)out.ptr) = idx;
 }
 #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
index 5cd0a43..77e239f 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -24,7 +24,8 @@
 #define SUB(x, y) (x) - (y)
 #endif /* SATURATE */
 
-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to
+ *  QASYMM8
  *
  * The following computations will be performed:
  *
@@ -36,91 +37,131 @@
  *  -# Shift the int32 accumulator by result_shift
  *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
  *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -RIN2_MULT_INT and -DIN2_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ * @attention The inputs and output data types need to be passed at compile time using
+ *            -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ *            e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The number of bits to shift left of input tensors must be passed at compile time using
+ *            -DLEFT_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of input tensors
+ *            must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT,
+ -DIN2_OFFSET,
+ *            -RIN2_MULT_INT and -DIN2_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ *            must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and
+ -DRESULT_SHIFT
  *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
- * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
- * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
- * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention The input and output data_types need to be passed at compile time using
+ *            -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ *            e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The inputs and output scale information of qasymm8 need to be passed at compile time
+ *            using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
+ *            e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
+ * @attention The inputs and output scale offset need to be passed at compile time using
+ *            -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
+ *            e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise
+ *            wrapping policy will be used.
  *
- * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  in1_ptr                           Pointer to the source tensor.
+ *                                               Supported data types: QASYMM8
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension
+ *                                               (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension
+ *                                               (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension
+ *                                               (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source
+ *                                               tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types:
+ *                                               QASYMM8
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension
+ *                                               (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension
+ *                                               (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension
+ *                                               (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source
+ *                                               tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor.
+ *                                               Supported data types: QASYMM8
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension
+ *                                               (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed
+ *                                               per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension
+ *                                               (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed
+ *                                               per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension
+ *                                               (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed
+ *                                               per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination
+ *                                               tensor
  */
-__kernel void arithmetic_add_qasymm8(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out))
+__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
+                                     TENSOR3D_DECLARATION(out))
 {
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+  // Get pixels pointer
+  Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+  Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+  Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
-    // Load data
-    VEC_DATA_TYPE(int, 16)
-    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
-    VEC_DATA_TYPE(int, 16)
-    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+  // Load data
+  VEC_DATA_TYPE(int, 16)
+  in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+  VEC_DATA_TYPE(int, 16)
+  in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
 
-    // Get scaled value of two inputs
-    VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
-    VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+  // Get scaled value of two inputs
+  VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+  VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
 
-    VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
-    VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
-    VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
+  VEC_DATA_TYPE(int, 16)
+  left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
+  VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
+  VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
 
-    VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
-    VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
+  VEC_DATA_TYPE(int, 16)
+  scaled_in1_val =
+      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
+  VEC_DATA_TYPE(int, 16)
+  scaled_in2_val =
+      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
 
-    // Add inputs and multiply with a multiplier smaller than 1
-    VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
-    VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
-    out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+  // Add inputs and multiply with a multiplier smaller than 1
+  VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
+  VEC_DATA_TYPE(int, 16)
+  out_val =
+      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+  out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
 
-    VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+  VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
 
-// TODO: Apply min-max BOUND to support fuse with relu.
-/*
-#if defined(MIN_BOUND)
-    res = max(res, (uchar16)MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
-    res = min(res, (uchar16)MAX_BOUND);
-#endif // defined(MAX_BOUND)
-*/
+  // TODO: Apply min-max BOUND to support fuse with relu.
+  /*
+  #if defined(MIN_BOUND)
+      res = max(res, (uchar16)MIN_BOUND);
+  #endif // defined(MIN_BOUND)
+  #if defined(MAX_BOUND)
+      res = min(res, (uchar16)MAX_BOUND);
+  #endif // defined(MAX_BOUND)
+  */
 
-    // Store result
-    VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
-                     0, (__global DATA_TYPE_OUT *)out.ptr);
+  // Store result
+  VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
 }
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
index bea61f5..8c87551 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -22,63 +22,85 @@
 
 #if defined(OP_CODE) && defined(DATA_TYPE)
 /** returns truth value of the two input tensors for BINARY LOGICAL OP.
- * where BINARY LOGICAL OP can be AND, OR.
+ *  where BINARY LOGICAL OP can be AND, OR.
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size.
+ *            e.g. -DVEC_SIZE=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as
+ *            preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
  *
- * @param[in]  input1_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[in]  input2_ptr                            Pointer to the source tensor.Supported data types: QASYMM8
- * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input1_ptr                            Pointer to the source tensor.
+ *                                                   Supported data types: QASYMM8
+ * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension
+ *                                                   (in bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension
+ *                                                   (in bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                   tensor
+ * @param[in]  input2_ptr                            Pointer to the source tensor.
+ *                                                   Supported data types: QASYMM8
+ * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension
+ *                                                   (in bytes)
+ * @param[in]  input2_step_x                         input2_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension
+ *                                                   (in bytes)
+ * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                   tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor.
+ *                                                   Supported data types: QASYMM8
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
  */
-__kernel void binary_logical_op(
-    TENSOR3D_DECLARATION(input1),
-    TENSOR3D_DECLARATION(input2),
-    TENSOR3D_DECLARATION(output))
+__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
+                                TENSOR3D_DECLARATION(output))
 {
-    Tensor3D input1  = CONVERT_TO_TENSOR3D_STRUCT(input1);
-    Tensor3D input2  = CONVERT_TO_TENSOR3D_STRUCT(input2);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+  Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+  Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    #if OP_CODE == 1 // LOGICAL AND
-     VSTORE(VEC_SIZE)
-     (CONVERT(VLOAD(VEC_SIZE)
-	     (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
-             VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+#if OP_CODE == 1 // LOGICAL AND
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
+               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
 
-    #elif OP_CODE == 2 // LOGICAL OR
-     VSTORE(VEC_SIZE)
-     (CONVERT(VLOAD(VEC_SIZE)
-	     (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
-             VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+#elif OP_CODE == 2 // LOGICAL OR
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
+               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
 
-    #else // OP NOT SUPPORTED
-     return
+#else // OP NOT SUPPORTED
+  return
 
-    #endif
+#endif
 }
-#endif //if defined(OP_CODE) && defined(DATA_TYPE)
+#endif // if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
index 3d4675e..d5a0747 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -17,10 +17,10 @@
 #include "helpers.h"
 
 #ifndef SCALE
-#define SCALE    1.0f
+#define SCALE 1.0f
 #endif
 #ifndef OFFSET
-#define OFFSET   0
+#define OFFSET 0
 #endif
 #ifndef VEC_SIZE
 #define VEC_SIZE 1
@@ -29,118 +29,170 @@
 #if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
 /** Perform a cast operation on an input tensor.
  *
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+ *            -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  */
-__kernel void cast(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
 {
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
-                             VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
-                     0, (__global DATA_TYPE_OUT *)output.ptr);
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
+           VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
+   0, (__global DATA_TYPE_OUT *)output.ptr);
 }
 
 /** Perform a cast operation on an QASYMM8 input tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+ *            -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and Scale of input should be given as a preprocessor argument using
+ *            -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  */
-__kernel void cast_qasymm_in(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
 {
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
-        VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-    VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
-    VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
+  VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+  in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+  VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+  VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
 
-    VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
-    VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
+  VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
+  VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
 
-    VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
-                     0, (__global DATA_TYPE_OUT *)output.ptr);
+  VSTORE(VEC_SIZE)
+  (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+   (__global DATA_TYPE_OUT *)output.ptr);
 }
 
-
 /** Perform a cast operation on an QASYMM8 output tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+ *            -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and Scale of output should be given as a preprocessor argument using
+ *            -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                 bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: U8
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  */
-__kernel void cast_qasymm_out(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
 {
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
-        VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-    VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
-    VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
+  VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+  in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+  VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+  VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
 
-    VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
-    VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
+  VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
+  VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
 
-    VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
-                     0, (__global DATA_TYPE_OUT *)output.ptr);
+  VSTORE(VEC_SIZE)
+  (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+   (__global DATA_TYPE_OUT *)output.ptr);
 }
 #endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
index 08f7266..e005322 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -20,104 +20,142 @@
 /** Perform space to depth rearrangement of tensor
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ *            e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
+ *            using -DZ_OUT=size. e.g. -DZ_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ *            -DBLOCK_SIZE=1
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  */
-__kernel void depth_to_space_nchw(
-     TENSOR4D_DECLARATION(input),
-     TENSOR4D_DECLARATION(output))
- {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
+__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
 
-    int out_index[4]={0};
-    int in_index[4]={0};
+  int out_index[4] = {0};
+  int in_index[4] = {0};
 
-    out_index[0] = get_global_id(0);//W
-    out_index[1] = get_global_id(1);//H
-    out_index[2] = get_global_id(2) % Z_OUT;//C
-    out_index[3] = get_global_id(2) / Z_OUT;//B
+  out_index[0] = get_global_id(0);         // W
+  out_index[1] = get_global_id(1);         // H
+  out_index[2] = get_global_id(2) % Z_OUT; // C
+  out_index[3] = get_global_id(2) / Z_OUT; // B
 
-    in_index[0] = out_index[0]/BLOCK_SIZE;
-    in_index[1] = out_index[1]/BLOCK_SIZE;
-    in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
-    in_index[3] = out_index[3];
+  in_index[0] = out_index[0] / BLOCK_SIZE;
+  in_index[1] = out_index[1] / BLOCK_SIZE;
+  in_index[2] = out_index[2] +
+                ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
+  in_index[3] = out_index[3];
 
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
- }
+  *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+      &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+}
 #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
 
 #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
 /** Perform space to depth rearrangement of tensor (NHWC)
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ *            e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
+ *            using -DZ_OUT=size. e.g. -DZ_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ *            -DBLOCK_SIZE=1
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  */
-__kernel void depth_to_space_nhwc(
-     TENSOR4D_DECLARATION(input),
-     TENSOR4D_DECLARATION(output))
- {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
+__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
 
-    int out_index[4]={0};
-    int in_index[4]={0};
+  int out_index[4] = {0};
+  int in_index[4] = {0};
 
-    out_index[0] = get_global_id(0);//C
-    out_index[1] = get_global_id(1);//W
-    out_index[2] = get_global_id(2) % Z_OUT;//H
-    out_index[3] = get_global_id(2) / Z_OUT;//B
+  out_index[0] = get_global_id(0);         // C
+  out_index[1] = get_global_id(1);         // W
+  out_index[2] = get_global_id(2) % Z_OUT; // H
+  out_index[3] = get_global_id(2) / Z_OUT; // B
 
-    in_index[0] = out_index[0] + ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
-    in_index[1] = out_index[1]/BLOCK_SIZE;
-    in_index[2] = out_index[2]/BLOCK_SIZE;
-    in_index[3] = out_index[3];
+  in_index[0] = out_index[0] +
+                ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
+  in_index[1] = out_index[1] / BLOCK_SIZE;
+  in_index[2] = out_index[2] / BLOCK_SIZE;
+  in_index[3] = out_index[3];
 
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
- }
+  *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+      &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+}
 #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
index 348458f..dd8cb6d 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -23,62 +23,91 @@
 #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
 /** Perform embedding_lookup of input tensor
  *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
- * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ *       -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using
+ *            -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
  *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                          output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data
+ *                                                   types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source
+ *                                                   tensor
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_w                          output_stride_w * number of elements along W
+ *                                                   processed per workitem(in bytes)
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported
+ *                                                   data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_w                       Stride of the source tensor in W dimension (in
+ *                                                   bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *                                                   destination tensor
+ * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data
+ *                                                   types: S32
+ * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in
+ *                                                   bytes)
+ * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups
+ *                                                   vector
  */
 
-__kernel void embedding_lookup(TENSOR4D_DECLARATION(input),
-                               TENSOR4D_DECLARATION(output),
+__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
                                VECTOR_DECLARATION(lookups))
 {
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
 
-    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+  Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
 
-    //lookup ids for based on the tensor dimensions
-    int lup_id[4] = {0};
+  // lookup ids for based on the tensor dimensions
+  int lup_id[4] = {0};
 
-    lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
-	                       :get_global_id(0);
-    lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
-	                       :get_global_id(1);
-    lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
-	                       :get_global_id(2)%DEPTH_OUT;
-    lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
-                               :get_global_id(2) / DEPTH_OUT;
+  lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+                              : get_global_id(0);
+  lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
+                              : get_global_id(1);
+  lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+                              : get_global_id(2) % DEPTH_OUT;
+  lup_id[3] = (NUM_DIMS == 4)
+                  ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                  : get_global_id(2) / DEPTH_OUT;
 
-    in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
-              + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+  in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
+            lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
 
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                     0, (__global DATA_TYPE *)out.ptr);
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
+   (__global DATA_TYPE *)out.ptr);
 }
 #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
index 69d94f3..cb80359 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
@@ -24,34 +24,49 @@
 /** Perform an exponential operation on an input tensor.
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
  * @note Can only take floating point data types.
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  */
-__kernel void exp_layer(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+__kernel void exp_layer(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
 {
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    VSTORE(VEC_SIZE)
-    (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
+  VSTORE(VEC_SIZE)
+  (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
 }
 #endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
index 05560e8..26d2830 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
@@ -19,93 +19,121 @@
 #if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
 
 /** Performs the Gather operation along the chosen axis
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ *       -DDATA_TYPE=short
  * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
- * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
- * @attention Input tensor depth should be given as a preprocessor argument using -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ *            -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
  *
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per work item (in bytes)
- * @param[in]  input_stride_w                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per work item (in bytes)
- * @param[in]  input_offset_first_element_in_bytes   Offset of the first element in the source tensor
- * @param[in]  indices_ptr                           Pointer to the source tensor. Supported data types: S32
- * @param[in]  indices_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  indices_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  indices_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per work item (in bytes)
- * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per work item (in bytes)
- * @param[in]  output_offset_first_element_in_bytes  Offset of the first element in the destination tensor
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data
+ *                                                   types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X
+ *                                                   processed per work item (in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y
+ *                                                   processed per work item (in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z
+ *                                                   processed per work item (in bytes)
+ * @param[in]  input_stride_w                        Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_w                          input_stride_w * number of elements along W
+ *                                                   processed per work item (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   Offset of the first element in the source
+ *                                                   tensor
+ * @param[in]  indices_ptr                           Pointer to the source tensor. Supported data
+ *                                                   types: S32
+ * @param[in]  indices_stride_x                      Stride of the source tensor in X dimension (in
+ *                                                   bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X
+ *                                                   processed per workitem(in  bytes)
+ * @param[in]  indices_stride_y                      Stride of the source tensor in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y
+ *                                                   processed per workitem(in  bytes)
+ * @param[in]  indices_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z
+ *                                                   processed per workitem(in  bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the
+ *                                                   destination tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported
+ *                                                   data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X
+ *                                                   processed per work item (in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y
+ *                                                   processed per work item (in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z
+ *                                                   processed per work item (in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W
+ *                                                   processed per work item (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  Offset of the first element in the destination
+ *                                                   tensor
  */
-__kernel void gather(
-    TENSOR4D_DECLARATION(input),
-    TENSOR3D_DECLARATION(indices),
-    TENSOR4D_DECLARATION(output))
+__kernel void gather(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices),
+                     TENSOR4D_DECLARATION(output))
 {
-    const int px = get_global_id(0);
-    const int py = get_global_id(1);
-    const int pz = get_global_id(2) % OUTPUT_DIM_Z;
-    const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+  const int px = get_global_id(0);
+  const int py = get_global_id(1);
+  const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+  const int pw = get_global_id(2) / OUTPUT_DIM_Z;
 
-    const Tensor4D input   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
-    const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
-    Tensor4D       output  = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+  const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
 
 #if AXIS == 0
 #if INDICES_DIM == 1
-    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
-    __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
 #elif INDICES_DIM == 2
-    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
-    __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
 #elif INDICES_DIM == 3
-    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
-    __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
 #endif
 #elif AXIS == 1
 #if INDICES_DIM == 1
-    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
-    __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
 #elif INDICES_DIM == 2
-    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
-    __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
 #elif INDICES_DIM == 3
-    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
-    __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
 #endif
 #elif AXIS == 2
 #if INDICES_DIM == 1
-    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
-    __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
 #elif INDICES_DIM == 2
-    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
-    __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
 #endif
 #elif AXIS == 3
 #if INDICES_DIM == 1
-    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
-    __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
 #endif
-#endif //AXIS
+#endif // AXIS
 
-    *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+  *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
 }
 
-#endif //defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
index ed74098..73f29e3 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -23,66 +23,95 @@
 #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
 /** Perform hashtable_lookup of input tensor
  *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
- * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ *       -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using
+ *            -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
  *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                          output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data
+ *                                                   types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source
+ *                                                   tensor
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_w                          output_stride_w * number of elements along W
+ *                                                   processed per workitem(in bytes)
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported
+ *                                                   data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_w                       Stride of the source tensor in W dimension (in
+ *                                                   bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *                                                   destination tensor
+ * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data
+ *                                                   types: S32
+ * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in
+ *                                                   bytes)
+ * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups
+ *                                                   vector
  */
-__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input),
-                               TENSOR4D_DECLARATION(output),
+__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
                                VECTOR_DECLARATION(lookups))
 {
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
 
-    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+  Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
 
-    int lup_id[4] = {0};
+  int lup_id[4] = {0};
 
-    lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
-	                       :get_global_id(0);
-    lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
-	                       :get_global_id(1);
-    lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
-	                       :get_global_id(2)%DEPTH_OUT;
-    lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
-                               :get_global_id(2) / DEPTH_OUT;
+  lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+                              : get_global_id(0);
+  lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
+                              : get_global_id(1);
+  lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+                              : get_global_id(2) % DEPTH_OUT;
+  lup_id[3] = (NUM_DIMS == 4)
+                  ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                  : get_global_id(2) / DEPTH_OUT;
 
-    if (lup_id[NUM_DIMS-1] < 0)
-    {
-      VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
-      return;
-    }
+  if (lup_id[NUM_DIMS - 1] < 0)
+  {
+    VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
+    return;
+  }
 
-    in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
-              + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+  in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
+            lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
 
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                     0, (__global DATA_TYPE *)out.ptr);
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
+   (__global DATA_TYPE *)out.ptr);
 }
 #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
index e3aa463..4aa7883 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
@@ -23,26 +23,33 @@
 #if defined(DATA_TYPE)
 /** Performs a negation of input tensor.
  *
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
  *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: S16/S32/F16/F32.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types:
+ *                                               S16/S32/F16/F32.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in
+ *                                               bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed
+ *                                               per work item (in bytes)
  * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data
+ *                                               types: same as @p input_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in
+ *                                               bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed
+ *                                               per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination
+ * image
+ *
  */
-__kernel void neg_tensor(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
 {
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    VSTORE(VEC_SIZE)
-    (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
+  VSTORE(VEC_SIZE)
+  (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
 }
 #endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
index 91106a8..e74c6b5 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
@@ -16,71 +16,84 @@
  */
 #include "helpers.h"
 
-#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
+#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && \
+    defined(ZERO_VALUE)
 /** Basic function to pad a tensor
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ *            e.g. -DDEPTH_OUT=16
+ * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width),
+ *            -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4
  * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
  *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- *
- * @param[in]  pad_values                           Padding values for each of the dimensions. Only pad values for Up(for
- *                                                  batch), Top(for height), Left(for width) and Front(for depth) are
- *				                    required. Supported data type: S32
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data
+ *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported
+ *                                                  data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination tensor
+ * @param[in]  pad_values                           Padding values for each of the dimensions. Only
+ *                                                  pad values for Up(for batch), Top(for height),
+ *                                                  Left(for width) and Front(for depth) are
+ *                                                  required. Supported data type: S32
  */
 
-__kernel void pad(
-     TENSOR4D_DECLARATION(input),
-     TENSOR4D_DECLARATION(output),
-     const int4 pad_values)
- {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+__kernel void pad(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int4 pad_values)
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
 
-    int index[4]={0};
+  int index[4] = {0};
 
-    index[0] = get_global_id(0);//W
-    index[1] = get_global_id(1);//H
-    index[2] = get_global_id(2) % DEPTH_OUT;//C
-    index[3] = get_global_id(2) / DEPTH_OUT;//N
+  index[0] = get_global_id(0);             // W
+  index[1] = get_global_id(1);             // H
+  index[2] = get_global_id(2) % DEPTH_OUT; // C
+  index[3] = get_global_id(2) / DEPTH_OUT; // N
 
-    if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) ||
-        index[1] < pad_values.y || index[1] >= (IH + pad_values.y) ||
-        index[2] < pad_values.z || index[2] >= (ID + pad_values.z) ||
-        index[3] < pad_values.w || index[3] >= (IB + pad_values.w))
-    {
-        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
-    }
-    else
-    {
-        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)
-                                              tensor4D_offset(&in, index[0] - pad_values.x,
-                                                              index[1] - pad_values.y,
-                                                              index[2] - pad_values.z,
-                                                              index[3] - pad_values.w));
-    }
- }
+  if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) || index[1] < pad_values.y ||
+      index[1] >= (IH + pad_values.y) || index[2] < pad_values.z ||
+      index[2] >= (ID + pad_values.z) || index[3] < pad_values.w || index[3] >= (IB + pad_values.w))
+  {
+    *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+  }
+  else
+  {
+    *((__global DATA_TYPE *)out.ptr) = *(
+        (__global DATA_TYPE *)tensor4D_offset(&in, index[0] - pad_values.x, index[1] - pad_values.y,
+                                              index[2] - pad_values.z, index[3] - pad_values.w));
+  }
+}
 
-#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
+#endif // if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) &&
+       // defined(ZERO_VALUE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
index c628c88..637788a 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
@@ -17,55 +17,66 @@
 
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && \
+    defined(P4)
 /** Perform a Generic permute operation on an input tensor of Shape DCHW.
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
+              e.g. -DDEPTH_IN=16
+ * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and
+              -DP4=int, e.g. -DP1=2
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1
-6/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in b
-ytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in b
-ytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in b
-ytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  */
-__kernel void permute_generic(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
+__kernel void permute_generic(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
 
-    // WHCN format
-    int in_index[]={
-     get_global_id(0),
-     get_global_id(1),
-     get_global_id(2) % DEPTH_IN,
-     get_global_id(2) / DEPTH_IN,
-    };
+  // WHCN format
+  int in_index[] = {
+      get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_IN, get_global_id(2) / DEPTH_IN,
+  };
 
-    // New locations based on Permuted index calc as out_index[index] = in_index[new_index]
-    *((__global DATA_TYPE *)
-      tensor4D_offset(&out, in_index[P1], in_index[P2], in_index[P3], in_index[P4])) = *((__global DATA_TYPE *)in.ptr);
+  // New locations based on Permuted index calc as out_index[index] = in_index[new_index]
+  *((__global DATA_TYPE *)tensor4D_offset(&out, in_index[P1], in_index[P2], in_index[P3],
+                                          in_index[P4])) = *((__global DATA_TYPE *)in.ptr);
 }
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) &&
+       // defined(P4)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
index ab1307e..2074d3c 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -24,7 +24,8 @@
 #define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
 
 #if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
-/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of
+ *  GEMMLowp to QASYMM8
  *
  * The following computations will be performed by the kernel:
  *
@@ -35,77 +36,100 @@
  *  -# Shift the int32 accumulator by result_shift
  *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
  *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ * @attention The inputs and output data types need to be passed at compile time using
+ *            -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ *            e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and
+ *            -DIN2_OFFSET
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ *            must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and
+ *            -DRESULT_SHIFT
  *
- * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types:
+ *                                               U8
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in
+ *                                               bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in
+ *                                               bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in
+ *                                               bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed
+ *                                               per workitem(in bytes)
  * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types:
+ *                                               U8
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in
+ *                                               bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in
+ *                                               bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed
+ *                                               per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in
+ *                                               bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed
+ *                                               per workitem(in bytes)
  * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data
+ *                                               types: U8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in
+ *                                               bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed
+ *                                               per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in
+ *                                              bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed
+ *                                               per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in
+ *                                               bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed
+ *                                               per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination
+ *                                               image
  * @param[in]  scale                             Float scaling factor. Supported data types: F32
  */
-__kernel void pixelwise_mul_qasymm8(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out),
-    const float scale)
+__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
+                                    TENSOR3D_DECLARATION(out), const float scale)
 {
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+  // Get pixels pointer
+  Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+  Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+  Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
-    // Load data
-    VEC_DATA_TYPE(int, 16)
-    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
-    VEC_DATA_TYPE(int, 16)
-    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+  // Load data
+  VEC_DATA_TYPE(int, 16)
+  in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+  VEC_DATA_TYPE(int, 16)
+  in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
 
-    // Perform multiplication of two inputs
-    VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
-    VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
-    VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
+  // Perform multiplication of two inputs
+  VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+  VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+  VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
 
-    // Multiply with a multiplier smaller than 1
-    out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
-    out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+  // Multiply with a multiplier smaller than 1
+  out_val =
+      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+  out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
 
-    VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+  VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
 
-// TODO: Apply min-max BOUND to support fuse with relu.
-/*
-#if defined(MIN_BOUND)
-    res = max(res, (uchar16)MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
-    res = min(res, (uchar16)MAX_BOUND);
-#endif // defined(MAX_BOUND)
-*/
+  // TODO: Apply min-max BOUND to support fuse with relu.
+  /*
+  #if defined(MIN_BOUND)
+      res = max(res, (uchar16)MIN_BOUND);
+  #endif // defined(MIN_BOUND)
+  #if defined(MAX_BOUND)
+      res = min(res, (uchar16)MAX_BOUND);
+  #endif // defined(MAX_BOUND)
+  */
 
-    // Store result
-    VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
-                     0, (__global DATA_TYPE_OUT *)out.ptr);
+  // Store result
+  VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
 }
 #endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
index 68da2ba..62a8901 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
@@ -25,50 +25,72 @@
  * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
  * @note Can only take floating point data types.
  *
- * @param[in]  input1_ptr                            Pointer to the source image. Supported Data types : F16/F32
- * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  input1_ptr                            Pointer to the source image. Supported Data
+ *                                                   types : F16/F32
+ * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in
+ *                                                   bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                   image
+ * @param[in]  alpha_ptr                             Pointer to the source image. Supported Data
+ *                                                   types : F16/F32
+ * @param[in]  alpha_stride_x                        Stride of the source image in X dimension (in
+ *                                                   bytes)
+ * @param[in]  alpha_step_x                          input2_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  alpha_stride_y                        Stride of the source image in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  alpha_step_y                          input2_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  alpha_stride_z                        Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  alpha_step_z                          input2_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  alpha_offset_first_element_in_bytes   The offset of the first element in the source
+ *                                                   image
  *
- * @param[in]  alpha_ptr                            Pointer to the source image. Supported Data types : F16/F32
- * @param[in]  alpha_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  alpha_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  alpha_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  alpha_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  alpha_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  alpha_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  alpha_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] output_ptr                            Pointer to the destination image. Supported
+ *                                                   data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination image in X dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination image in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *                                                   destination image
  */
-__kernel void prelu(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(alpha),
-    TENSOR3D_DECLARATION(output))
+__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
+                    TENSOR3D_DECLARATION(output))
 {
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D alpha  = CONVERT_TO_TENSOR3D_STRUCT(alpha);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VSTORE(VEC_SIZE)
-      (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ?
-                       VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) :
-	               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
-       0, (__global DATA_TYPE *)output.ptr);
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
+  VSTORE(VEC_SIZE)
+  (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0
+       ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) *
+             VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr)
+       : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
+   0, (__global DATA_TYPE *)output.ptr);
 }
 #endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
index 7e97b7e..e69728d 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
@@ -17,7 +17,8 @@
 #include "helpers.h"
 #define SUB(x, y) (x) - (y)
 
-#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
+#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && \
+    defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
 
 #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
 #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
@@ -26,63 +27,86 @@
 #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
 
 /** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *  f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
  *
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g.
+ *            -DDATA_TYPE_IN=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
  * @note Can only take uchar data types.
  *
- * @param[in]  input1_ptr                            Pointer to the source image. Supported Data types : QASYMM8
- * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[in]  alpha_ptr                            Pointer to the source image. Supported Data types : QASYMM8
- * @param[in]  alpha_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  alpha_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  alpha_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  alpha_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  alpha_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  alpha_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  alpha_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input1_ptr                            Pointer to the source image. Supported Data
+ *                                                   types : QASYMM8
+ * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in
+ *                                                   bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                   image
+ * @param[in]  alpha_ptr                             Pointer to the source image. Supported Data
+ *                                                   types : QASYMM8
+ * @param[in]  alpha_stride_x                        Stride of the source image in X dimension (in
+ *                                                   bytes)
+ * @param[in]  alpha_step_x                          input2_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  alpha_stride_y                        Stride of the source image in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  alpha_step_y                          input2_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  alpha_stride_z                        Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  alpha_step_z                          input2_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  alpha_offset_first_element_in_bytes   The offset of the first element in the source
+ *                                                   image
+ * @param[out] output_ptr                            Pointer to the destination image. Supported
+ *                                                   data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination image in X dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination image in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *                                                   destination image
  */
-__kernel void prelu_qasymm8(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(alpha),
-    TENSOR3D_DECLARATION(output))
+__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
+                            TENSOR3D_DECLARATION(output))
 {
-    // Get pixels pointer
-    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+  // Get pixels pointer
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
-    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
+  VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
+  VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
 
-    in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1));
-    in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2));
+  in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1));
+  in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2));
 
-    const VEC_FLOAT in1f32  = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
-    const VEC_FLOAT in2f32  = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
-    const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32;
-    const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
-    const VEC_UCHAR res     = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+  const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+  const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+  const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32;
+  const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
+  const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
 
-    VSTORE(VEC_SIZE)
-    (res, 0, (__global uchar *)output.ptr);
+  VSTORE(VEC_SIZE)
+  (res, 0, (__global uchar *)output.ptr);
 }
 
-#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
+#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) &&
+       // defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
index 8bef493..d7ea2e2 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -19,134 +19,170 @@
 #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
 /** Perform reduce max/min
  *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ *       -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ *            e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as
+ *            preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_w                         output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  * @param[in]  axis                                 Axis through which reduction occurs
  * @param[in]  dim                                  Dimension across the axis to be reduced.
  */
-__kernel void reduce_min_max(TENSOR4D_DECLARATION(input),
-                             TENSOR4D_DECLARATION(output),
-                             const int axis,
-                             const int dim)
+__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+                             const int axis, const int dim)
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
 
-    int indices[4] =
-    {
-        get_global_id(0),
-        get_global_id(1),
-        get_global_id(2) % DEPTH_OUT,
-        get_global_id(2) / DEPTH_OUT,
-    };
+  int indices[4] = {
+      get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+      get_global_id(2) / DEPTH_OUT,
+  };
 
-    DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
-    for(int i = 1; i < dim; ++i)
-    {
-      indices[axis] = i;
+  DATA_TYPE value =
+      *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+  for (int i = 1; i < dim; ++i)
+  {
+    indices[axis] = i;
 
-      #if OP_CODE == 1 // REDUCE_MAX
-       value = max(value, *((__global DATA_TYPE *)
-			   tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+#if OP_CODE == 1 // REDUCE_MAX
+    value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+                                                               indices[2], indices[3])));
 
-      #elif OP_CODE == 2 // REDUCE_MIN
-       value = min(value, *((__global DATA_TYPE *)
-			   tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+#elif OP_CODE == 2 // REDUCE_MIN
+    value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+                                                               indices[2], indices[3])));
 
-      #else // OP NOT SUPPORTED
-       return;
+#else // OP NOT SUPPORTED
+    return;
 
-      #endif
-    }
+#endif
+  }
 
-    *((__global DATA_TYPE *)out.ptr) = value;
+  *((__global DATA_TYPE *)out.ptr) = value;
 }
 
 /** Perform reduce sum/mean
  *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ *       -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ *            e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as
+ *            preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_w                         output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  * @param[in]  axis                                 Axis through which reduction occurs
  * @param[in]  dim                                  Dimension across the axis to be reduced.
  */
-__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input),
-                              TENSOR4D_DECLARATION(output),
-                              const int axis,
-                              const int dim)
+__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+                              const int axis, const int dim)
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
 
-    int indices[4] =
-    {
-        get_global_id(0),
-        get_global_id(1),
-        get_global_id(2) % DEPTH_OUT,
-        get_global_id(2) / DEPTH_OUT,
-    };
+  int indices[4] = {
+      get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+      get_global_id(2) / DEPTH_OUT,
+  };
 
-    DATA_TYPE sum_value = (DATA_TYPE)0;
-    for(int i = 0; i < dim; ++i)
-    {
-      indices[axis] = i;
-      sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
-    }
+  DATA_TYPE sum_value = (DATA_TYPE)0;
+  for (int i = 0; i < dim; ++i)
+  {
+    indices[axis] = i;
+    sum_value += *(
+        (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+  }
 
-    #if OP_CODE == 3 // REDUCE_SUM
-     *((__global DATA_TYPE *)out.ptr) = sum_value;
+#if OP_CODE == 3 // REDUCE_SUM
+  *((__global DATA_TYPE *)out.ptr) = sum_value;
 
-    #elif OP_CODE == 4 // REDUCE_MEAN
-     *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
+#elif OP_CODE == 4 // REDUCE_MEAN
+  *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
 
-    #else // OP NOT SUPPORTED
-     return;
+#else // OP NOT SUPPORTED
+  return;
 
-    #endif
+#endif
 }
 #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
index a0fc2d5..7367da7 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
@@ -16,148 +16,235 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \
+    defined(WIDTH_IN) && defined(ZERO_VALUE)
 /** Perform space to batch with input of 4D and NCHW format
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
- * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
- * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ *            e.g. -DDEPTH_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size.
+ *            e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size.
+ *            e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size.
+ *            e.g. -DWIDTH_IN=16
  * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
  *
- * @param[in]  input_ptr                                   Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                              Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                              Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                              Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_stride_w                              Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  input_step_w                                input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the source tensor
- * @param[out] output_ptr                                  Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                             Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                               output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
- * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported data types: S32
- * @param[in]  block_size_stride_x                         Stride of the source tensor in X dimension (in bytes)
- * @param[in]  block_size_step_x                           block_size_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the destination tensor
- * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported data types: S32
- * @param[in]  padding_size_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  padding_size_step_x                         padding_size_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  padding_size_step_y                         padding_size_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  input_ptr                                   Pointer to the source tensor. Supported
+ *                                                         data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                              Stride of the source tensor in X
+ *                                                         dimension (in bytes)
+ * @param[in]  input_step_x                                input_stride_x * number of elements along
+ *                                                         X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                              Stride of the source tensor in Y
+ *                                                         dimension (in bytes)
+ * @param[in]  input_step_y                                input_stride_y * number of elements along
+ *                                                         Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                              Stride of the source tensor in Z
+ *                                                         dimension (in bytes)
+ * @param[in]  input_step_z                                input_stride_z * number of elements along
+ *                                                         Z processed per workitem(in  bytes)
+ * @param[in]  input_stride_w                              Stride of the destination tensor in W
+ *                                                         dimension (in bytes)
+ * @param[in]  input_step_w                                input_stride_w * number of elements along
+ *                                                         W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the
+ *                                                         source tensor
+ * @param[out] output_ptr                                  Pointer to the destination tensor.
+ *                                                         Supported data types: same as @p
+ *                                                         input_ptr
+ * @param[in]  output_stride_x                             Stride of the destination tensor in X
+ *                                                         dimension (in bytes)
+ * @param[in]  output_step_x                               output_stride_x * number of elements
+ *                                                         along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                             Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in]  output_step_y                               output_stride_y * number of elements
+ *                                                         along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                             Stride of the destination tensor in Z
+ *                                                         dimension (in bytes)
+ * @param[in]  output_step_z                               output_stride_z * number of elements
+ *                                                         along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                             Stride of the destination tensor in W
+ *                                                         dimension (in bytes)
+ * @param[in]  output_step_w                               output_stride_w * number of elements
+ *                                                         along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the
+ *                                                         destination tensor
+ * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported
+ *                                                         data types: S32
+ * @param[in]  block_size_stride_x                         Stride of the source tensor in X
+ *                                                         dimension (in bytes)
+ * @param[in]  block_size_step_x                           block_size_stride_x * number of elements
+ *                                                         along X processed per workitem(in  bytes)
+ * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the
+ *                                                         destination tensor
+ * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported
+ *                                                         data types: S32
+ * @param[in]  padding_size_stride_x                       Stride of the source tensor in X
+ *                                                         dimension (in bytes)
+ * @param[in]  padding_size_step_x                         padding_size_stride_x * number of
+ *                                                         elements along X processed per workitem
+ *                                                         (in bytes)
+ * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y
+ *                                                         dimension (in bytes)
+ * @param[in]  padding_size_step_y                         padding_size_stride_y * number of
+ *                                                         elements along Y processed per workitem
+ *                                                         (in  bytes)
+ * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the
+ *                                                         destination tensor
  */
-__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
-                                     TENSOR4D_DECLARATION(output),
+__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
                                      VECTOR_DECLARATION(block_size),
                                      IMAGE_DECLARATION(padding_size))
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
 
-    int block_size_x = *((__global int *)(block_size_ptr));
-    int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
-    int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
-    int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
+  int block_size_x = *((__global int *)(block_size_ptr));
+  int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+  int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
+  int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
 
-    int in_index[4] = {0, };
-    in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
-    in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
-    in_index[2] = get_global_id(2) % DEPTH_OUT;
-    in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
+  int in_index[4] = {
+      0,
+  };
+  in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+  in_index[1] = get_global_id(1) * block_size_y + shift_y -
+                *((__global int *)(padding_size_ptr + padding_size_stride_y));
+  in_index[2] = get_global_id(2) % DEPTH_OUT;
+  in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
 
-    if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
-    {
-        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
-    }
-    else
-    {
-        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
-    }
+  if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
+  {
+    *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+  }
+  else
+  {
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+        &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+  }
 }
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) &&
+       // defined(WIDTH_IN) && defined(ZERO_VALUE)
 
-#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \
+    defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
 /** Perform space to batch with input of 4D and NHWC format
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
- * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
- * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
- * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size.
+ *            e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size.
+ *            e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size.
+ *            e.g. -DWIDTH_IN=16
  * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
  *
- * @param[in]  input_ptr                                   Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                              Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                              Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                              Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_stride_w                              Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  input_step_w                                input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the source tensor
- * @param[out] output_ptr                                  Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                             Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                               output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
- * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported data types: S32
- * @param[in]  block_size_stride_x                         Stride of the source tensor in X dimension (in bytes)
- * @param[in]  block_size_step_x                           block_size_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the destination tensor
- * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported data types: S32
- * @param[in]  padding_size_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  padding_size_step_x                         padding_size_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  padding_size_step_y                         padding_size_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  input_ptr                                   Pointer to the source tensor. Supported
+ *                                                         data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                              Stride of the source tensor in X
+ *                                                         dimension (in bytes)
+ * @param[in]  input_step_x                                input_stride_x * number of elements along
+ *                                                         X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                              Stride of the source tensor in Y
+ *                                                         dimension (in bytes)
+ * @param[in]  input_step_y                                input_stride_y * number of elements along
+ *                                                         Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                              Stride of the source tensor in Z
+ *                                                         dimension (in bytes)
+ * @param[in]  input_step_z                                input_stride_z * number of elements along
+ *                                                         Z processed per workitem(in  bytes)
+ * @param[in]  input_stride_w                              Stride of the destination tensor in W
+ *                                                         dimension (in bytes)
+ * @param[in]  input_step_w                                input_stride_w * number of elements along
+ *                                                         W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the
+ *                                                         source tensor
+ * @param[out] output_ptr                                  Pointer to the destination tensor.
+ *                                                         Supported data types: same as @p
+ *                                                         input_ptr
+ * @param[in]  output_stride_x                             Stride of the destination tensor in X
+ *                                                         dimension (in bytes)
+ * @param[in]  output_step_x                               output_stride_x * number of elements
+ *                                                         along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                             Stride of the destination tensor in Y
+ *                                                         dimension (in bytes)
+ * @param[in]  output_step_y                               output_stride_y * number of elements
+ *                                                         along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                             Stride of the destination tensor in Z
+ *                                                         dimension (in bytes)
+ * @param[in]  output_step_z                               output_stride_z * number of elements
+ *                                                         along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                             Stride of the destination tensor in W
+ *                                                         dimension (in bytes)
+ * @param[in]  output_step_w                               output_stride_w * number of elements
+ *                                                         along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the
+ *                                                         destination tensor
+ * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported
+ *                                                         data types: S32
+ * @param[in]  block_size_stride_x                         Stride of the source tensor in X
+ *                                                         dimension (in bytes)
+ * @param[in]  block_size_step_x                           block_size_stride_x * number of elements
+ *                                                         along X processed per workitem(in  bytes)
+ * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the
+ *                                                         destination tensor
+ * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported
+ *                                                         data types: S32
+ * @param[in]  padding_size_stride_x                       Stride of the source tensor in X
+ *                                                         dimension (in bytes)
+ * @param[in]  padding_size_step_x                         padding_size_stride_x * number of
+ *                                                         elements along X processed per workitem
+ *                                                         (in  bytes)
+ * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y
+ *                                                         dimension (in bytes)
+ * @param[in]  padding_size_step_y                         padding_size_stride_y * number of
+ *                                                         elements along Y processed per workitem
+ *                                                         (in bytes)
+ * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the
+ *                                                         destination tensor
  */
-__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
-                                     TENSOR4D_DECLARATION(output),
+__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
                                      VECTOR_DECLARATION(block_size),
                                      IMAGE_DECLARATION(padding_size))
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
 
-    int block_size_x = *((__global int *)(block_size_ptr));
-    int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
-    int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
-    int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
+  int block_size_x = *((__global int *)(block_size_ptr));
+  int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+  int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
+  int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
 
-    int in_index[4] = {0, };
-    in_index[0] = get_global_id(0) * VEC_SIZE;
-    in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
-    in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
-    in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
+  int in_index[4] = {
+      0,
+  };
+  in_index[0] = get_global_id(0) * VEC_SIZE;
+  in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+  in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y -
+                *((__global int *)(padding_size_ptr + padding_size_stride_y));
+  in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
 
-    if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
-    {
-        VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
-    }
-    else
-    {
-        VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
-                                 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                         0, (__global DATA_TYPE *)out.ptr);
-    }
+  if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
+  {
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
+  }
+  else
+  {
+    VSTORE(VEC_SIZE)
+    (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1],
+                                                                      in_index[2], in_index[3])),
+             VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+     0, (__global DATA_TYPE *)out.ptr);
+  }
 }
 
-#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) &&
+       // defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
index 20eeb38..a26e762 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
@@ -20,104 +20,142 @@
 /** Perform space to depth rearrangement of tensor
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
+ *            e.g. -DDEPTH_IN=16
+ * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
+ *            argument using -DZ_IN=size. e.g. -DZ_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ *            -DBLOCK_SIZE=1
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  */
-__kernel void space_to_depth_nchw(
-     TENSOR4D_DECLARATION(input),
-     TENSOR4D_DECLARATION(output))
- {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
 
-    int out_index[4]={0};
-    int in_index[4]={0};
+  int out_index[4] = {0};
+  int in_index[4] = {0};
 
-    in_index[0] = get_global_id(0);//W
-    in_index[1] = get_global_id(1);//H
-    in_index[2] = get_global_id(2) % Z_IN;//C
-    in_index[3] = get_global_id(2) / Z_IN;//B
+  in_index[0] = get_global_id(0);        // W
+  in_index[1] = get_global_id(1);        // H
+  in_index[2] = get_global_id(2) % Z_IN; // C
+  in_index[3] = get_global_id(2) / Z_IN; // B
 
-    out_index[0] = in_index[0]/BLOCK_SIZE;
-    out_index[1] = in_index[1]/BLOCK_SIZE;
-    out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
-    out_index[3] = in_index[3];
+  out_index[0] = in_index[0] / BLOCK_SIZE;
+  out_index[1] = in_index[1] / BLOCK_SIZE;
+  out_index[2] =
+      in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
+  out_index[3] = in_index[3];
 
-    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) =                                                                                                 *((__global DATA_TYPE *)in.ptr);
- }
+  *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
+                                          out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
 #endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
 
 #if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
 /** Perform space to depth rearrangement of tensor
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
+ *            e.g. -DDEPTH_IN=16
+ * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
+ *            argument using -DZ_IN=size. e.g. -DZ_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ *            -DBLOCK_SIZE=1
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data
+ *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                  image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data
+ *                                                  types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
+ *                                                  (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
+ *                                                  bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                  destination image
  */
-__kernel void space_to_depth_nhwc(
-     TENSOR4D_DECLARATION(input),
-     TENSOR4D_DECLARATION(output))
- {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
 
-    int out_index[4]={0};
-    int in_index[4]={0};
+  int out_index[4] = {0};
+  int in_index[4] = {0};
 
-    in_index[0] = get_global_id(0);//C
-    in_index[1] = get_global_id(1);//W
-    in_index[2] = get_global_id(2) % Z_IN;//H
-    in_index[3] = get_global_id(2) / Z_IN;//B
+  in_index[0] = get_global_id(0);        // C
+  in_index[1] = get_global_id(1);        // W
+  in_index[2] = get_global_id(2) % Z_IN; // H
+  in_index[3] = get_global_id(2) / Z_IN; // B
 
-    out_index[0] = in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN;
-    out_index[1] = in_index[1]/BLOCK_SIZE;
-    out_index[2] = in_index[2]/BLOCK_SIZE;
-    out_index[3] = in_index[3];
+  out_index[0] =
+      in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN;
+  out_index[1] = in_index[1] / BLOCK_SIZE;
+  out_index[2] = in_index[2] / BLOCK_SIZE;
+  out_index[3] = in_index[3];
 
-    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) =                                                                                                 *((__global DATA_TYPE *)in.ptr);
- }
+  *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
+                                          out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
 #endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
index 3e1a5c9..0e1e246 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
@@ -24,52 +24,74 @@
 /** Returns true value of squared_difference of two tensors.
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
  * @note Can only take floating point data types.
  *
- * @param[in]  input1_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[in]  input2_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input2_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input2_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[out] output_ptr                            Pointer to the destination image. Supported data types: F16/F32
- * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input1_ptr                            Pointer to the source image. Supported data
+ *                                                   types: F16/F32
+ * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in
+ *                                                   bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                   image
+ * @param[in]  input2_ptr                            Pointer to the source image. Supported data
+ *                                                   types: F16/F32
+ * @param[in]  input2_stride_x                       Stride of the source image in X dimension (in
+ *                                                   bytes)
+ * @param[in]  input2_step_x                         input2_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input2_stride_y                       Stride of the source image in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source
+ *                                                   image
+ * @param[out] output_ptr                            Pointer to the destination image. Supported
+ *                                                   data types: F16/F32
+ * @param[in]  output_stride_x                       Stride of the destination image in X dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination image in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z
+ *                                                   processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ *                                                   destination image
  */
-__kernel void squared_difference(
-    TENSOR3D_DECLARATION(input1),
-    TENSOR3D_DECLARATION(input2),
-    TENSOR3D_DECLARATION(output))
+__kernel void squared_difference(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
+                                 TENSOR3D_DECLARATION(output))
 {
-    Tensor3D input1  = CONVERT_TO_TENSOR3D_STRUCT(input1);
-    Tensor3D input2  = CONVERT_TO_TENSOR3D_STRUCT(input2);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+  Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+  Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr);
+  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+  diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) -
+         VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr);
 
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    sq_diff = diff * diff;
+  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+  sq_diff = diff * diff;
 
-    VSTORE(VEC_SIZE)
-    (sq_diff, 0, (__global DATA_TYPE *)output.ptr);
+  VSTORE(VEC_SIZE)
+  (sq_diff, 0, (__global DATA_TYPE *)output.ptr);
 }
 #endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
index d97f23a..50472e4 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
@@ -17,10 +17,8 @@
 
 #include "helpers.h"
 
-__kernel void topkv2_init(VECTOR_DECLARATION(input),
-    __global float* in_key_buf,
-    __global int* in_ind_buf,
-    const int n)
+__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf,
+                          __global int *in_ind_buf, const int n)
 {
   int gid = get_global_id(0);
   int lws = get_local_size(0);
@@ -30,53 +28,54 @@ __kernel void topkv2_init(VECTOR_DECLARATION(input),
 
   Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
 
-  for(int i = 0; i < iter; ++i)
+  for (int i = 0; i < iter; ++i)
   {
     int idx = i * gws + gid;
-    in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x);
+    in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x);
     in_ind_buf[idx] = idx;
   }
 }
 
-__kernel void topkv2_find_first_negative(
-    __global float *out_key_buf,
-    __global int *first_negative_idx,
-    int n)
+__kernel void topkv2_find_first_negative(__global float *out_key_buf,
+                                         __global int *first_negative_idx, int n)
 {
   int gid = get_global_id(0);
 
-  if( gid == n - 1 )
+  if (gid == n - 1)
   {
     // if the last item is positive, the first negative index is n.
-    if( out_key_buf[gid] > 0.f )
+    if (out_key_buf[gid] > 0.f)
       *first_negative_idx = n;
-  } else if ( gid == 0 ) {
+  }
+  else if (gid == 0)
+  {
     // if the first item is negative, set it 0.
-    if( out_key_buf[gid] < 0.f )
+    if (out_key_buf[gid] < 0.f)
       *first_negative_idx = 0;
-  } else {
+  }
+  else
+  {
     // if its left is positive and it is negative, then it is the first negative item.
-    if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f )
+    if (out_key_buf[gid - 1] > 0.f && out_key_buf[gid] < 0.f)
       *first_negative_idx = gid;
   }
 }
 
-__kernel void topkv2_reorder_negatives(
-    __global float* in_key_buf,
-    __global float* out_key_buf,
-    __global float* in_ind_buf,
-    __global float* out_ind_buf,
-    __global int* first_negative_idx,
-    int n)
+__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf,
+                                       __global float *in_ind_buf, __global float *out_ind_buf,
+                                       __global int *first_negative_idx, int n)
 {
   int gid = get_global_id(0);
 
   int num_negs = n - *first_negative_idx;
   int in_idx;
 
-  if( gid < num_negs ) {
+  if (gid < num_negs)
+  {
     in_idx = n - 1 - gid;
-  } else {
+  }
+  else
+  {
     in_idx = gid - num_negs;
   }
 
@@ -84,12 +83,8 @@ __kernel void topkv2_reorder_negatives(
   out_ind_buf[gid] = in_ind_buf[in_idx];
 }
 
-__kernel void topkv2_store(
-    VECTOR_DECLARATION(values),
-    VECTOR_DECLARATION(indices),
-    __global float *out_key_buf,
-    __global int *out_ind_buf,
-    int n)
+__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices),
+                           __global float *out_key_buf, __global int *out_ind_buf, int n)
 {
   int gid = get_global_id(0);
 
@@ -98,6 +93,6 @@ __kernel void topkv2_store(
 
   int idx = n - 1 - gid;
 
-  *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
-  *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
+  *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
+  *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
 }
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
index 0292fab..9594daf 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -17,114 +17,113 @@
 
 #include "helpers.h"
 
-__global inline float* get_vec_elem(Vector* vec, int idx)
+__global inline float *get_vec_elem(Vector *vec, int idx)
 {
-  return (__global float*)(vec->ptr + idx * vec->stride_x);
+  return (__global float *)(vec->ptr + idx * vec->stride_x);
 }
 
-__global inline int* get_vec_elem_int(Vector* vec, int idx)
+__global inline int *get_vec_elem_int(Vector *vec, int idx)
 {
-  return (__global int*)(vec->ptr + idx * vec->stride_x);
+  return (__global int *)(vec->ptr + idx * vec->stride_x);
 }
 
 // A utility function to swap two elements
 void swap(__global float *a, __global float *b)
 {
-    float t = *a;
-    *a = *b;
-    *b = t;
+  float t = *a;
+  *a = *b;
+  *b = t;
 }
 
 void swap_idx(__global int *a, __global int *b)
 {
-    int t = *a;
-    *a = *b;
-    *b = t;
+  int t = *a;
+  *a = *b;
+  *b = t;
 }
 
 /* This function is same in both iterative and recursive*/
-int partition (Vector* arr, __global int* indices, int l, int h)
+int partition(Vector *arr, __global int *indices, int l, int h)
 {
-    float x = *get_vec_elem(arr, h);
-    int i = (l - 1);
+  float x = *get_vec_elem(arr, h);
+  int i = (l - 1);
 
-    for (int j = l; j <= h- 1; j++)
+  for (int j = l; j <= h - 1; j++)
+  {
+    if (*get_vec_elem(arr, j) >= x)
     {
-        if (*get_vec_elem(arr, j) >= x)
-        {
-            i++;
-            swap (get_vec_elem(arr,i), get_vec_elem(arr,j));
-            swap_idx(&indices[i], &indices[j]);
-        }
+      i++;
+      swap(get_vec_elem(arr, i), get_vec_elem(arr, j));
+      swap_idx(&indices[i], &indices[j]);
     }
-    swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
-    swap_idx(&indices[i + 1], &indices[h]);
-    return (i + 1);
+  }
+  swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
+  swap_idx(&indices[i + 1], &indices[h]);
+  return (i + 1);
 }
 
 /* A[] --> Array to be sorted,
    l  --> Starting index,
    h  --> Ending index */
-void quickSortIterative (Vector* arr, __global int* indices, 
-    __global int *stack, int l, int h)
+void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h)
 {
-    // Create an auxiliary stack
+  // Create an auxiliary stack
+
+  // initialize top of stack
+  int top = -1;
 
-    // initialize top of stack
-    int top = -1;
+  // push initial values of l and h to stack
+  stack[++top] = l;
+  stack[++top] = h;
+
+  // Keep popping from stack while is not empty
+  while (top >= 0)
+  {
+    // Pop h and l
+    h = stack[top--];
+    l = stack[top--];
 
-    // push initial values of l and h to stack
-    stack[ ++top ] = l;
-    stack[ ++top ] = h;
+    // Set pivot element at its correct position
+    // in sorted array
+    int p = partition(arr, indices, l, h);
 
-    // Keep popping from stack while is not empty
-    while ( top >= 0 )
+    // If there are elements on left side of pivot,
+    // then push left side to stack
+    if (p - 1 > l)
     {
-        // Pop h and l
-        h = stack[ top-- ];
-        l = stack[ top-- ];
-
-        // Set pivot element at its correct position
-        // in sorted array
-        int p = partition( arr, indices, l, h );
-
-        // If there are elements on left side of pivot,
-        // then push left side to stack
-        if ( p-1 > l )
-        {
-            stack[ ++top ] = l;
-            stack[ ++top ] = p - 1;
-        }
-
-        // If there are elements on right side of pivot,
-        // then push right side to stack
-        if ( p+1 < h )
-        {
-            stack[ ++top ] = p + 1;
-            stack[ ++top ] = h;
-        }
+      stack[++top] = l;
+      stack[++top] = p - 1;
     }
+
+    // If there are elements on right side of pivot,
+    // then push right side to stack
+    if (p + 1 < h)
+    {
+      stack[++top] = p + 1;
+      stack[++top] = h;
+    }
+  }
 }
 
-__kernel void topkv2_quicksort(VECTOR_DECLARATION(input),
-    VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices),
-    __global int* indices, __global int* temp_stack, int k, int n)
+__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values),
+                               VECTOR_DECLARATION(topk_indices), __global int *indices,
+                               __global int *temp_stack, int k, int n)
 {
   Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
   Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
   Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
 
-  for( int i = 0; i < n; ++i )
+  for (int i = 0; i < n; ++i)
   {
     indices[i] = i;
   }
 
-  quickSortIterative(&input, indices, temp_stack, 0, n-1);
+  quickSortIterative(&input, indices, temp_stack, 0, n - 1);
 
   // extract k items.
-  for(int i = 0; i < k; ++i)
+  for (int i = 0; i < k; ++i)
   {
-    *get_vec_elem(&topk_values, i)  = *get_vec_elem(&input, i);
+    *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
     *get_vec_elem_int(&topk_indices, i) = indices[i];
   }
 }
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
index c2c2d89..f6830d2 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -29,11 +29,8 @@
 // http://www.codercorner.com/RadixSortRevisited.htm
 
 // compute the histogram for each radix and each virtual processor for the pass
-__kernel void radixsort_histogram(__global float* in_key_buf,
-      __global int* d_Histograms,
-      const int pass,
-      __local int* loc_histo,
-      const int n)
+__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
+                                  const int pass, __local int *loc_histo, const int n)
 {
   int it = get_local_id(0);  // i local number of the processor
   int ig = get_global_id(0); // global number = i + g I
@@ -41,45 +38,48 @@ __kernel void radixsort_histogram(__global float* in_key_buf,
   int gr = get_group_id(0); // g group number
 
   int groups = get_num_groups(0);
-  int items  = get_local_size(0);
+  int items = get_local_size(0);
 
   // set the local histograms to zero
-  for(int ir=0;ir<_RADIX;ir++){
+  for (int ir = 0; ir < _RADIX; ir++)
+  {
     loc_histo[ir * items + it] = 0;
   }
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // range of keys that are analyzed by the work item
-  int size= n/groups/items; // size of the sub-list
-  int start= ig * size; // beginning of the sub-list
+  int size = n / groups / items; // size of the sub-list
+  int start = ig * size;         // beginning of the sub-list
 
   unsigned int key;
-  int shortkey,k;
+  int shortkey, k;
 
   // compute the index
   // the computation depends on the transposition
-  for(int j = 0; j < size ; j++) {
+  for (int j = 0; j < size; j++)
+  {
 #ifdef TRANSPOSE
-    k= groups * items * j + ig;
+    k = groups * items * j + ig;
 #else
-    k=j+start;
+    k = j + start;
 #endif
 
-    key = *((__global unsigned int*)(in_key_buf + k));
+    key = *((__global unsigned int *)(in_key_buf + k));
 
     // extract the group of _BITS bits of the pass
     // the result is in the range 0.._RADIX-1
-    shortkey=(( key >> (pass * _BITS)) & (_RADIX-1));
+    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
 
     // increment the local histogram
-    loc_histo[shortkey *  items + it ]++;
+    loc_histo[shortkey * items + it]++;
   }
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // copy the local histogram to the global one
-  for(int ir=0;ir<_RADIX;ir++) {
+  for (int ir = 0; ir < _RADIX; ir++)
+  {
     d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
   }
 
@@ -88,98 +88,92 @@ __kernel void radixsort_histogram(__global float* in_key_buf,
 
 // initial transpose of the list for improving
 // coalescent memory access
-__kernel void transpose(const __global int* invect,
-      __global int* outvect,
-      const int nbcol,
-      const int nbrow,
-      const __global int* inperm,
-      __global int* outperm,
-      __local int* blockmat,
-      __local int* blockperm,
-      const int tilesize){
+__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
+                        const int nbrow, const __global int *inperm, __global int *outperm,
+                        __local int *blockmat, __local int *blockperm, const int tilesize)
+{
 
-  int i0 = get_global_id(0)*tilesize;  // first row index
-  int j = get_global_id(1);  // column index
+  int i0 = get_global_id(0) * tilesize; // first row index
+  int j = get_global_id(1);             // column index
 
-  int jloc = get_local_id(1);  // local column index
+  int jloc = get_local_id(1); // local column index
 
   // fill the cache
-  for(int iloc=0;iloc<tilesize;iloc++){
-    int k=(i0+iloc)*nbcol+j;  // position in the matrix
-    blockmat[iloc*tilesize+jloc]=invect[k];
+  for (int iloc = 0; iloc < tilesize; iloc++)
+  {
+    int k = (i0 + iloc) * nbcol + j; // position in the matrix
+    blockmat[iloc * tilesize + jloc] = invect[k];
 #ifdef PERMUT
-    blockperm[iloc*tilesize+jloc]=inperm[k];
+    blockperm[iloc * tilesize + jloc] = inperm[k];
 #endif
   }
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // first row index in the transpose
-  int j0=get_group_id(1)*tilesize;
+  int j0 = get_group_id(1) * tilesize;
 
   // put the cache at the good place
-  for(int iloc=0;iloc<tilesize;iloc++){
-    int kt=(j0+iloc)*nbrow+i0+jloc;  // position in the transpose
-    outvect[kt]=blockmat[jloc*tilesize+iloc];
+  for (int iloc = 0; iloc < tilesize; iloc++)
+  {
+    int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
+    outvect[kt] = blockmat[jloc * tilesize + iloc];
 #ifdef PERMUT
-      outperm[kt]=blockperm[jloc*tilesize+iloc];
+    outperm[kt] = blockperm[jloc * tilesize + iloc];
 #endif
   }
-
 }
 
 // each virtual processor reorders its data using the scanned histogram
-__kernel void radixsort_reorder(__global float* in_key,
-    __global float* out_key,
-    __global int* d_Histograms,
-    const int pass,
-    __global int* indices_in,
-    __global int* indices_out,
-    __local int* loc_histo,
-    const int n){
+__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
+                                __global int *d_Histograms, const int pass,
+                                __global int *indices_in, __global int *indices_out,
+                                __local int *loc_histo, const int n)
+{
 
   int it = get_local_id(0);
   int ig = get_global_id(0);
 
   int gr = get_group_id(0);
-  int groups=get_num_groups(0);
-  int items=get_local_size(0);
+  int groups = get_num_groups(0);
+  int items = get_local_size(0);
 
-  int start= ig *(n/groups/items);
-  int size= n/groups/items;
+  int start = ig * (n / groups / items);
+  int size = n / groups / items;
 
   // take the histogram in the cache
-  for(int ir=0;ir<_RADIX;ir++){
-    loc_histo[ir * items + it]=
-      d_Histograms[items * (ir * groups + gr) + it];
+  for (int ir = 0; ir < _RADIX; ir++)
+  {
+    loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
   }
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  int newpos,shortkey,k,newpost;
+  int newpos, shortkey, k, newpost;
   unsigned int key;
 
-  for(int j= 0; j< size;j++){
+  for (int j = 0; j < size; j++)
+  {
 #ifdef TRANSPOSE
-      k= groups * items * j + ig;
+    k = groups * items * j + ig;
 #else
-      k=j+start;
+    k = j + start;
 #endif
     float org_value = in_key[k];
-    key = *(__global unsigned int*)(in_key + k);
-    shortkey=((key >> (pass * _BITS)) & (_RADIX-1));
+    key = *(__global unsigned int *)(in_key + k);
+    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
 
-    newpos=loc_histo[shortkey * items + it];
+    newpos = loc_histo[shortkey * items + it];
 
 #ifdef TRANSPOSE
-    int ignew,jnew;
-    ignew= newpos/(n/groups/items);
-    jnew = newpos%(n/groups/items);
-    newpost = jnew * (groups*items) + ignew;
+    int ignew, jnew;
+    ignew = newpos / (n / groups / items);
+    jnew = newpos % (n / groups / items);
+    newpost = jnew * (groups * items) + ignew;
 #else
-    newpost=newpos;
+    newpost = newpos;
 #endif
 
-    //d_outKeys[newpost]= key;  // killing line !!!
+    // d_outKeys[newpost]= key;  // killing line !!!
     out_key[newpost] = org_value;
 
 #ifdef PERMUT
@@ -187,32 +181,35 @@ __kernel void radixsort_reorder(__global float* in_key,
 #endif
 
     newpos++;
-    loc_histo[shortkey * items + it]=newpos;
+    loc_histo[shortkey * items + it] = newpos;
   }
 }
 
 // perform a parallel prefix sum (a scan) on the local histograms
 // (see Blelloch 1990) each workitem worries about two memories
 // see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
-__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum)
+__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
+                                       __global int *globsum)
 {
   int it = get_local_id(0);
   int ig = get_global_id(0);
   int decale = 1;
-  int n=get_local_size(0) * 2 ;
-  int gr=get_group_id(0);
+  int n = get_local_size(0) * 2;
+  int gr = get_group_id(0);
 
   // load input into local memory
   // up sweep phase
-  temp[2*it] = histo[2*ig];
-  temp[2*it+1] = histo[2*ig+1];
+  temp[2 * it] = histo[2 * ig];
+  temp[2 * it + 1] = histo[2 * ig + 1];
 
   // parallel prefix sum (algorithm of Blelloch 1990)
-  for (int d = n>>1; d > 0; d >>= 1){
+  for (int d = n >> 1; d > 0; d >>= 1)
+  {
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (it < d){
-      int ai = decale*(2*it+1)-1;
-      int bi = decale*(2*it+2)-1;
+    if (it < d)
+    {
+      int ai = decale * (2 * it + 1) - 1;
+      int bi = decale * (2 * it + 2) - 1;
       temp[bi] += temp[ai];
     }
     decale *= 2;
@@ -221,51 +218,52 @@ __kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, _
   // store the last element in the global sum vector
   // (maybe used in the next step for constructing the global scan)
   // clear the last element
-  if (it == 0) {
-    globsum[gr]=temp[n-1];
+  if (it == 0)
+  {
+    globsum[gr] = temp[n - 1];
     temp[n - 1] = 0;
   }
 
   // down sweep phase
-  for (int d = 1; d < n; d *= 2){
+  for (int d = 1; d < n; d *= 2)
+  {
     decale >>= 1;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (it < d){
-      int ai = decale*(2*it+1)-1;
-      int bi = decale*(2*it+2)-1;
+    if (it < d)
+    {
+      int ai = decale * (2 * it + 1) - 1;
+      int bi = decale * (2 * it + 2) - 1;
 
       int t = temp[ai];
       temp[ai] = temp[bi];
       temp[bi] += t;
     }
-
   }
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // write results to device memory
 
-  histo[2*ig] = temp[2*it];
-  histo[2*ig+1] = temp[2*it+1];
+  histo[2 * ig] = temp[2 * it];
+  histo[2 * ig + 1] = temp[2 * it + 1];
 
   barrier(CLK_GLOBAL_MEM_FENCE);
-
 }
 
 // use the global sum for updating the local histograms
 // each work item updates two values
-__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum)
+__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
 {
   int ig = get_global_id(0);
-  int gr=get_group_id(0);
+  int gr = get_group_id(0);
 
   int s;
 
-  s=globsum[gr];
+  s = globsum[gr];
 
   // write results to device memory
-  histo[2*ig] += s;
-  histo[2*ig+1] += s;
+  histo[2 * ig] += s;
+  histo[2 * ig + 1] += s;
 
   barrier(CLK_GLOBAL_MEM_FENCE);
 }
diff --git a/scripts/command/format-check b/scripts/command/format-check
index cd9bfaa..8224870 100644
--- a/scripts/command/format-check
+++ b/scripts/command/format-check
@@ -68,7 +68,7 @@ check_cpp_files() {
     DIRECTORIES_NOT_TO_BE_TESTED=$1
 
     # Check c++ files
-    CPP_FILES_TO_CHECK=$(git ls-files '*.h' '*.cpp' '*.cc' ':!:include/NeuralNetworks.h')
+    CPP_FILES_TO_CHECK=$(git ls-files '*.h' '*.cpp' '*.cc' '*.cl' ':!:include/NeuralNetworks.h')
     ARR=($CPP_FILES_TO_CHECK)
     for s in ${DIRECTORIES_NOT_TO_BE_TESTED[@]}; do
         skip=${s#'.'/}/
-- 
2.7.4