From: 장지섭/On-Device Lab(SR)/Engineer/삼성전자 Date: Thu, 31 Jan 2019 01:36:48 +0000 (+0900) Subject: Remove unnecessary kernels from ARMComputeEX (#4325) X-Git-Tag: submit/tizen/20190325.013700~284 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=563be3bef9b56806071416cde197574124180a55;p=platform%2Fcore%2Fml%2Fnnfw.git Remove unnecessary kernels from ARMComputeEX (#4325) This commit removes unnecessary kernels from ARMComputeEX Signed-off-by: jiseob.jang --- diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h deleted file mode 100644 index 1387897..0000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ -#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform BATCH_TO_SPACE_ND operation */ -class CLBatchToSpaceNDKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLBatchToSpaceNDKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLBatchToSpaceNDKernel(const CLBatchToSpaceNDKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLBatchToSpaceNDKernel &operator=(const CLBatchToSpaceNDKernel &) = delete; - /** Allow instances of this class to be moved */ - CLBatchToSpaceNDKernel(CLBatchToSpaceNDKernel &&) = default; - /** Allow instances of this class to be moved */ - CLBatchToSpaceNDKernel &operator=(CLBatchToSpaceNDKernel &&) = default; - /** Default destructor */ - ~CLBatchToSpaceNDKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - */ - void configure(const ICLTensor *input, ICLTensor *output, const int32_t *block_size); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLLogicalNotKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLLogicalNotKernel.h deleted file mode 100644 index c199d01..0000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLLogicalNotKernel.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLLOGICALNOTKERNEL_H__ -#define __ARM_COMPUTE_CLLOGICALNOTKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to return truth values of two input tensors for LogicalNOT*/ -class CLLogicalNotKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLLogicalNotKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLLogicalNotKernel(const CLLogicalNotKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLLogicalNotKernel &operator=(const CLLogicalNotKernel &) = delete; - /** Allow instances of this class to be moved */ - CLLogicalNotKernel(CLLogicalNotKernel &&) = default; - /** Allow instances of this class to be moved */ - CLLogicalNotKernel &operator=(CLLogicalNotKernel &&) = default; - /** Initialize the kernel's input, output. - * - * @param[in] input Source tensor. - * @param[out] output Output tensor. - */ - void configure(const ICLTensor *input, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; -}; - -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLLOGICALNOTKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h deleted file mode 100644 index 181a622..0000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ -#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the normalization layer kernel. 
- */ -class CLNormalizationLayerExKernel : public ICLKernel -{ -public: - /** Constructor */ - CLNormalizationLayerExKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLNormalizationLayerExKernel(const CLNormalizationLayerExKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLNormalizationLayerExKernel &operator=(const CLNormalizationLayerExKernel &) = delete; - /** Default Move Constructor. */ - CLNormalizationLayerExKernel(CLNormalizationLayerExKernel &&) = default; - /** Default move assignment operator */ - CLNormalizationLayerExKernel &operator=(CLNormalizationLayerExKernel &&) = default; - /** Set the input and output tensors. - * - * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: - * F16/F32. - * @param[out] output Destination tensor. Output will have the same number of dimensions as - * input. Data types supported: same as @p input. - * @param[in] norm_info Normalization layer information like the normalization type, - * normalization size and other parameters. - */ - void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLNormalizationLayerKernel - * - * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: - * F16/F32. - * @param[in] output Destination tensor. Output will have the same number of dimensions as - * input. Data types supported: same as @p input. - * @param[in] norm_info Normalization layer information like the normalization type, normalization - * size and other parameters. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - NormalizationLayerInfo norm_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - BorderSize _border_size; - bool _is_in_map; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h deleted file mode 100644 index d579f5d..0000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLPixelWiseDivisionKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLPixelWiseDivisionKernel class - */ - -#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ -#define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Interface for the pixelwise division kernel. - */ -class CLPixelWiseDivisionKernel : public ICLKernel -{ -public: - /** - * @brief Construct a CLPixelWiseDivisionKernel object - */ - CLPixelWiseDivisionKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - */ - CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - */ - CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete; - - /** - * @brief Construct a CLPixelWiseDivisionKernel object by using move constructor - * @param[in] CLPixelWiseDivisionKernel object to move - */ - CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default; - - /** - * @brief Allow instances of this class to be moved - * @param[in] CLPixelWiseDivisionKernel object to move - */ - CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default; - - /** - * @brief Initialise the kernel's input, output and border mode. - * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32. - * @param[in] input2 An input tensor. Data types supported: same as @p input1. - * @param[out] output The output tensor, Data types supported: same as @p input1. Note: - * U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after division. - * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest - * even. - * @return N/A - */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLPixelWiseDivisionKernel - * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32. - * @param[in] input2 An input tensor info. Data types supported: same as @p input1. - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after division. - * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. - * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. 
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of - * the window returned by window()). - * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - - /** - * @brief The size of the border for that kernel - * @return The width in number of elements of the border. - */ - BorderSize border_size() const override; - -private: - const ICLTensor *_input1; - const ICLTensor *_input2; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h deleted file mode 100644 index 6368c38..0000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLStridedSliceExKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLStridedSliceExKernel class - */ - -#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ -#define __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** -* @brief Class to define an interface for the kernel to extract a strided slice of a tensor -*/ -class CLStridedSliceExKernel : public ICLKernel -{ -public: - /** - * @brief Construct a CLStridedSliceExKernel object - * */ - CLStridedSliceExKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * */ - CLStridedSliceExKernel(const CLStridedSliceExKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * */ - CLStridedSliceExKernel &operator=(const CLStridedSliceExKernel &) = delete; - - /** - * @brief Construct a CLStridedSliceExKernel object by using default move constructor - * @param[in] CLStridedSliceExKernel object to move - * */ - CLStridedSliceExKernel(CLStridedSliceExKernel &&) = default; - - /** - * @brief Move assignment operator - * @param[in] CLStridedSliceExKernel object to move - * */ - CLStridedSliceExKernel &operator=(CLStridedSliceExKernel &&) = default; - - /** - * @brief Destruct this object - * */ - ~CLStridedSliceExKernel() = default; - - /** - * @brief Set the input and output of the kernel - * @param[in] input Source tensor. Data type supported: - * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output Destination tensor. Data type supported: Same as @p input - * @param[in] beginData The begin tensor. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] endData The end tensor. 
Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] strideData The stride tensor. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] beginMask Mask for begin - * @param[in] endMask Mask for end - * @param[in] shrinkAxisMask Mask for shrink axis. - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLStridedSliceExKernel - * @param[in] input The input tensor info. Data types supported: - * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * @param[in] begin The begin tensor info. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] end The end tensor info. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] stride The stride tensor info. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] beginMask Mask for begin - * @param[in] endMask Mask for end - * @param[in] shrinkAxisMask Mask for shrink axis. - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *begin, const ITensorInfo *end, - const ITensorInfo *stride, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. - * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. - * @param[in] window Region on which to execute the kernel. (Must be a valid region of - * the window returned by window()). 
- * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /** Source tensor */ - ICLTensor *_output; /** Destination tensor */ - ICLTensor *_beginData; /** Start indices of input tensor */ - ICLTensor *_endData; /** Stop indices of input tensor */ - ICLTensor *_stridesData; /** Strides tensor */ - int32_t _beginMask; /** Begin mask */ - int32_t _endMask; /** End mask */ - int32_t _shrinkAxisMask; /** Shrink axis mask */ -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 2ef886e..c5ab86b 100644 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -42,7 +42,6 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"activation_layer_ex", "activation_layer_ex.cl"}, {"arg_op", "arg_operation.cl"}, {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, - {"batch_to_space_nd", "batch_to_space_nd.cl"}, {"binary_logical_op", "binary_logical_op.cl"}, {"cast", "cast.cl"}, {"cast_qasymm_in", "cast.cl"}, @@ -57,19 +56,15 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"gather_1d", "gather.cl"}, {"gather_1d_out", "gather.cl"}, {"hashtable_lookup", "hashtable_lookup.cl"}, - {"logical_not", "logical_not.cl"}, {"neg_tensor", "neg_tensor.cl"}, {"pad", "pad.cl"}, {"permute_generic", "permute_ex.cl"}, {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, - {"pixelwise_div_float", "pixelwise_div_float.cl"}, - {"pixelwise_div_int", "pixelwise_div_int.cl"}, {"prelu", "prelu.cl"}, {"prelu_qasymm8", "prelu_quantized.cl"}, {"reduce_min_max", "reduce_operation.cl"}, {"reduce_sum_mean", "reduce_operation.cl"}, {"squared_difference", "squared_difference.cl"}, - {"strided_slice_ex", "strided_slice_ex.cl"}, {"topkv2_init", "topkv2.cl"}, {"topkv2_find_first_negative", "topkv2.cl"}, {"topkv2_reorder_negatives", "topkv2.cl"}, @@ -95,10 +90,6 @@ const std::map CLKernelLibraryEx::_program_source_map #include "./cl_kernels/arg_operation.clembed" }, { - "batch_to_space_nd.cl", -#include "./cl_kernels/batch_to_space_nd.clembed" - }, - { "cast.cl", #include "./cl_kernels/cast.clembed" }, @@ -151,18 +142,6 @@ const std::map CLKernelLibraryEx::_program_source_map #include "./cl_kernels/pad.clembed" }, { - "logical_not.cl", -#include "./cl_kernels/logical_not.clembed" - }, - { - "pixelwise_div_float.cl", -#include "./cl_kernels/pixelwise_div_float.clembed" - }, - { - "pixelwise_div_int.cl", -#include "./cl_kernels/pixelwise_div_int.clembed" - }, - { "prelu.cl", #include "./cl_kernels/prelu.clembed" }, @@ -187,10 +166,6 @@ const std::map CLKernelLibraryEx::_program_source_map #include "./cl_kernels/squared_difference.clembed" }, { - "strided_slice_ex.cl", -#include "./cl_kernels/strided_slice_ex.clembed" - }, - { "topkv2.cl", #include "./cl_kernels/topkv2.clembed" }, diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl deleted file mode 100644 index ad6a48a..0000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) -/** Perform batch to space rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_nd( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int out_index[4]={0}; - int in_index[4]={0}; - - out_index[0] = get_global_id(0);//W - out_index[1] = get_global_id(1);//H - out_index[2] = get_global_id(2) % DEPTH_OUT;//C - out_index[3] = get_global_id(2) / DEPTH_OUT;//N - - in_index[0] = out_index[0]/BLOCK_SIZE1; - in_index[1] = out_index[1]/BLOCK_SIZE0; - in_index[2] = out_index[2]; - in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])); - } -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/logical_not.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/logical_not.cl deleted file mode 100644 index 5137988..0000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/logical_not.cl +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -/** returns truth value of the input tensor for LOGICAL NOT. - * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=bool - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Can only take boolean data types. - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - */ -__kernel void logical_not( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_data = - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); - - VSTORE(VEC_SIZE) (CONVERT(!in_data, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)output.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl deleted file mode 100644 index aa05121..0000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifdef SATURATE -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) -#else /* SATURATE */ -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) -#endif /* SATURATE */ -#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) - -/** Performs a pixelwise division with float scale of either integer or float inputs. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short - * @attention The data type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. 
- * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. - * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided. - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Float scaling factor. 
Supported data types: F32 - */ -__kernel void pixelwise_div_float( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out), - const float scale) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - - // Perform division -#ifdef DATA_TYPE_FLOAT - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - res = CONVERT(in1_data / in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); -#else /* DATA_TYPE_FLOAT */ - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data / in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND); -#endif /* DATA_TYPE_FLOAT */ - - // Store result - vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl deleted file mode 100644 index fdfb780..0000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(SATURATE) -#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x)) -#else // SATURATE -#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x)) -#endif // SATURATE -#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size) - -#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size) - -/** Performs a pixelwise division with integer scale of integer inputs. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short - * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. - * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. - * - * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8/S16 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Integer scaling factor. Supported data types: S32 - */ -__kernel void pixelwise_div_int( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out), - const uint scale) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - - // Perform division and store result - vstore16(DIV_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl deleted file mode 100644 index b39c55b9..0000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) -/** Extracts a strided slice up to 4-dimensions - * - * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32 - * @param[in] strides The stride of Y dimension of input tensor to be sliced. 
Supported data types: S32 - */ -__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int4 starts, - const int4 strides) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int4 indices_in = - { - starts.x + (strides.x * get_global_id(0)), - starts.y + (strides.y * get_global_id(1)), - starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)), - starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)), - }; - *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); -} -#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp deleted file mode 100644 index 5a19106..0000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t *block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(block_size[0] >= 1 && block_size[1] >= 1), - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2), - "Input Depth should be equal to Output Depth"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->dimension(3) * block_size[0] * block_size[1] != input->dimension(3), - "Input batch should be equal to (output batch * block size[0] *block size[1])"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) % block_size[1]) || - (output->dimension(1) % block_size[0]), - "Output height and width should be divisible by block size[0] " - "and block_size[1] respectively"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != input->dimension(0) * block_size[1]) || - (output->dimension(1) != input->dimension(1) * block_size[0]), - "Output height and width should be equal to " - "input_height*blocksize[0] and input_width*blocksize[1] " - "respectively"); - - return Status{}; -} - -} // namespace - -CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {} - -void 
CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t *block_size) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); - - _input = input; - _output = output; - - // Set kernel build options - std::set build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0])); - build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1])); - build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3))); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - - // Create kernel - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_out); - add_4D_tensor_argument(idx, _output, slice_in); - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLLogicalNotKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLLogicalNotKernel.cpp deleted file mode 100644 index 73d96ed..0000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLLogicalNotKernel.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLLogicalNotKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) -{ - const TensorShape &out_shape = input->tensor_shape(); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); - - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLLogicalNotKernel::CLLogicalNotKernel() : _input(nullptr), _output(nullptr) {} - -void CLLogicalNotKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); - - _input = input; - _output = output; - - // Create kernel - std::string kernel_name = "logical_not"; - std::set build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - _kernel = - static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - const ValidRegion &valid_region = input->info()->valid_region(); - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLLogicalNotKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input->info()->tensor_shape(); - - bool can_collapse = true; - if (in_shape1.total_size() > 1) - { - can_collapse = (in_shape1.num_dimensions() > Window::DimZ); - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp deleted file mode 100644 index e6db3d9..0000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - NormalizationLayerInfo /*norm_info*/) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - - // Checks performed when output is configured - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, - NormalizationLayerInfo norm_info) -{ - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, *input->clone()); - - const unsigned int norm_size = norm_info.norm_size(); - bool is_in_map = norm_info.is_in_map(); - - const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0; - const BorderSize border_size = BorderSize(0, border_width); - - const unsigned int num_elems_processed_per_iteration = 4; - const unsigned int num_elems_read_per_iteration = - is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2)) - : num_elems_processed_per_iteration; - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside - // the kernel, avoiding padding - AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->valid_region()); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLNormalizationLayerExKernel::CLNormalizationLayerExKernel() - : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false) -{ -} - -BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; } - -void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output, - NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); - - _input = input; - _output = output; - - const unsigned int num_elems_processed_per_iteration = 4; - const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.add_option( - ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff()))); - build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta()))); - build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa()))); - build_opts.add_option( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size()))); - build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); - build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); - - // Create kernel - std::string kernel_name = - _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map"; - _kernel = static_cast( - CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = "normalization_layer_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string( - static_cast::type>(norm_info.type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(norm_info.norm_size()); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); - - return Status{}; -} - -void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const int collapsed_dimension = _is_in_map ? 
Window::DimZ : 4; - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension); - Window slice = window_collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } while (window_collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp deleted file mode 100644 index b985aa7..0000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_UNUSED(overflow_policy); - ARM_COMPUTE_UNUSED(rounding_policy); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); - - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->data_type() == DataType::U8 && - (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), - "Output can only be U8 if both inputs are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, - ITensorInfo *output) -{ - const std::pair broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output, out_shape); - - if 
(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) - { - set_format_if_unknown(*output, Format::S16); - } - else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) - { - set_format_if_unknown(*output, Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2); - - AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output, float scale, - ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), - scale, overflow_policy, rounding_policy)); - - // Configure kernel window - auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - _input1 = input1; - _input2 = input2; - _output = output; - - int scale_int = -1; - // Extract sign, exponent and mantissa - int exponent = 0; - float normalized_mantissa = std::frexp(scale, &exponent); - // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 - // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= - // 14 - // Moreover, it will be negative as we deal with 1/2^n - if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) - { - // Store the positive exponent. We know that we compute 1/2^n - // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 - scale_int = std::abs(exponent - 1); - } - - std::string data_type; - std::string compute_type; - // Check if it has float inputs and output - if (is_data_type_float(input1->info()->data_type()) || - is_data_type_float(input2->info()->data_type())) - { - scale_int = -1; - compute_type = (input1->info()->data_type() == DataType::F32 || - input2->info()->data_type() == DataType::F32) - ? "float" - : "half"; - data_type = "DATA_TYPE_FLOAT"; - } - else - { - if (input1->info()->data_type() == DataType::S16 || - input2->info()->data_type() == DataType::S16) - { - compute_type = "int"; - } - else - { - compute_type = "ushort"; - } - data_type = "DATA_TYPE_INT"; - } - - // Construct kernel name - std::string kernel_name = "pixelwise_div"; - kernel_name += (scale_int >= 0) ? "_int" : "_float"; - - // Set kernel build options - std::set build_opts; - build_opts.emplace( - (overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) - ? 
"-DWRAP" - : "-DSATURATE"); - build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" - : "-DROUND=_rte"); - build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_RES=" + compute_type); - build_opts.emplace("-D" + data_type); - - // Create kernel - _kernel = - static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Set scale argument - unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters - - if (scale_int >= 0) - { - _kernel.setArg(idx++, scale_int); - } - else - { - _kernel.setArg(idx++, scale); - } - - ICLKernel::configure_internal(win_config.second); -} - -Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), - input2->clone().get(), - output->clone().get()) - .first); - - return Status{}; -} - -void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLPixelWiseDivisionKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp deleted file mode 100644 index 9e6c5af..0000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" - -using namespace arm_compute; - -CLStridedSliceExKernel::CLStridedSliceExKernel() - : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), - _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0) -{ -} - -Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *begin, const ITensorInfo *end, - const ITensorInfo *strides, int32_t /*beginMask*/, - int32_t /*endMask*/, int32_t /*shrinkAxisMask*/) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(), - strides->tensor_shape()); - - return Status{}; -} - -// Return the index for the first element along that axis. This index will be a -// positive integer between [0, axisSize - 1] that can be used to index -// directly into the data. -inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride, - const TensorShape &inputShape, int32_t axis) -{ - // Begin with the specified index - int32_t start = begin; - - // beginMask override - if (beginMask & 1 << axis) - { - if (stride > 0) - { - // Forward iteration - use the first element. These values will get - // clamped below (Note: We could have set them to 0 and axisSize-1, but - // use lowest() and max() to maintain symmetry with StopForAxis()) - start = std::numeric_limits::lowest(); - } - else - { - // Backward iteration - use the last element. - start = std::numeric_limits::max(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (start < 0) - { - start += axisSize; - } - - // Clamping - start = arm_compute::utility::clamp(start, 0, axisSize - 1); - - return start; -} - -// Return the "real" index for the end of iteration along that axis. This is an -// "end" in the traditional C sense, in that it points to one past the last -// element. ie. So if you were iterating through all elements of a 1D array of -// size 4, this function would return 4 as the stop, because it is one past the -// "real" indices of 0, 1, 2 & 3. -inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, - const TensorShape &inputShape, int32_t axis) -{ - // Begin with the specified index - int32_t stop = end; - - // endMask override - if (endMask & (1 << axis)) - { - if (stride > 0) - { - // Forward iteration - use the last element. These values will get - // clamped below - stop = std::numeric_limits::max(); - } - else - { - // Backward iteration - use the first element. 
- stop = std::numeric_limits::lowest(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (stop < 0) - { - stop += axisSize; - } - - // Clamping - // Because the end index points one past the last element, we need slightly - // different clamping ranges depending on the direction. - if (stride > 0) - { - // Forward iteration - stop = arm_compute::utility::clamp(stop, 0, axisSize); - } - else - { - // Backward iteration - stop = arm_compute::utility::clamp(stop, -1, axisSize - 1); - } - - return stop; -} - -inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) -{ - int32_t ret = 0; - if (stride > 0) - { - ret = ((stop - start - 1) / stride) + 1; - } - else - { - ret = ((stop - start + 1) / stride) + 1; - } - ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The dimension must be the natural number"); - return ret; -} - -void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output, - ICLTensor *beginData, ICLTensor *endData, - ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), - endData->info(), stridesData->info(), beginMask, endMask, - shrinkAxisMask)); - - _input = input; - _output = output; - _beginData = beginData; - _endData = endData; - _stridesData = stridesData; - _beginMask = beginMask; - _endMask = endMask; - _shrinkAxisMask = shrinkAxisMask; - - // Set kernel build options - std::set build_opts; - build_opts.emplace("-DELEMENT_DATA_TYPE=" + - get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - - // Create kernel - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - ICLKernel::configure_internal(win); -} - -void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - _beginData->map(queue); - _endData->map(queue); - _stridesData->map(queue); - - std::vector starts; - std::vector strides; - - for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n) - { - const TensorShape shape = _input->info()->tensor_shape(); - starts.emplace_back( - StartForAxis(_beginMask, reinterpret_cast(_beginData->buffer())[n], - reinterpret_cast(_stridesData->buffer())[n], shape, n)); - - strides.emplace_back(reinterpret_cast(_stridesData->buffer())[n]); - } - - for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++) - { - starts.emplace_back(0); - strides.emplace_back(1); - } - // TODO: Apply shrinkAxisMask - - _beginData->unmap(queue); - _stridesData->unmap(queue); - _endData->unmap(queue); - - unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters - const cl_int4 startsArg = {{ - static_cast(starts[0]), static_cast(starts[1]), - static_cast(starts[2]), static_cast(starts[3]), - }}; - _kernel.setArg(idx++, startsArg); - - const cl_int4 stridesArg = {{ - static_cast(strides[0]), static_cast(strides[1]), - static_cast(strides[2]), static_cast(strides[3]), - }}; - _kernel.setArg(idx++, stridesArg); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_in(slice_out); - 
slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp deleted file mode 100644 index 26e3798..0000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h" - -#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h" - -using namespace arm_compute; - -void CLBatchToSpaceND::configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_size); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLLogicalNot.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLLogicalNot.cpp deleted file mode 100644 index 96caf9e..0000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLLogicalNot.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLLogicalNot.h" -#include "arm_compute/core/CL/kernels/CLLogicalNotKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -void CLLogicalNot::configure(ICLTensor *input, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp deleted file mode 100644 index 276c455..0000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLNormalizationLayerEx::CLNormalizationLayerEx() : _norm_kernel(), _border_handler() {} - -void CLNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, - const NormalizationLayerInfo &norm_info) -{ - ARM_COMPUTE_ERROR_ON(input == nullptr); - - // Configure normalization kernel - _norm_kernel.configure(input, output, norm_info); - - // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); -} - -Status CLNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, - const NormalizationLayerInfo &norm_info) -{ - return CLNormalizationLayerExKernel::validate(input, output, norm_info); -} - -void CLNormalizationLayerEx::run() -{ - // Run border handler - CLScheduler::get().enqueue(_border_handler, false); - - // Run normalization kernel - CLScheduler::get().enqueue(_norm_kernel); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp deleted file mode 100644 index dc0baa8..0000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" - -using namespace arm_compute; - -void CLPixelWiseDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, - float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output, scale, overflow_policy, rounding_policy); - _kernel = std::move(k); - - if (output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - - if (broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } -} - -Status CLPixelWiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) -{ - return CLPixelWiseDivisionKernel::validate(input1, input2, output, scale, overflow_policy, - rounding_policy); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp deleted file mode 100644 index be73534..0000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h" - -#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" - -using namespace arm_compute; - -void CLStridedSliceEx::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask); - _kernel = std::move(k); -}